In [1]:
import os
import pandas as pd
import numpy as np                     #visualisation
import matplotlib.pyplot as plt             #visualisation
import sklearn

from sklearn.utils import shuffle
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
import random
import sys
path = os.path.dirname(os.getcwd())
new_path = os.path.join(path, 'src')
sys.path.insert(0,new_path)
# You may have to add this module to the sys file path
import models.modeling_pipeline as mp
from models.modeling_pipeline import get_scaled_training_test_data
from models.modeling_pipeline import pick_random_angle_rows
from models.modeling_pipeline import get_training_data
from models.train_model import perform_lassoridge_cv
from models.train_model import split_dataset
random.seed(10)
%matplotlib inline

# Generate appropriate filepaths
path = os.path.dirname(os.getcwd())
filename =os.path.join(path ,'data', 'interim', 'full_feature_data.csv')
df = pd.read_csv(filename)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

In [3]:
df.head()

Unnamed: 0,Tube_Alias,Flaw_ID,Angle,Amp_1,Amp_2,Amp_3,Amp_4,Amp_5,Amp_6,Amp_7,...,AB_Ratio_15,AB_Ratio_16,AB_Ratio_17,AB_Ratio_18,AB_Ratio_19,AB_Ratio_20,Flaw_Depth,Pct_Depth,Flaw_Volume,Flaw_Area
0,AP01,A,0,12.15863,17.616182,19.507299,12.688417,28.611131,23.38027,44.989959,...,1.956307,3.27593,1.817514,0.777416,0.866272,1.73343,0.076,10.3,0.864,11.3288
1,AP01,A,10,12.66239,13.679878,19.669931,14.483477,28.351937,18.583178,42.755353,...,1.985838,3.756354,1.795074,0.837584,0.89613,1.501476,0.076,10.3,0.864,11.3288
2,AP01,A,20,10.256701,13.40431,15.955676,13.250621,25.169557,18.774754,38.196417,...,1.938655,3.089216,1.856828,0.856594,0.853804,1.686915,0.076,10.3,0.864,11.3288
3,AP01,A,30,9.885306,8.746499,15.840469,16.280198,20.803812,19.185067,31.785403,...,1.932763,3.235533,1.879336,0.891017,0.882775,1.764685,0.076,10.3,0.864,11.3288
4,AP01,A,40,10.595372,13.1484,14.767161,13.241353,22.418596,17.189801,36.666878,...,1.895807,2.851098,1.837718,0.747698,0.844795,1.887925,0.076,10.3,0.864,11.3288


In [4]:
X_train, X_test, y_train, y_test, train_df, test_df = mp.get_scaled_training_test_data(df, method='five', num=5)
train_df = shuffle(train_df)
test_df = shuffle(test_df)

In [5]:
# Creating dummy variables for different pit shapes
train_df["Pit_shape"] = train_df["Tube_Alias"]

# very inefficient solution - to re-write cleaner version later    
train_df["Pit_shape"] = train_df["Pit_shape"].map({'AP01':'AP','AP02':'AP','AP03':'AP','AP04':'AP','AP05':'AP',
                                       'CP01':'CP','CP02':'CP','CP03':'CP','CP04':'CP','CP05':'CP',
                                       'RP02':'RP','RP03':'RP','RP04':'RP','AP05':'RP','AP06':'RP',
                                       'WT02':'WT','WT03':'WT','WT04':'WT','WT05':'WT'})

train_df = pd.get_dummies(train_df, columns = ["Pit_shape"])

In [6]:
# Creating dummy variables for different pit shapes
test_df["Pit_shape"] = test_df["Tube_Alias"]

# very inefficient solution - to re-write cleaner version later    
test_df["Pit_shape"] = test_df["Pit_shape"].map({'AP01':'AP','AP02':'AP','AP03':'AP','AP04':'AP','AP05':'AP',
                                       'CP01':'CP','CP02':'CP','CP03':'CP','CP04':'CP','CP05':'CP',
                                       'RP02':'RP','RP03':'RP','RP04':'RP','AP05':'RP','AP06':'RP',
                                       'WT02':'WT','WT03':'WT','WT04':'WT','WT05':'WT'})

test_df = pd.get_dummies(test_df, columns = ["Pit_shape"])

In [7]:
num_of_each_pit_train = {'AP': len(train_df[train_df['Pit_shape_AP']==1]),
               'CP': len(train_df[train_df['Pit_shape_CP']==1]),
               'RP': len(train_df[train_df['Pit_shape_RP']==1]),
               'WT': len(train_df[train_df['Pit_shape_WT']==1])
              }

num_of_each_pit_test = {'AP': len(test_df[test_df['Pit_shape_AP']==1]),
               'CP': len(test_df[test_df['Pit_shape_CP']==1]),
               'RP': len(test_df[test_df['Pit_shape_RP']==1]),
               'WT': len(test_df[test_df['Pit_shape_WT']==1])
              }

print(f'''
Number of samples in training set:\n
    Axial pits: {num_of_each_pit_train['AP']}
    Circular pits: {num_of_each_pit_train['CP']}
    Round pits: {num_of_each_pit_train['RP']}
    Welded tube pits: {num_of_each_pit_train['RP']}
''')

print(f'''
Number of samples in testing set:\n
    Axial pits: {num_of_each_pit_test['AP']}
    Circular pits: {num_of_each_pit_test['CP']}
    Round pits: {num_of_each_pit_test['RP']}
    Welded tube pits: {num_of_each_pit_test['RP']}
''')


Number of samples in training set:

    Axial pits: 95
    Circular pits: 134
    Round pits: 105
    Welded tube pits: 105


Number of samples in testing set:

    Axial pits: 36
    Circular pits: 45
    Round pits: 30
    Welded tube pits: 30



In [8]:
training_features = []
for i in range(1,21):
    training_features.append(f'Amp_{i}')
    training_features.append(f'Phase_{i}')

target_columns = ['Pit_shape_AP','Pit_shape_CP','Pit_shape_RP','Pit_shape_WT']

X_train = train_df[training_features]
y_train = train_df[target_columns]

X_test = test_df[training_features]
y_test = test_df[target_columns]



In [9]:
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

In [10]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
prediction = model.predict(X_test)

accuracy = accuracy_score(y_test, prediction)
AUC = roc_auc_score(y_test, prediction)

print("The accuracy is ", "%.2f" %accuracy)
print("The AUC is ", "%.2f" %AUC)

The accuracy is  0.16
The AUC is  0.54


## Cross-validation scoring of different models
Performs K-fold cross validation rather than train-test split.
Takes in "input_features" and "output_features", before they have been split into X_train, y_train, etc

In [12]:
models = []

# models.append(("LogisticRegression",LogisticRegression()))
# models.append(("SVC",SVC()))
# models.append(("LinearSVC",LinearSVC()))
models.append(("KNeighbors",KNeighborsClassifier()))
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
rf2 = RandomForestClassifier(n_estimators=100, criterion='gini',
                                max_depth=10, random_state=0, max_features=None)
models.append(("RandomForest2",rf2))
# models.append(("MLPClassifier",MLPClassifier(solver='lbfgs', random_state=0)))

In [13]:
results = []
names = []
for name,model in models:
    result = cross_val_score(model, X_train, y_train, cv=5)
#     result = cross_val_score(model, X_train, y_train.to_numpy()[:,0],  cv=3)
    names.append(name)
    results.append(result)

print("Cross validation performance on training set:")
for i in range(len(names)):
    print(names[i],results[i].mean())

Cross validation performance on training set:
KNeighbors 0.43838383838383843
DecisionTree 0.5131313131313131
RandomForest 0.43838383838383843
RandomForest2 0.4525252525252525


### Conclusion: performance at discriminating between different pits is poor