In [1]:
import pandas as pd
import os
import sys
# import the pipeline module using whatever path you need
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))
import models.modeling_pipeline as mp

from math import sqrt
from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from statistics import mean

###### read in dataframe

In [15]:
df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()),'data','interim', 'full_feature_data.csv'))

In [16]:
df.head()

Unnamed: 0,Tube_Alias,Flaw_ID,Angle,Amp_1,Amp_2,Amp_3,Amp_4,Amp_5,Amp_6,Amp_7,...,AB_Ratio_15,AB_Ratio_16,AB_Ratio_17,AB_Ratio_18,AB_Ratio_19,AB_Ratio_20,Flaw_Depth,Pct_Depth,Flaw_Volume,Flaw_Area
0,AP01,A,0,12.15863,17.616182,19.507299,12.688417,28.611131,23.38027,44.989959,...,62.92537,73.024831,61.180318,37.86208,40.901472,60.019741,0.076,10.3,0.864,11.3288
1,AP01,A,10,12.66239,13.679878,19.669931,14.483477,28.351937,18.583178,42.755353,...,63.271739,75.092716,60.878691,39.949001,41.864482,56.33593,0.076,10.3,0.864,11.3288
2,AP01,A,20,10.256701,13.40431,15.955676,13.250621,25.169557,18.774754,38.196417,...,62.714336,72.062884,61.695195,40.583161,40.490845,59.340622,0.076,10.3,0.864,11.3288
3,AP01,A,30,9.885306,8.746499,15.840469,16.280198,20.803812,19.185067,31.785403,...,62.643229,72.825289,61.982432,41.701572,41.437257,60.460928,0.076,10.3,0.864,11.3288
4,AP01,A,40,10.595372,13.1484,14.767161,13.241353,22.418596,17.189801,36.666878,...,62.189261,70.672087,61.44704,36.785382,40.190941,62.090639,0.076,10.3,0.864,11.3288


## Need to define some useful functions...

###### gets a list of features. Here, it returns a list of all AB features.

In [17]:
def get_feature_list(feats, df):
    feats = feats
    cols = list(df.columns)
    feats_list = [col for col in cols if any(substring in col for substring in feats)]
    return feats_list

##### Gets the scaler based on the training data set

In [18]:
def get_scaler(df, feats_list):
    x = df[feats_list]
    sc = StandardScaler().fit(x)
    return sc

##### Scales the X data and gets the y column

In [19]:
def scale_the_data(sc, df, feats_list):
    x_scaled = sc.transform(df[feats_list])
    y = df['Flaw_Depth']
    return x_scaled, y

##### Gets group labels. GroupKFold requires a list of labels matching the length of your datasets. So if there are 5883 rows in a dataset, we need a group label for each row. This function adds a group number column to the dataframe based on which tube/flaw group the row is in. We can then use this group label column as our input to the "groups" part of the model fitting.

In [20]:
def get_group_labels(df):
    df_list = []
    gp = 0
    for k, g in df.groupby(['Tube_Alias', 'Flaw_ID']):
        gp += 1
        sub_frame = g.copy()
        sub_frame['group'] = gp
        df_list.append(sub_frame)
        
    test = pd.concat(df_list)
    
    group_labels = list(test['group'])
    return group_labels

##### SVM model that puts all the above together. See annotations in the function.

In [21]:
def svm_model(train, test):
    # get a list of features
    feats_list = get_feature_list(['AB'], train)
    
    # get the scaler
    sc = get_scaler(train, feats_list)
    
    # get the scaled X_train data and the y values
    X_train, y_train = scale_the_data(sc, train, feats_list)
    
    # get the scaled X_test data and the y values
    X_test, y_test = scale_the_data(sc, test, feats_list)
    
    # get group labels for the training data
    group_labels = get_group_labels(train)
    
    # specify our CV type; Group K fold in this case, with 5 folds.
    gkf = GroupKFold(n_splits=5)
    
    # model parameters to optimize
    parameters = {
        'C': (0.1, 1),
        'kernel': ('rbf', 'linear'),
        'gamma':(0.001, 0.01, 0.1)
                  }
    # the model function
    lin_svm = svm.SVR()
  
    # used Randomized Search to optimize parameters
    # we set 'cv' to 'gfk' from above.
    # model scoring is based on RMSE
    lin = RandomizedSearchCV(lin_svm, 
                             parameters, 
                             scoring='neg_root_mean_squared_error', 
                             cv=gkf, 
                             verbose=1)
    
    # fit the model, using our group labels from before.
    print('Fitting models. Might take a minutes....')
    lin.fit(X_train, y_train.values.ravel(),groups=group_labels)
    
    # make predictions
    y_pred = lin.predict(X_test)
    
    # calculate scores
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    return rmse, r2, lin

##### We can do a train, test split first using our custom grouped split function

In [22]:
train, test = mp.split_tube_flaw_between_train_test(df, train_pct=0.8, seed=42)

In [23]:
train.head()

Unnamed: 0,Tube_Alias,Flaw_ID,Angle,Amp_1,Amp_2,Amp_3,Amp_4,Amp_5,Amp_6,Amp_7,...,AB_Ratio_15,AB_Ratio_16,AB_Ratio_17,AB_Ratio_18,AB_Ratio_19,AB_Ratio_20,Flaw_Depth,Pct_Depth,Flaw_Volume,Flaw_Area
0,RP05,I,140,683.474818,471.013237,1057.933303,713.792277,1654.561228,1115.852326,2686.98886,...,65.681196,76.637551,63.111251,45.464892,37.241947,54.475204,0.737,100.1,13.192,17.935994
1,RP04,A,150,7.844994,7.958591,13.655842,11.026713,19.089036,8.399855,30.340439,...,47.747858,57.104616,80.343343,55.975698,23.065157,41.609557,0.074,10.0,0.583,7.938234
2,AP04,F,230,49.972476,40.969084,73.897339,54.321825,112.549273,83.809127,184.165243,...,78.205936,73.393702,50.292614,30.892478,47.301355,66.037486,0.445,60.4,1.68,3.7604
3,RP03,E,170,66.425438,49.14227,100.654379,63.912495,153.885837,101.320932,244.642444,...,64.647576,72.849811,61.525053,43.453735,38.844898,57.796706,0.368,50.0,1.649,4.483999
4,CP05,D,220,69.152178,52.404764,103.278496,67.616069,155.805326,100.496541,247.94858,...,62.02639,72.444847,63.511032,48.842234,37.69753,52.082467,0.292,39.6,1.472,5.0165


##### Then we can pass the train/test data and perform Grouped K folds CV to obtain the optimal hyperparameters for our model

In [26]:
rmse, r2, lin = svm_model(train, test)

Fitting models. Might take a minutes....
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   14.4s finished


In [27]:
print("RMSE: {} \n r2: {} \n best hyperparameters: {}".format(rmse, r2, lin.best_params_))

RMSE: 0.05038886909729177 
 r2: 0.9359726568658557 
 best hyperparameters: {'kernel': 'rbf', 'gamma': 0.01, 'C': 1}
