# This notebook demonstrates the use of the function 'run_model_store_JSON' within predict_model.py.

The purpose of this function is to take training and test sets, and to (1) report performance, (2) save the model as a .pkl file and (3) record details of the model in JSON file.

When designing this function, our intention was for (1) save_model to be toggled off as default, and added if a model is of particular interest and (2) for a single JSON file to be used, where multiple model details can be appended over time.

We intended to create an additional function, which extracts information from the JSON file and presents it in an easily intepretable way, but were not able to do so due to time restrictions.


### The function takes in the following parameters:
- fitted_model: the model that has been trained on the training set
- X_train and y_train: the training datasets
- X_test and Y-test: the test datasets
- dataset_used: the .csv filenmae of the dataset used for training and testing
- feature_columns: a list of the column names used for training
- JSON_filename: the desired JSON file to record information to
- optional_comment: takes a string adding any relevant information
    e.g. "PCA features were created using all phase recordings"
- task_type: default is regression task, can specify 'classification' and different performance metris will be returned
- save_model_as_pickle will save a .pkl file of the model in the 'models' folder if True.
- scaler: takes the scaler that was used to scale the training set and (1) save it as a pickle file and (2) records the directory for the file in the JSON

            
The function stores the above in a JSON format, as well as the model performance and the time of training.


In this notebook, this function is demonstrated first for a **regression task** and then later for a **classification task**.

In [1]:
import os
import sys

path = os.getcwd()
new_path = (os.path.join(os.path.dirname(path), 'src'))
sys.path.insert(0,new_path)

import pandas as pd
import numpy as np

import sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import sklearn.metrics as metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import data.make_dataset
import features.process_data

import models.modeling_pipeline
from models.modeling_pipeline import get_training_data
from models.modeling_pipeline import pick_random_angle_rows
from models.train_model import perform_lassoridge_cv

from data.make_dataset import CompleteData
from data.make_dataset import get_waveform
from features.build_features import DataFeatures

from features.build_features import extract_PCA_components

import matplotlib.pyplot as plt
from constants import FilePath, Channel

filename = os.path.join(os.path.dirname(os.getcwd()), 'data','interim','full_feature_cg_data.csv')
dataframe = pd.read_csv(filename)
dataframe.head(4)# to visualise full DataFrame


Unnamed: 0,Tube_Alias,Flaw_ID,Angle,Amp_1,Amp_2,Amp_3,Amp_4,Amp_5,Amp_6,Amp_7,...,Phase_15,Phase_16,Phase_17,Phase_18,Phase_19,Phase_20,Flaw_Depth,Pct_Depth,Flaw_Volume,Flaw_Area
0,AP01,A,0,10.320653,14.854606,16.017633,12.423161,23.971155,18.113805,36.668613,...,0.330427,-0.038826,-0.681307,-0.840659,0.556654,0.333518,0.076,10.3,0.864,11.3288
1,AP01,A,10,9.256762,13.566036,12.946058,12.594721,19.365281,17.401339,28.518174,...,0.290231,-0.069128,-0.695486,-0.73928,0.548302,0.294714,0.076,10.3,0.864,11.3288
2,AP01,A,20,6.375396,8.061063,9.746397,10.879743,14.309066,18.105114,22.923653,...,0.355075,-0.013229,-0.505109,-0.781582,0.626245,0.293746,0.076,10.3,0.864,11.3288
3,AP01,A,30,9.70041,11.746437,14.777542,16.300598,18.984767,21.744765,30.582848,...,0.338012,-0.003931,-0.437232,-0.721585,0.560389,0.27756,0.076,10.3,0.864,11.3288


# EXAMPLE OF USE 1: FOR LINEAR REGRESSION MODEL GENERATED WITH PCA COMPONENTS

In [2]:
# Select the columns of features that are general to all future PCA component 
common_columns = ['Tube_Alias','Flaw_ID', 'Angle','Flaw_Depth', 'Flaw_Volume']
selected_feature_columns = dataframe.loc[:,common_columns] 

## Generate PCA components

In [3]:
# FOR PHASES
differential_phases = ['Phase_1', 'Phase_3', 'Phase_5', 'Phase_7', 'Phase_9', 'Phase_11', 'Phase_13',
                                  'Phase_15', 'Phase_17', 'Phase_19']
absolute_phases = ['Phase_2', 'Phase_4', 'Phase_6', 'Phase_8', 'Phase_10', 'Phase_12', 'Phase_14',
                                  'Phase_16', 'Phase_18', 'Phase_20']

components_phases_diff, explained_variance1  = extract_PCA_components(dataframe, differential_phases,
                                                num_components=4, comp_names="PhaseDiff")
components_phases_abs, explained_variance2 = extract_PCA_components(dataframe, absolute_phases,
                                                num_components=4, comp_names="PhaseAbs")

pca_dataframe = pd.concat([selected_feature_columns,components_phases_diff,components_phases_abs],axis=1)

explained_variance = pd.concat([explained_variance1, explained_variance2],axis=1)


# FOR AMPLITUDES
differential_amplitudes = ['Amp_1', 'Amp_3', 'Amp_5', 'Amp_7', 'Amp_9', 'Amp_11', 'Amp_13',
                                  'Amp_15', 'Amp_17', 'Amp_19']

absolute_amplitudes = ['Amp_2', 'Amp_4', 'Amp_6', 'Amp_8', 'Amp_10', 'Amp_12', 'Amp_14',
                                  'Amp_16', 'Amp_18', 'Amp_20']

components_amps_diff, explained_variance3 = extract_PCA_components(dataframe, differential_amplitudes,
                                                num_components=4, comp_names="AmpDiff")

components_amps_abs, explained_variance4 = extract_PCA_components(dataframe, absolute_amplitudes,
                                                num_components=4, comp_names="AmpAbs")

pca_dataframe = pd.concat([pca_dataframe,components_amps_diff,components_amps_abs], axis=1)

explained_variance = pd.concat([explained_variance, explained_variance3, explained_variance4],axis=1)

# alternative: create separate dataframe for amplitudes
# pca_dataframe = pd.concat([selected_feature_columns,components_phases_diff,components_phases_abs],axis=1)a




The explained variance (as a ratio) for the 4 principle components (using PhaseDiff if specified) are as follows:
 [0.60239673 0.12807611 0.09336869 0.06946589]
The explained variance (as a ratio) for the 4 principle components (using PhaseAbs if specified) are as follows:
 [0.32953421 0.18224222 0.10589191 0.07824185]
The explained variance (as a ratio) for the 4 principle components (using AmpDiff if specified) are as follows:
 [9.94252321e-01 5.03896834e-03 6.27853133e-04 3.31979202e-05]
The explained variance (as a ratio) for the 4 principle components (using AmpAbs if specified) are as follows:
 [0.77755723 0.14731867 0.03628239 0.03136264]


## Train linear regression model

In [4]:
# select random angle for each pit, to be training samples
training_samples = pick_random_angle_rows(pca_dataframe, 1)


In [5]:
# SELECT INPUTS AND OUTPUTS - COMMENT IN THE APPROPRIATE ONE

inputs = training_samples.drop(common_columns, axis=1) # all of the components
# inputs = pca_dataframe[['PhaseDiffComp1','PhaseDiffComp2','PhaseDiffComp3','PhaseDiffComp4']]

target_column = 'Flaw_Depth'
outputs = training_samples[[target_column]]
# outputs = training_samples[['Flaw_Volume']]


In [6]:
# Define the percentage of training points to be used for the validation set
valid_ratio = 0.15

# Define the training and validation samples
num_samples = len(training_samples)
num_valid = int(np.ceil(num_samples * valid_ratio))
num_train = num_samples - num_valid

X_train = inputs._slice(slice(num_valid,num_samples)).reset_index().drop(['index'],axis=1)
y_train = outputs._slice(slice(num_valid,num_samples)).reset_index().drop(['index'],axis=1)

X_valid = inputs._slice(slice(0,num_valid)).reset_index().drop(['index'],axis=1)
y_valid = outputs._slice(slice(0,num_valid)).reset_index().drop(['index'],axis=1)


In [7]:
# Fit model 
model_linear = LinearRegression().fit(X_train, y_train)


# SAVE MODEL AND STORE IN JSON FUNCTION

In [8]:
feature_columns = training_samples.drop(common_columns, axis=1).columns

In [9]:
from models.predict_model import run_model_store_JSON

run_model_store_JSON(model_linear, X_train, y_train, X_valid, y_valid, filename, feature_columns,
                     target_column, JSON_filename="temp.txt", task_type ="regression",
                     save_model = True)



Model saved in models folder as LinearRegression_14-04-2020_11-38-26.

              Mean Absolute Error: 0.051093991188801986
              Mean Squared Error: 0.004348568776646117
              Root Mean Squared Error: 0.0659436788225082
              R Squared Score: 0.8902137011681799 
              

              Mean Absolute Error: 0.08370936386864836
              Mean Squared Error: 0.01915870847783486
              Root Mean Squared Error: 0.13841498646402006
              R Squared Score: 0.5295376522637629 
              
Record of model and performance stored in temp.txt within models folder.


# EXAMPLE OF USE 2: FOR CLASSIFICATION MODELS

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


In [11]:
df = pd.read_csv(filename)

# Creating column for whether flaw is through hole (full thickness)
df["Through_Hole"] = df['Flaw_ID']
df["Through_Hole"] = df["Through_Hole"].map({'A':0,'B':0,'C':0,'D':0,'E':0,'F':0,'G':0,'H':0,'I':1})


In [12]:
training_samples = pick_random_angle_rows(df, 1)
training_samples = shuffle(training_samples)

num_through_in_sample = len(training_samples[training_samples['Through_Hole']==1])
print("There are", num_through_in_sample, "examples of through hole flaws in the training sample.")

There are 18 examples of through hole flaws in the training sample.


In [13]:
feature_columns = []
for i in range(1,21):
    feature_columns.append(f'Amp_{i}')
    feature_columns.append(f'Phase_{i}')

target_column = 'Through_Hole'   
    
input_features = training_samples[feature_columns]
output_features = training_samples[target_column]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(input_features, output_features, test_size = 0.2)


In [15]:
y_train = y_train.to_numpy().reshape(-1,1)
y_test = y_test.to_numpy().reshape(-1,1)


sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)

y_train = y_train[:,0]
y_test = y_test[:,0]

# encode y values as binary, rather than continuous values
lab_enc = preprocessing.LabelEncoder()
y_train = lab_enc.fit_transform(y_train)
y_test = lab_enc.fit_transform(y_test)


In [16]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# SAVE MODEL AND STORE IN JSON FUNCTION

In [17]:
from models.predict_model import run_model_store_JSON

run_model_store_JSON(model, X_train, y_train, X_test, y_test, filename, feature_columns, 
                     target_column, JSON_filename="temp3.txt", task_type="classification",
                     save_model = True, scaler = sc_x)



Model saved in models folder as RandomForestClassifier_14-04-2020_11-38-27.
Scaler saved in models folder as RandomForestClassifier_14-04-2020_11-38-27_scaler.

MODEL PERFORMANCE ON TRAINING SET:

              Accuracy: 1.0
              AUC: 1.0
              F1 score: 1.0
              Weighted F1 score: 1.0
              Matthew's Correlation Coefficient: 1.0
              

MODEL PERFORMANCE ON TEST SET:

              Accuracy: 0.9705882352941176
              AUC: 0.9848484848484849
              F1 score: 0.6666666666666666
              Weighted F1 score: 0.9752639517345398
              Matthew's Correlation Coefficient: 0.6963106238227914
              
Record of model and performance stored in temp3.txt within models folder.
