In [76]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade
# install joblib to save model. 
# !pip install joblib

In [1]:
import pandas as pd
import tensorflow
import numpy as np

In [2]:
# load the training/test data
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test
%store -r X_train_scaled
%store -r X_test_scaled 
%store -r X_train_stand
%store -r X_test_stand

# Train the Model

In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
model1 = SVC(kernel='linear')

In [None]:
# model1.fit(X_train, y_train)
# model1.score(X_train, y_train)

In [None]:
# print(f"The cross-validated accuracy for unprocessed training set:\n{round(cross_val_score(model1,X_train, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")
# # failed to converge without data's being scaled or standardized

In [4]:
print(f"The cross-validated accuracy for normalized training set: \n{round(cross_val_score(model1,X_train_scaled, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")
print(f"The cross-validated accuracy for standardized training set:\n{round(cross_val_score(model1,X_train_stand, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")

The cross-validated accuracy for normalized training set: 
84.13%
The cross-validated accuracy for standardized training set:
89.13%


Standardization produces higher training accuracy than normalization

In [22]:
model2 = SVC(kernel='rbf')
print(f"The cross-validated accuracy for normalized training set: \n{round(cross_val_score(model2,X_train_scaled, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")
print(f"The cross-validated accuracy for standardized training set:\n{round(cross_val_score(model2,X_train_stand, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")

The cross-validated accuracy for normalized training set: 
82.72%
The cross-validated accuracy for standardized training set:
87.18%


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [15]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.00001, 0.0001,0.001,0.01,0.1, 1, 10, 100, 1000],
              'kernel':['linear', 'rbf'],
             'gamma':[0.0001, 0.0005, 0.001, 0.005, 0.01]}
grid1 = GridSearchCV(model1, param_grid, verbose=3)

In [16]:
# Train the model with GridSearch
grid1.fit(X_train_scaled,y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   1.0s
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   0.9s
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.9s remaining:    0.0s


[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   0.9s
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................
[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   0.9s
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................
[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   0.9s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] ... C=1e-05, gamma=0.0001, kernel=rbf, score=0.501, total=   1.2s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] ... C=1e-05, gamma=0.0001, kernel=rbf, score=0.501, total=   1.2s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] ... C=1e-05, gamma=0.0001, kernel=rbf, score=0.501, total=   1.2s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] ... C=1e-05, gamma=0.0001, kernel=rbf, score=0.501, total=   1.2s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] .

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed: 14.5min finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                               1000],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01],
                         'kernel': ['linear', 'rbf']},
             verbose=3)

In [23]:
print(f"The best paramaters for the tuned model:\n {grid1.best_params_}\n") 
# {'C': 10, 'gamma': 0.0001}
#  {'C': 1000, 'gamma': 0.0001, 'kernel': 'linear'}
print(f"The cross-validated accuracy for the tuned model with normalized training set:\n{round(grid1.best_score_*100,2)}%")
# 86.8%
# 89.09%

The best paramaters for the tuned model:
 {'C': 1000, 'gamma': 0.0001, 'kernel': 'linear'}

The cross-validated accuracy for the tuned model with normalized training set:
89.09%


In [18]:
from sklearn.metrics import classification_report
predictions1 = grid1.predict(X_train_scaled)
print(classification_report(y_train, predictions1)) # 88, 90
print(f"{round(grid1.score(X_train_scaled, y_train)*100,2)}%") # 87.62, 89.57

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.72      0.78      1265
     CONFIRMED       0.78      0.86      0.81      1350
FALSE POSITIVE       0.98      1.00      0.99      2628

      accuracy                           0.90      5243
     macro avg       0.87      0.86      0.86      5243
  weighted avg       0.90      0.90      0.89      5243

89.57%


In [19]:
grid2 = GridSearchCV(model1, param_grid, verbose=3)
grid2.fit(X_train_stand,y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   1.0s
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   0.9s
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.9s remaining:    0.0s


[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   1.1s
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................
[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   1.1s
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................
[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.501, total=   0.9s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] ... C=1e-05, gamma=0.0001, kernel=rbf, score=0.501, total=   1.3s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] ... C=1e-05, gamma=0.0001, kernel=rbf, score=0.501, total=   1.2s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] ... C=1e-05, gamma=0.0001, kernel=rbf, score=0.501, total=   1.3s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] ... C=1e-05, gamma=0.0001, kernel=rbf, score=0.501, total=   1.2s
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] .

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed: 79.2min finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                               1000],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01],
                         'kernel': ['linear', 'rbf']},
             verbose=3)

In [21]:
print(f"The best paramaters for the tuned model:\n {grid2.best_params_}")
# {'C': 10, 'gamma': 0.0001}
#  {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
print(f"The cross-validated accuracy for the tuned model with standardized training set:\n{round(grid2.best_score_*100,2)}%")
# 89.32%
# 89.32%

The best paramaters for the tuned model:
 {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
The cross-validated accuracy for the tuned model with standardized training set:
89.32%


Compared with the first optimized model based on normalized training data (89.09%), the second optimized model based on standardized data has slightly higher cross-validated accuracy (89.32%)

In [24]:
predictions2 = grid2.predict(X_train_stand)
print(classification_report(y_train, predictions2))
print(f"{round(grid2.score(X_train_stand, y_train)*100,2)}%") #89.74% 90.64%

                precision    recall  f1-score   support

     CANDIDATE       0.85      0.75      0.80      1265
     CONFIRMED       0.80      0.87      0.83      1350
FALSE POSITIVE       0.99      1.00      0.99      2628

      accuracy                           0.91      5243
     macro avg       0.88      0.87      0.88      5243
  weighted avg       0.91      0.91      0.91      5243

90.64%


# Save the Model

In [25]:
import joblib
filename = 'SVM.sav'
joblib.dump(grid2, filename)

['SVM.sav']