In [1]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade
# install joblib to save model. 
# !pip install joblib

In [2]:
import pandas as pd
import tensorflow
import numpy as np

In [3]:
# load the training/test data
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test
%store -r X_train_scaled
%store -r X_test_scaled 
%store -r X_train_stand
%store -r X_test_stand

# Train the Model

In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
model1 = SVC(kernel='linear')

In [5]:
# model1.fit(X_train, y_train)
# model1.score(X_train, y_train)

In [6]:
# print(f"The cross-validated accuracy for unprocessed training set:\n{round(cross_val_score(model1,X_train, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")
# # failed to converge without data's being scaled or standardized

In [7]:
print(f"The cross-validated accuracy for normalized training set: \n{round(cross_val_score(model1,X_train_scaled, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")
print(f"The cross-validated accuracy for standardized training set:\n{round(cross_val_score(model1,X_train_stand, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")

The cross-validated accuracy for normalized training set: 
84.28%
The cross-validated accuracy for standardized training set:
89.05%


Standardization produces higher training accuracy than normalization

In [8]:
model2 = SVC(kernel='rbf')
print(f"The cross-validated accuracy for normalized training set: \n{round(cross_val_score(model2,X_train_scaled, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")
print(f"The cross-validated accuracy for standardized training set:\n{round(cross_val_score(model2,X_train_stand, y_train,scoring='accuracy', cv=5).mean()*100,2)}%")

The cross-validated accuracy for normalized training set: 
82.65%
The cross-validated accuracy for standardized training set:
87.63%


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [17]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.00001, 0.0001,0.001,0.01,0.1, 1, 10, 100, 1000],
              'kernel':['linear', 'rbf'],
             'gamma':[0.0001, 0.0005, 0.001, 0.005, 0.01]}
param_grid2 = {'C':[0.00001, 0.0001,0.001,0.01,0.1, 1, 10, 100, 1000],
             'gamma':[0.0001, 0.0005, 0.001, 0.005, 0.01]}
grid1 = GridSearchCV(model1, param_grid, verbose=3, n_jobs = -1)

Normalized data

In [13]:
# Train the model with GridSearch
grid1.fit(X_train_scaled,y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  4.7min finished


GridSearchCV(estimator=SVC(kernel='linear'), n_jobs=-1,
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                               1000],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01],
                         'kernel': ['linear', 'rbf']},
             verbose=3)

In [14]:
print(f"The best paramaters for the tuned model:\n {grid1.best_params_}\n") 
print(f"The cross-validated accuracy for the tuned model with normalized training set:\n{round(grid1.best_score_*100,2)}%")

The best paramaters for the tuned model:
 {'C': 1000, 'gamma': 0.0001, 'kernel': 'linear'}

The cross-validated accuracy for the tuned model with normalized training set:
88.96%


In [15]:
from sklearn.metrics import classification_report
predictions1 = grid1.predict(X_train_scaled)
print(classification_report(y_train, predictions1))
print(f"{round(grid1.score(X_train_scaled, y_train)*100,2)}%")

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.72      0.78      1586
     CONFIRMED       0.78      0.86      0.82      1704
FALSE POSITIVE       0.98      1.00      0.99      3268

      accuracy                           0.89      6558
     macro avg       0.87      0.86      0.86      6558
  weighted avg       0.89      0.89      0.89      6558

89.42%


Standardized data

In [16]:
grid2 = GridSearchCV(model1, param_grid2, verbose=3, n_jobs = -1)
grid2.fit(X_train_stand,y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.9min


KeyboardInterrupt: 

In [None]:
print(f"The best paramaters for the tuned model:\n {grid2.best_params_}")
# {'C': 10, 'gamma': 0.0001}
#  {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
print(f"The cross-validated accuracy for the tuned model with standardized training set:\n{round(grid2.best_score_*100,2)}%")
# 89.32%
# 89.32%

Compared with the first optimized model based on normalized training data (89.09%), the second optimized model based on standardized data has slightly higher cross-validated accuracy (89.32%)

In [None]:
predictions2 = grid2.predict(X_train_stand)
print(classification_report(y_train, predictions2))
print(f"{round(grid2.score(X_train_stand, y_train)*100,2)}%") #89.74% 90.64%

# Save the Model

In [None]:
import joblib
filename = 'SVM2.sav'
joblib.dump(grid2, filename)