In [1]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install sklearn
# %pip install imblearn

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.multiclass import OneVsRestClassifier
from collections import Counter

from sklearn.svm import SVC
from sklearn import metrics

import time

In [2]:
X_train = pd.read_csv('../data/X_train_final.csv')
y_train = pd.read_csv('../data/y_train_final.csv')
X_test = pd.read_csv('../data/X_test_final.csv')
y_test = pd.read_csv('../data/y_test_final.csv')

# Train Model

In [3]:
# one hot encode the categories
features_nominal = ['order_1', 'order_2', 'order_3', 'order_6', 'order_7']
X_train = pd.get_dummies(X_train, columns=features_nominal)
X_test = pd.get_dummies(X_test, columns=features_nominal)

In [4]:
start_time = time.time()
clf = OneVsRestClassifier(SVC(probability=True, cache_size=1000), n_jobs=-1).fit(X_train, y_train)
print("--- %s mins ---" % int((time.time() - start_time)/60))

--- 79 mins ---


In [5]:
def print_score(model):
    y_predict = model.predict(X_test)
    y_predict_prob = model.predict_proba(X_test)[:, 1]
    print(metrics.confusion_matrix(y_test, y_predict))
    # TN FP
    # FN TP

    print(f'accuracy:  {metrics.accuracy_score(y_test, y_predict)}')
    print(f'precision: {metrics.precision_score(y_test, y_predict)}')
    print(f'recall:    {metrics.recall_score(y_test, y_predict)}')
    print(f'roc auc:   {metrics.roc_auc_score(y_test, y_predict_prob)}')
    print(f'pr auc:    {metrics.average_precision_score(y_test, y_predict_prob)}')

print_score(clf)

[[21648  1000]
 [  415   769]]
accuracy:  0.9406260490097348
precision: 0.4347088750706614
recall:    0.6494932432432432
roc auc:   0.9244133334367571
pr auc:    0.5017911686709257


# Hyperparameter Tuning (on small df)

In [6]:
kernel = ['linear', 'poly', 'rbf', 'sigmoid']

models = []
for name in kernel:
    start_time = time.time()
    print(f'kernel: {name}')
    model = OneVsRestClassifier(SVC(probability=True, cache_size=1000, kernel=name), n_jobs=-1)
    models.append(model)
    model.fit(X_train_std, y_train)
    print_score(model)
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

kernel: linear
[[5212  704]
 [  40  164]]
accuracy:  0.8784313725490196
precision: 0.1889400921658986
recall:    0.803921568627451
roc auc:   0.8424611223799865
pr auc:    0.15842896298304268
--- 690.8784210681915 seconds ---

kernel: poly
[[5496  420]
 [  77  127]]
accuracy:  0.9187908496732026
precision: 0.23217550274223034
recall:    0.6225490196078431
roc auc:   0.7757775524002705
pr auc:    0.15712233095553885
--- 248.67693185806274 seconds ---

kernel: rbf
[[5617  299]
 [  70  134]]
accuracy:  0.9397058823529412
precision: 0.3094688221709007
recall:    0.6568627450980392
roc auc:   0.8031609195402298
pr auc:    0.2147164485501668
--- 261.742835521698 seconds ---

kernel: sigmoid
[[4252 1664]
 [  59  145]]
accuracy:  0.7184640522875817
precision: 0.08015478164731896
recall:    0.7107843137254902
roc auc:   0.7147565922920893
pr auc:    0.06661328434082311
--- 329.61398124694824 seconds ---


# For Future Usage

In [7]:
import pickle

# save the model to disk
filename = '../data/svm_model.sav'
pickle.dump(clf, open(filename, 'wb'))
 
# load the model from disk
model = pickle.load(open(filename, 'rb'))