# Support Vector Machine

In [3]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score  
from sklearn import svm

## Import data

In [4]:
# Read file
from deezerData import readData
df, X, y, X_train, X_test, y_train, y_test = readData()
# Making arrays
X, y, X_train, X_test, y_train, y_test = X.values, y.values, X_train.values, X_test.values, y_train.values, y_test.values

## Way 1: Pipeline for classification, Grid Search with cross validation

In this part, we use the default kernel rbf:

In [6]:
# Setup the pipeline
steps = [('scaler', StandardScaler()), ('SVM', SVC())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'SVM__C':[1, 10, 100],
              'SVM__gamma':[0.1, 0.01]}

# Instantiate the GridSearchCV object: cv
cv = GridSearchCV(pipeline, param_grid=parameters)

# Fit to the training set
cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Tuned Model Parameters: {}".format(cv.best_params_))
print("Accuracy for grid search cross validation: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("AUC: {}".format(roc_auc_score(y_test, y_pred)))

Tuned Model Parameters: {'SVM__C': 10, 'SVM__gamma': 0.1}
Accuracy for grid search cross validation: 0.7678916827852998
              precision    recall  f1-score   support

           0       0.65      0.62      0.64       847
           1       0.82      0.84      0.83      1738

    accuracy                           0.77      2585
   macro avg       0.74      0.73      0.73      2585
weighted avg       0.77      0.77      0.77      2585

AUC: 0.7302430021072139


In [7]:
# Fit model with tuned parameters
svm_final = SVC(C=10, gamma=0.1)
svm_final.fit(X_train, y_train)
y_pred_final = svm_final.predict(X_test)
print("Accuracy for fitting model with chosen parameters: {}".format(svm_final.score(X_test, y_test)))
print("AUC for the model with chosen parameters: {}".format(roc_auc_score(y_test, y_pred_final)))

Accuracy for fitting model with chosen parameters: 0.6731141199226306
AUC for the model with chosen parameters: 0.5011806375442739


## Way 2: Kernel sigmoid 

In [9]:
steps2 = [('scaler', StandardScaler()), ('svc_sigmoid', SVC(kernel='sigmoid'))]

pipeline2 = Pipeline(steps2)

# svc_sigmoid = SVC(kernel='sigmoid')

# Fit to the training set
pipeline2.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred_2 = pipeline2.predict(X_test)

# Compute and print metrics
print(classification_report(y_test, y_pred_2))
print("Accuracy of SVM with sigmoid kernel: {}".format(pipeline2.score(X_test, y_test)))
print("AUC sigmoid kernel: {}".format(roc_auc_score(y_test, y_pred_2)))

              precision    recall  f1-score   support

           0       0.47      0.48      0.47       847
           1       0.74      0.74      0.74      1738

    accuracy                           0.65      2585
   macro avg       0.61      0.61      0.61      2585
weighted avg       0.65      0.65      0.65      2585

Accuracy of SVM with sigmoid kernel: 0.6533849129593811
AUC sigmoid kernel: 0.6084692062827852


## Way 3: With several kernels

In [None]:
# Specify the hyperparameter space
tuned_parameters = [
    {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100]},
    {"kernel": ["linear"], "C": [1, 10, 100]},
    # {"kernel": ["sigmoid"], "C": [1, 10, 100, 1000]}
]

# scores = ["accuracy", "roc_auc"]
scores = ["precision", "recall"]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, scoring="%s_macro" % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_["mean_test_score"]
    stds = clf.cv_results_["std_test_score"]
    for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred_3 = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred_3))
    print("AUC: {}".format(roc_auc_score(y_true, y_pred_3)))
    print()