# SVC

In the following, we implement an SVC (support vector classifier) to distinguish between real and simulated data. We then improve the performance of the model step by step, first running a grid search (GridSearchCV) and then refining the parameter grid around its results.

Loading the data and splitting it into training, validation, and test sets

In [24]:
import numpy as np
from sklearn.model_selection import train_test_split

# Load the generated (simulated) sample and append a label column of zeros.
# The label column is sized from the loaded array instead of a hard-coded row
# count, so the cell keeps working if the number of rows in the file changes.
gen = np.loadtxt("qcs_gen.txt", delimiter=",")
z = np.zeros((gen.shape[0], 1), dtype=int)
gen = np.concatenate((gen, z), axis=1)
print(gen.shape)

# Load the real sample and append a label column of ones.
real = np.loadtxt("qcs_real.txt", delimiter=",")
o = np.ones((real.shape[0], 1), dtype=int)
real = np.concatenate((real, o), axis=1)
print(real.shape)

# Stack both classes; the last column of `data` is the class label
# (0 = generated, 1 = real).
data = np.concatenate((gen, real), axis=0)
print(data.shape)

# Hold out 20 % of all rows as the test set (fixed seed for reproducibility).
train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True, stratify=None)
print(train.shape)

# Split a further 20 % of the remaining rows off as a validation set.
# NOTE(review): `val` is not used by any of the cells visible in this notebook.
train, val = train_test_split(train, test_size=0.2, random_state=42, shuffle=True, stratify=None)

#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

(1500, 1102)
(1507, 1102)
(3007, 1102)
(2405, 1102)


Now we train the SVC without optimized parameters

In [33]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Feature matrix = every column but the last; label vector = the last column.
X, y = train[:, :-1], train[:, -1]

from sklearn.svm import SVC

# Baseline model: standardise the features, then fit an SVC whose only
# non-default argument is gamma='auto'.
baseline_steps = (StandardScaler(), SVC(gamma='auto'))
clf = make_pipeline(*baseline_steps)
clf.fit(X, y)
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='auto',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

We evaluate the performance of the SVC using accuracy:

In [35]:
from sklearn.metrics import accuracy_score
# Accuracy of the fitted model on the held-out test rows
# (labels are stored in the last column of `test`).
y_true = test[:, -1]
y_pred = clf.predict(test[:, :-1])
accuracy_score(y_true=y_true, y_pred=y_pred)

0.8837209302325582

Now, we try to increase the performance of the model by optimizing the parameters with Grid Search

In [63]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# Template model whose hyperparameters are searched: scaler followed by an SVC.
pipeline = make_pipeline(StandardScaler(), SVC(gamma='auto'))

# Search grid over kernel type and regularisation strength C. The 'svc__'
# prefix routes each parameter to the pipeline step named 'svc'.
parameters = {
    'svc__kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
    "svc__C": [0.001, 0.01, 0.1, 1, 10],
}

# Exhaustive, cross-validated search over the grid, fitted on the training rows.
clf = GridSearchCV(pipeline, parameters)
clf.fit(train[:, :-1], train[:, -1])



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('svc',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
              

Let's see if the accuracy has increased

In [64]:
# Score the grid-searched model on the held-out test rows; predict() on the
# search object delegates to the estimator refitted with the best parameters.
y_true = test[:, -1]
y_pred = clf.predict(test[:, :-1])
accuracy_score(y_true=y_true, y_pred=y_pred)

0.9518272425249169

In [66]:
# get_params() only echoes the constructor arguments of the search object and
# of its un-tuned template pipeline (e.g. the SVC defaults C=1.0, kernel='rbf').
# The hyperparameters actually selected by the grid search are stored on the
# fitted object in best_params_, with their mean cross-validated score in
# best_score_, so print those before showing the configuration.
print(clf.best_params_, clf.best_score_)
clf.get_params()

{'cv': 'warn',
 'error_score': 'raise-deprecating',
 'estimator__memory': None,
 'estimator__steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
       decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
       max_iter=-1, probability=False, random_state=None, shrinking=True,
       tol=0.001, verbose=False))],
 'estimator__verbose': False,
 'estimator__standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'estimator__svc': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False),
 'estimator__standardscaler__copy': True,
 'estimator__standardscaler__with_mean': True,
 'estimator__standardscaler__with_std': True,
 'estimator__svc__C': 1.0,
 'estimator__svc__cache_size':

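Using GridSearch, we have already been able to increase the accuracy from 88 to 95 percent. Now we try to improve the performance even further. Above we can see the parameter values reported by `get_params()` (kernel "rbf", C = 1.0); note that `get_params()` lists the estimator's configured values, whereas the values actually selected by the search are stored in `clf.best_params_`. Taking "rbf" as the kernel, we run the grid search again on a finer grid around the "optimal" value for C, 1.0.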

In [73]:
# Second search: kernel fixed to 'rbf', finer grid of C values around 1.
pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma='auto'))
parameters = {
    "svc__C": [0.1, 0.5, 0.7, 1, 1.2, 1.5, 2, 2.5, 3, 3.5, 4, 5, 7],
}
clf = GridSearchCV(pipeline, parameters)
clf.fit(train[:, :-1], train[:, -1])

# Accuracy of the refitted best model on the held-out test rows.
y_true = test[:, -1]
y_pred = clf.predict(test[:, :-1])
accuracy_score(y_true=y_true, y_pred=y_pred)



0.9401993355481728

In [74]:
# The value of C chosen by the refined search is in best_params_ (its mean
# cross-validated score is in best_score_). get_params() below only lists the
# constructor arguments of the search object and its template pipeline, so it
# does not change with the outcome of the search.
print(clf.best_params_, clf.best_score_)
clf.get_params()

{'cv': 'warn',
 'error_score': 'raise-deprecating',
 'estimator__memory': None,
 'estimator__steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
       decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
       max_iter=-1, probability=False, random_state=None, shrinking=True,
       tol=0.001, verbose=False))],
 'estimator__verbose': False,
 'estimator__standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'estimator__svc': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False),
 'estimator__standardscaler__copy': True,
 'estimator__standardscaler__with_mean': True,
 'estimator__standardscaler__with_std': True,
 'estimator__svc__C': 1.0,
 'estimator__svc__cache_size':

In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import metrics

# Nested cross-validation producing out-of-fold class probabilities for three
# model variants; the plotting cell turns them into ROC curves.
# Relies on `test`, `pipeline` and `parameters` from the cells above.
# NOTE(review): the nested CV runs on the rows of `test` only - confirm that
# this (rather than the full data set) is intended.
x_data_nopreprocessing = test[:, :-1]
y_data = test[:, -1]

#standard scaling the data
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# fold statistics leak between outer folds; kept as in the original design.
standardscaler = StandardScaler()
x_data = standardscaler.fit_transform(x_data_nopreprocessing)
x_data_std = x_data

###---##---###  Nested CV & AUC
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=10, shuffle = True, random_state = 42)

# Nested CV with parameter optimization
# method='predict_proba' is only available when the SVC is built with
# probability=True, so enable it on a copy of the grid-search pipeline
# (the copy leaves the global `pipeline` untouched and the cell re-runnable).
proba_pipeline = clone(pipeline).set_params(svc__probability=True)
clf = GridSearchCV(estimator=proba_pipeline, param_grid=parameters, cv=inner_cv, scoring='roc_auc')

# Making predictions
# NOTE(review): no autoencoder features exist in this notebook; the name is
# kept because the plotting cell expects `predictions_autoencoder`.
predictions_autoencoder = cross_val_predict(clf, X = x_data, y = y_data, cv
= outer_cv, method = 'predict_proba', n_jobs = -1)

#everything for the standard method
# Named `svm_std` / `pipeline_std` so that neither the imported `sklearn.svm`
# module nor the global `pipeline` is overwritten.
svm_std = SVC(kernel="rbf", probability=True)
pipeline_std = Pipeline([('svm', svm_std)])
# Same C grid as above, re-keyed for the step name 'svm' used in this pipeline.
p_grid = {"svm__C": parameters["svc__C"]}
clf_std = GridSearchCV(estimator=pipeline_std, param_grid=p_grid, cv=inner_cv, scoring='roc_auc')
predictions_std = cross_val_predict(clf_std, X = x_data_std, y = y_data, cv
= outer_cv, method = 'predict_proba', n_jobs = -1)

#everything for the model without preprocessing (unscaled features)
# NOTE(review): assumed definition of "no preprocessing" - the plotting cell
# needs `predictions_nopreprocessing`, which was never computed.
clf_nopreprocessing = GridSearchCV(estimator=pipeline_std, param_grid=p_grid, cv=inner_cv, scoring='roc_auc')
predictions_nopreprocessing = cross_val_predict(clf_nopreprocessing, X = x_data_nopreprocessing, y = y_data, cv
= outer_cv, method = 'predict_proba', n_jobs = -1)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay


def _positive_class_scores(proba):
    """Return the class-1 column of a 2-D probability array; pass 1-D scores through."""
    return proba[:, 1] if proba.ndim == 2 else proba


#keep only positive probabilities
# roc_curve needs one score per sample, so all three prediction arrays are
# reduced to the class-1 column. The helper leaves already-reduced 1-D arrays
# alone, which keeps this cell safe to re-run.
predictions_autoencoder = _positive_class_scores(predictions_autoencoder)
predictions_nopreprocessing = _positive_class_scores(predictions_nopreprocessing)
predictions_std = _positive_class_scores(predictions_std)

#calculating the ROC curves
fpr_autoencoder, tpr_autoencoder, _ = metrics.roc_curve(y_data, predictions_autoencoder)
fpr_nopreprocessing, tpr_nopreprocessing, _ = metrics.roc_curve(y_data, predictions_nopreprocessing)
fpr_std, tpr_std, _ = metrics.roc_curve(y_data, predictions_std)

#calculating the AUC's
auc_autoencoder = metrics.auc(fpr_autoencoder, tpr_autoencoder)
auc_nopreprocessing = metrics.auc(fpr_nopreprocessing, tpr_nopreprocessing)
auc_std = metrics.auc(fpr_std, tpr_std)

# All three curves are drawn on one shared set of axes.
fig, ax = plt.subplots(figsize=(6, 6))

roc_autoencoder = RocCurveDisplay(fpr = fpr_autoencoder,
                                  tpr = tpr_autoencoder,
                                  roc_auc= auc_autoencoder,
                                  estimator_name="autoencoder")

roc_autoencoder.plot(color="red", ax=ax)

roc_std = RocCurveDisplay(fpr = fpr_std,
                          tpr = tpr_std,
                          roc_auc = auc_std,
                          estimator_name="standard preprocessing method")

roc_std.plot(color="darkorange", ax=ax)

roc_nopreprocessing = RocCurveDisplay(fpr = fpr_nopreprocessing,
                                      tpr = tpr_nopreprocessing,
                                      roc_auc= auc_nopreprocessing,
                                      estimator_name="no preprocessing")

roc_nopreprocessing.plot(color="darkblue", ax=ax)

# Diagonal reference line for a classifier that guesses at random.
plt.plot([0, 1], [0, 1], "k--", label="chance level")
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristics")
plt.legend()
plt.show()
