In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, plot_roc_curve

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score
from pprint import pprint
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.utils import shuffle

# Load the dataset (semicolon-separated) and separate the label column from the features.
df = pd.read_csv("Data.csv", sep=";")
y = df["Target"]
# Features are every column except the first and the last one.
# NOTE(review): presumably column 0 is an identifier and the last column is
# "Target" -- confirm against Data.csv.
X_raw = df.iloc[:, 1:-1]

# Hold out 30 % of the rows as the test set BEFORE any preprocessing, so that
# no statistic of the test rows can influence the fitted scaler.
# The fixed random_state makes the row assignment deterministic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.30, random_state=1, shuffle=True
)

# Standardize data: learn mean/std on the training rows only, then apply the
# same transformation to every subset. `transform` returns NumPy arrays.
scaler.fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full feature matrix scaled with the training statistics; the Y-randomization
# cell re-splits `X`, so it has to stay available as a scaled array.
X = scaler.transform(X_raw)

# Second split of the training portion: 80 % train / 20 % validation.
# Note that `X_test_val` / `y_test_val` hold the validation subset.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_train, y_train, test_size=0.20, random_state=1, shuffle=True
)

## Grid search

In [None]:
# Baseline RBF-kernel SVM with balanced class weights. The grid search below
# overrides C, gamma and kernel for every candidate it evaluates.
svc = SVC(
    kernel="rbf",
    C=10,
    gamma="scale",
    class_weight="balanced",
    random_state=1,
)
print('Parameters currently in use:\n')
pprint(svc.get_params())

# Hyperparameter candidates: a single kernel, seven values of the
# regularization strength C and both built-in gamma heuristics.
param_grid = {
    "kernel": ["rbf"],
    "C": [2000, 1000, 500, 250, 100, 10, 1],
    "gamma": ["scale", "auto"],
}

pprint(param_grid)

scores = ["accuracy"]  # NOTE(review): not referenced anywhere in this cell
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,       # use all available cores
    error_score=0,   # a candidate whose fit raises is scored 0 instead of aborting
)

# Exhaustive search on the training split; `fit` returns the fitted search object.
grid_result = grid_search.fit(X_train, y_train)

# Summary: best candidate first, then mean (std) CV accuracy of every candidate.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

## Test data 

In [None]:
# Grid-search result that motivates the hyperparameters used below:
# 0.674512 (0.035359) with: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

svc = SVC(C=10, gamma="auto", kernel="rbf", class_weight="balanced", random_state=1)

# Fit on the training split and evaluate once on the held-out test split.
svc.fit(X_train, y_train)
pred = svc.predict(X_test)
print(classification_report(y_test, pred, digits=3))
print(confusion_matrix(y_test, pred))

# Average precision summarizes the whole precision-recall curve, so it needs a
# continuous ranking score per sample. Feeding it hard class labels would
# reduce the curve to a single operating point.
# NOTE(review): assumes a binary target, for which decision_function returns
# one score per sample -- confirm.
test_scores = svc.decision_function(X_test)
print(f"Average precision score {average_precision_score(y_test, test_scores):.3f} ")

## Cross validation

In [None]:
# Per-fold ROC curves from 5-fold stratified cross-validation on the training
# split, plus the mean curve with a +/- 1 standard deviation band.
cv = StratifiedKFold(n_splits=5)
# Same hyperparameters as the model evaluated on the test set
# (C=10, gamma="auto"), so the ROC curves describe that model.
svc = SVC(C=10, gamma="auto", kernel="rbf", class_weight="balanced", random_state=1)

tprs = []  # per-fold TPR, interpolated onto the common FPR grid
aucs = []  # per-fold AUC
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid shared by all folds

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X_train, y_train)):
    # X_train is a NumPy array (positional indexing); y_train is a pandas
    # Series, hence .iloc for positional selection.
    svc.fit(X_train[train], y_train.iloc[train])
    # NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and
    # removed in 1.2; RocCurveDisplay.from_estimator is the replacement.
    viz = plot_roc_curve(svc, X_train[test], y_train.iloc[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    # Resample this fold's curve onto the common grid so folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # force the curve to start at the origin
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

# Mean ROC curve over the folds. The AUC in the label is that of the mean
# curve, while the +/- value is the standard deviation of the per-fold AUCs.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # force the curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.3f $\pm$ %0.3f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="b) Receiver operating characteristics: SVM-C")
ax.legend(loc="lower right")
# Save before plt.show(), which would otherwise leave an empty figure to save.
plt.savefig("SVC.pdf", dpi=600, transparent=True)
plt.savefig("SVC.jpg", dpi=600, transparent=True)
plt.show()

## Y-randomization

In [None]:
# Y-randomization (label permutation) check: refit the model on randomly
# permuted labels and score it on a held-out part of the permuted data.
# Scores close to chance level indicate that the performance obtained with the
# real labels is not an artifact of the procedure.
# Uses the `svc` estimator configured in an earlier cell; it is refitted here,
# so after this cell `svc` is fitted on permuted labels.
N_PERMUTATIONS = 100

Accur = []  # average-precision score of each permutation run
for seed in range(N_PERMUTATIONS):
    # Seeded per iteration: each run gets a different but reproducible permutation.
    y_rand = shuffle(y, random_state=seed)
    # Dedicated *_rand names keep the real X_train / y_train / X_test / y_test
    # intact, so earlier cells can be re-run after this one.
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(
        X, y_rand, test_size=0.30, random_state=1
    )
    svc.fit(X_train_rand, y_train_rand)
    pred_rand = svc.predict(X_test_rand)
    # Average precision needs continuous ranking scores, not hard class labels.
    # NOTE(review): assumes a binary target -- confirm.
    scores_rand = svc.decision_function(X_test_rand)
    Accur.append(average_precision_score(y_test_rand, scores_rand))
    print(classification_report(y_test_rand, pred_rand, digits=3))
    print(confusion_matrix(y_test_rand, pred_rand))
print(f"Mean average precision score = {np.mean(Accur):.3f}, std {np.std(Accur):.3f}")