In [17]:
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm

from scipy.special import softmax
from scipy.spatial.distance import cdist
import numpy as np
import torch

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split

from mlxtend.plotting import plot_decision_regions

In [6]:
# Load the dataset and separate the feature columns from the 'Outcome' target column.
df = pd.read_csv("diabetes.csv")
data = df.drop(columns='Outcome')   # Features: every column except the target
# Select the target by name instead of dropping a hard-coded list of feature columns
# (which silently breaks if the CSV gains or renames a column). A 1-D Series is also the
# shape scikit-learn estimators expect for y, so fitting no longer warns about a column vector.
labels = df['Outcome']              # Targets

In [7]:
# Display the dimensions of the feature table as (rows, columns).
data.shape

(768, 8)

In [24]:
# Hold out 168 rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    labels,
    test_size=168,
    random_state=0,
)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

<h4>SVM with RBF kernel</h4>

In [27]:
# SVM classifier with an RBF kernel, fitted on the scaled training split.
model = SVC(kernel='rbf', probability=True, random_state=0).fit(X_train, Y_train)

# ROC AUC on the test split, scored with the second column of predict_proba
# (the class listed second in model.classes_).
pred_proba = model.predict_proba(X_test)
rbf_auc = roc_auc_score(Y_test, pred_proba[:, 1])
print(f'roc_auc with rbf-kernel = {rbf_auc}')


roc_auc with rbf-kernel = 0.8556193601312552


  return f(**kwargs)


<h4>SVM with sigmoid kernel</h4>


In [28]:
# SVM classifier with a sigmoid kernel, fitted on the scaled training split.
model = SVC(kernel='sigmoid', probability=True, random_state=0).fit(X_train, Y_train)

# ROC AUC on the test split, scored with the second column of predict_proba
# (the class listed second in model.classes_).
pred_proba = model.predict_proba(X_test)
sigmoid_auc = roc_auc_score(Y_test, pred_proba[:, 1])
print(f'roc_auc with sigmoid-kernel = {sigmoid_auc}')


roc_auc with sigmoid-kernel = 0.8070549630844955


  return f(**kwargs)


<h4>SVM with linear kernel</h4>

In [40]:
# SVM classifier with a linear kernel, fitted on the scaled training split.
model = SVC(kernel='linear', probability=True, random_state=0).fit(X_train, Y_train)

# ROC AUC on the test split, scored with the second column of predict_proba
# (the class listed second in model.classes_).
pred_proba = model.predict_proba(X_test)
linear_auc = roc_auc_score(Y_test, pred_proba[:, 1])
print(f'roc_auc with linear-kernel = {linear_auc}')

# model.support_ holds one training-row index per support vector, so its length
# is the number of support vectors.
n_support = len(model.support_)
print('Count of support objects: ', n_support)

roc_auc with linear-kernel = 0.8674323215750616
Count of support objects:  325


  return f(**kwargs)


<h4>SVM with polynomial kernel</h4>

In [32]:
# SVM classifier with a polynomial kernel, fitted on the scaled training split.
model = SVC(kernel='poly', probability=True, random_state=0).fit(X_train, Y_train)

# ROC AUC on the test split, scored with the second column of predict_proba
# (the class listed second in model.classes_).
pred_proba = model.predict_proba(X_test)
poly_auc = roc_auc_score(Y_test, pred_proba[:, 1])
print(f'roc_auc with poly-kernel = {poly_auc}')



roc_auc with poly-kernel = 0.8333059885151763


  return f(**kwargs)


<h4>The SVM with a linear kernel performed best (ROC AUC ≈ 0.867)</h4>

<h4>Добавим в выборку новые признаки, равные значению rbf-ядра между объектом и опорными объектами</h4>

Обучим SVM с ядром rbf, найдём опорные объекты и добавим в обучающую и тестовую выборки новые признаки. К вектору признаков каждого объекта добавятся значения rbf-ядра (убывающей функции от расстояния) между этим объектом и каждым опорным вектором. Соответственно, количество признаков увеличится на количество опорных векторов.

In [60]:
# Fit the RBF-kernel SVM again; the following cell reads this model's support
# vector indices (model.support_), so the fitted estimator must stay bound to `model`.
model = SVC(kernel='rbf', probability=True, random_state=0).fit(X_train, Y_train)

# ROC AUC on the test split, scored with the second column of predict_proba
# (the class listed second in model.classes_).
pred_proba = model.predict_proba(X_test)
rbf_auc = roc_auc_score(Y_test, pred_proba[:, 1])
print(f'roc_auc with rbf-kernel = {rbf_auc}')


roc_auc with rbf-kernel = 0.8556193601312552


  return f(**kwargs)


In [89]:
def kernel_rbf(X1, X2, gamma=0.04):
    """Return the matrix of RBF (Gaussian) kernel values between two sets of row vectors.

    Entry ``[i, j]`` of the result is ``exp(-gamma * ||X1[i] - X2[j]||**2)``, where the
    norm is the Euclidean distance between row ``i`` of ``X1`` and row ``j`` of ``X2``.

    Parameters
    ----------
    X1 : array-like of shape (n1, d)
        First set of points, one per row.
    X2 : array-like of shape (n2, d)
        Second set of points, one per row.
    gamma : float, default 0.04
        Kernel coefficient. The default is the value that used to be hard-coded,
        so existing two-argument calls behave as before.

    Returns
    -------
    numpy.ndarray of shape (n1, n2)
        Kernel values; 1.0 for identical points, decreasing as the distance grows
        (for positive gamma).
    """
    # 'sqeuclidean' returns squared distances directly, avoiding a square root
    # that would immediately be squared again.
    return np.exp(-gamma * cdist(X1, X2, metric='sqeuclidean'))

In [90]:
# Fit the baseline RBF SVM inside this cell and take the support vectors from it.
# Previously the cell read `model.support_` and then rebound `model` to the classifier
# trained on the augmented features, so running the cell a second time silently took
# the support vectors from the wrong model. With a separately named baseline the cell
# gives the same result on every run.
base_model = SVC(kernel='rbf', probability=True, random_state=0)
base_model.fit(X_train, Y_train)
support_vectors = X_train[base_model.support_]

# Prepend one new column per support vector: the RBF-kernel value between the sample
# and that support vector. The original features are kept after the new columns.
X_train_new = np.hstack([kernel_rbf(X_train, support_vectors), X_train])
X_test_new = np.hstack([kernel_rbf(X_test, support_vectors), X_test])

# Train and score an RBF SVM on the augmented feature matrix.
model = SVC(kernel='rbf', probability=True, random_state=0)
model.fit(X_train_new, Y_train)
pred_proba = model.predict_proba(X_test_new)
print(f'roc_auc with rbf-kernel and new features = {roc_auc_score(Y_test, pred_proba[:, 1])}')

  return f(**kwargs)


roc_auc with rbf-kernel and new features = 0.8739950779327318


Благодаря добавлению новых признаков качество классификации SVM с ядром rbf увеличилось: ROC AUC вырос с 0.856 до 0.874.