# Benchmarking SVC and SVC2Plus on the Wisconsin Diagnostic Breast Cancer (WDBC) Dataset

In [1]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '../svm2plus/'))
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from svm2plus import SVC2Plus
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [2]:
# Load the WDBC dataset once. `data` keeps the full Bunch (feature names,
# description, ...) and the feature matrix / labels are taken from it.
data = load_breast_cancer()
# `data.data` and `data.target` are exactly what
# `load_breast_cancer(return_X_y=True)` returns, so a second load is unnecessary.
X, y = data.data, data.target

In [3]:
# Search grid for the plain SVC (per the original note: the grid proposed by
# Vapnik). Both C and gamma are swept over powers of two in steps of 0.5 in the
# exponent: C over 2^-5 .. 2^5 (21 values), gamma over 2^-6 .. 2^6 (25 values).
param_grid_svc = [dict(
    C=np.exp2(np.linspace(-5, 5, 21)),
    gamma=np.exp2(np.linspace(-6, 6, 25)),
    kernel=['rbf'],
)]

# Search grid for SVC2Plus. lmbda is a regularization parameter just like C,
# so it is cross-validated over the same range as C.
param_grid_svc2p = [dict(
    C=np.exp2(np.linspace(-5, 5, 21)),
    lmbda=np.exp2(np.linspace(-5, 5, 21)),
    gamma=np.exp2(np.linspace(-6, 6, 25)),
    decision_kernel=['rbf'],
    correcting_kernel=['rbf'],
)]

## Privileged Features = `mean`

In [4]:
# 70/30 train/test split; the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1618, test_size=0.3
)

# The first ten columns (the `mean` features) are the privileged ones: they go
# into Z_train and are dropped from both the training and the test inputs.
Z_train, X_train = X_train[:, :10], X_train[:, 10:]
X_test = X_test[:, 10:]

In [5]:
# Baseline: a plain SVC tuned by grid search over param_grid_svc, with
# accuracy as the model-selection score.
svc = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid_svc,
    scoring='accuracy',
    return_train_score=True,
)
svc.fit(X_train, y_train)

# Per-class precision, recall, F-score and support on the held-out test set.
precision_recall_fscore_support(y_true=y_test, y_pred=svc.predict(X_test))

(array([0.90625   , 0.89719626]),
 array([0.84057971, 0.94117647]),
 array([0.87218045, 0.91866029]),
 array([ 69, 102]))

In [6]:
# Test-set accuracy of the grid-searched SVC baseline.
accuracy_score(y_true=y_test, y_pred=svc.predict(X_test))

0.9005847953216374

In [7]:
# SVC2Plus tuned by grid search over param_grid_svc2p. The privileged features
# Z_train are passed to fit() as an extra keyword argument; they are used at
# training time only.
svc2p = GridSearchCV(
    estimator=SVC2Plus(),
    param_grid=param_grid_svc2p,
    scoring='accuracy',
    return_train_score=True,
)
svc2p.fit(X_train, y_train, Z=Z_train)

# Predictions need only the regular features; keep them for the accuracy cell.
preds = svc2p.predict(X_test)
precision_recall_fscore_support(y_true=y_test, y_pred=preds)

(array([0.89393939, 0.9047619 ]),
 array([0.85507246, 0.93137255]),
 array([0.87407407, 0.9178744 ]),
 array([ 69, 102]))

In [8]:
# Test-set accuracy of the grid-searched SVC2Plus model.
accuracy_score(y_true=y_test, y_pred=preds)

0.9005847953216374

## Privileged Features = `standard error`

In [11]:
# Same 70/30 split and seed as the other sections, so results are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1618, test_size=0.3
)

# Columns 10-19 (the `standard error` features) are the privileged ones here.
# Take them out as Z_train first, then remove that column range from the
# regular training and test inputs.
Z_train = X_train[:, 10:20]
X_train = np.delete(X_train, np.s_[10:20], axis=1)
X_test = np.delete(X_test, np.s_[10:20], axis=1)

In [12]:
# Baseline SVC for this feature split, tuned by grid search on accuracy.
# NOTE(review): in the recorded run this model predicted a single class for
# every test sample, and no feature scaling is applied in the cells shown
# before the RBF kernel. Worth confirming whether that is intended.
svc = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid_svc,
    scoring='accuracy',
    return_train_score=True,
)
svc.fit(X_train, y_train)

# Per-class precision, recall, F-score and support on the test set.
precision_recall_fscore_support(y_true=y_test, y_pred=svc.predict(X_test))

  'precision', 'predicted', average, warn_for)


(array([0.        , 0.59649123]),
 array([0., 1.]),
 array([0.        , 0.74725275]),
 array([ 69, 102]))

In [13]:
# Test-set accuracy of the SVC baseline for this feature split.
accuracy_score(y_true=y_test, y_pred=svc.predict(X_test))

0.5964912280701754

In [14]:
# SVC2Plus for this feature split: grid search over param_grid_svc2p, with the
# privileged `standard error` columns supplied to fit() through Z.
svc2p = GridSearchCV(
    estimator=SVC2Plus(),
    param_grid=param_grid_svc2p,
    scoring='accuracy',
    return_train_score=True,
)
svc2p.fit(X_train, y_train, Z=Z_train)

# Store the test predictions so the next cell can reuse them.
preds = svc2p.predict(X_test)
precision_recall_fscore_support(y_true=y_test, y_pred=preds)

(array([1.       , 0.6035503]),
 array([0.02898551, 1.        ]),
 array([0.05633803, 0.75276753]),
 array([ 69, 102]))

In [15]:
# Test-set accuracy of SVC2Plus for this feature split.
accuracy_score(y_true=y_test, y_pred=preds)

0.6081871345029239

## Privileged Features = `worst`

In [18]:
# Re-create the same 70/30 split (identical seed) from the full feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1618, test_size=0.3
)

# Columns 20 onward (the `worst` features) are the privileged ones here; the
# first twenty columns remain as the regular training and test inputs.
X_train, Z_train = X_train[:, :20], X_train[:, 20:]
X_test = X_test[:, :20]

In [19]:
# Baseline SVC on the non-privileged columns, tuned by grid search on accuracy.
svc = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid_svc,
    scoring='accuracy',
    return_train_score=True,
)
svc.fit(X_train, y_train)

# Per-class precision, recall, F-score and support on the test set.
precision_recall_fscore_support(y_true=y_test, y_pred=svc.predict(X_test))

(array([0.88709677, 0.87155963]),
 array([0.79710145, 0.93137255]),
 array([0.83969466, 0.90047393]),
 array([ 69, 102]))

In [20]:
# Test-set accuracy of the SVC baseline with the `worst` columns withheld.
accuracy_score(y_true=y_test, y_pred=svc.predict(X_test))

0.8771929824561403

In [27]:
# SVC2Plus with the `worst` columns as privileged information: grid search
# over param_grid_svc2p, with Z_train forwarded to fit() as a keyword argument.
svc2p = GridSearchCV(
    estimator=SVC2Plus(),
    param_grid=param_grid_svc2p,
    scoring='accuracy',
    return_train_score=True,
)
svc2p.fit(X_train, y_train, Z=Z_train)

# Keep the test predictions for the accuracy computation in the next cell.
preds = svc2p.predict(X_test)
precision_recall_fscore_support(y_true=y_test, y_pred=preds)

(array([0.8852459 , 0.86363636]),
 array([0.7826087 , 0.93137255]),
 array([0.83076923, 0.89622642]),
 array([ 69, 102]))

In [28]:
# Test-set accuracy of SVC2Plus with the `worst` columns as privileged data.
accuracy_score(y_true=y_test, y_pred=preds)

0.8713450292397661