Note that this largely leverages 'Breast Cancer Classification Using Support Vector Machine (SVM)' – Towards Data Science post available [here](https://towardsdatascience.com/breast-cancer-classification-using-support-vector-machine-svm-a510907d4878) and repository available [here](https://github.com/nalamidi/Breast-Cancer-Classification-with-Support-Vector-Machine/blob/master/Breast%20Cancer%20Classification.ipynb). The scikit-learn SVM documentation is available [here](https://scikit-learn.org/stable/modules/svm.html#svm-kernels), and the C-Support Vector Classification documentation is available [here](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC). This also leverages an (unofficial) implementation of the Relevance Vector Machine (RVM) available from [here](https://github.com/JamesRitchie/scikit-rvm). 

Evaluating performance scikit-learn documentation can be found [here](https://scikit-learn.org/stable/developers/performance.html) and [here](https://jakevdp.github.io/PythonDataScienceHandbook/01.07-timing-and-profiling.html).  

Next steps – 
1. Expand GridSearch hyperparamaters to optimize. Make sure paramaters are comperable across model comparisons!
2. Be more thoughtful about kernels. 
3. Bootstrap original dataset. 
4. Evaluate computational efficiency. 
5. Draft EDA for write-up / appendix. 

In [12]:
import time
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from skrvm import RVC

import svm_rvm_helpers as helpers

pd.set_option('display.max_columns', 500)

In [13]:
# load dataset
cancer = load_breast_cancer()
cancer_df = pd.DataFrame(np.c_[cancer['data'], cancer['target']], 
                         columns = np.append(cancer['feature_names'], ['target']))
print(cancer_df.shape)
cancer_df.head()

(569, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [14]:
# compute share of positive class for baseline comparison
cancer_df['target'].sum() / cancer_df['target'].count()

0.6274165202108963

In [15]:
# split into training and testing 
def prep_data(cancer_df):
    '''
    Takes: cancer dataframe
    Returns: normalized train and test data, ready for modeling
    '''
    
    X = cancer_df.drop(['target'], axis = 1)
    y = cancer_df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

    # normalize data
    X_train_scaled = helpers.normalize(X_train)
    X_test_scaled = helpers.normalize(X_test)
    
    return X_train_scaled, y_train, X_test_scaled, y_test

In [16]:
# build simple SVM 
def build_simple_SVM(X_train_scaled, y_train, X_test_Scaled, y_test):
    '''
    Takes: normalized training features, training labels, normalized test features, test labels
    Returns: none, but it prints stuff
    '''
    svc_model = SVC(gamma = 'auto')
    start = time.time()
    svc_model.fit(X_train_scaled, y_train)
    print("time to fit simple SVM: ", time.time() - start)
    svc_predict = svc_model.predict(X_test_scaled)

    # evaluate simple SVM 
    print(helpers.confusion_matrix(y_test, svc_predict))
    print(classification_report(y_test, svc_predict))
    return 

In [17]:
X_train_scaled, y_train, X_test_scaled, y_test = prep_data(cancer_df)

build_simple_SVM(X_train_scaled, y_train, X_test_scaled, y_test)

time to fit simple SVM:  0.004952669143676758
[[39  2]
 [ 0 73]]
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97        41
         1.0       0.97      1.00      0.99        73

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [18]:
def grid_svm(param_grid, X_train_scaled, y_train):
    '''
    Takes: hyperparameter dict, normalized training data, training labels
    Returns: optimized grid
    '''
    grid = GridSearchCV(SVC(), param_grid, refit = True, cv = 5)
    print("fitting SVM grid:")
    %timeit grid.fit(X_train_scaled, y_train)

    # print best SVM parameters 
    print("SVM best params:")
    print(grid.best_params_)
    print(grid.best_estimator_)
    print("\n")
    
    return grid


def predict_optimized_SVM(X_test_scaled, y_test, grid):
    '''
    Takes: training data, optimized grid
    Returns: nothing, but it prints stuff
    '''
    
    # predict with optimized SVM
    start = time.time()
    grid_predict = grid.predict(X_test_scaled)
    print("time to predict with optimized SVM: ", time.time()-start)
    print(helpers.confusion_matrix(y_test, grid_predict))
    print(classification_report(y_test, grid_predict))
    
    return

In [19]:
# optimize SVM hyperparametes 
param_grid = {'C': [0.1, 1, 10, 100], 
              'gamma': [1, 0.1, 0.01, 0.001], 
              'kernel': ['rbf']}

grid = grid_svm(param_grid, X_train_scaled, y_train)
predict_optimized_SVM(X_test_scaled, y_test, grid)

fitting SVM grid:
466 ms ± 18.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
SVM best params:
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


time to predict with optimized SVM:  0.0009720325469970703
[[40  1]
 [ 2 71]]
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.96        41
         1.0       0.99      0.97      0.98        73

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [20]:
def build_and_run_rvc(X_train_scaled, y_train, X_test_scaled, y_test):
    '''
    Takes: training and testing data
    Returns: nothing, but it prints
    '''
    
    
    # build RVC 
    rvc_model = RVC()
    start = time.time()
    rvc_model.fit(X_train_scaled, y_train)
    print("time to fit RVM: ", time.time()-start)
    start = time.time()
    rvc_predict = rvc_model.predict(X_test_scaled)
    print("time to predict with RVM: ", time.time() - start)

    # print parameters
    print("RVM hyperparameters:")
    print(rvc_model.get_params())

    # evaluate RVC
    print(helpers.confusion_matrix(y_test, rvc_predict))
    print(classification_report(y_test, rvc_predict))

In [21]:
build_and_run_rvc(X_train_scaled, y_train, X_test_scaled, y_test)

time to fit RVM:  14.110707998275757
time to predict with RVM:  0.0008928775787353516
RVM hyperparameters:
{'kernel': 'rbf', 'degree': 3, 'coef1': None, 'coef0': 0.0, 'n_iter': 3000, 'tol': 0.001, 'alpha': 1e-06, 'threshold_alpha': 1000000000.0, 'beta': 1e-06, 'beta_fixed': False, 'bias_used': True, 'verbose': False, 'n_iter_posterior': 50}
[[40  1]
 [ 2 71]]
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.96        41
         1.0       0.99      0.97      0.98        73

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [22]:
# explore what happens if we introduce random noise

for i in [1, 3, 5, 10]:
    
    print("\n")
    print("BEGIN RUN WITH BOOTSTRAP MULTIPLE ", i)
    print("\n")
    
    noisy_cancer_df = helpers.bootstrap_with_noise(cancer_df, i)
    print(noisy_cancer_df.shape)
    noisy_cancer_df.head()

    # prep data
    X_train_scaled, y_train, X_test_scaled, y_test = prep_data(noisy_cancer_df)

    # train SVM
    build_simple_SVM(X_train_scaled, y_train, X_test_scaled, y_test)

    grid = grid_svm(param_grid, X_train_scaled, y_train)
    predict_optimized_SVM(X_test_scaled, y_test, grid)
    
    build_and_run_rvc(X_train_scaled, y_train, X_test_scaled, y_test)
    
    print("\n")
    print("#======================================#")



BEGIN RUN WITH BOOTSTRAP MULTIPLE  1


(569, 31)
time to fit simple SVM:  0.006591796875
[[34 16]
 [ 0 64]]
              precision    recall  f1-score   support

         0.0       1.00      0.68      0.81        50
         1.0       0.80      1.00      0.89        64

    accuracy                           0.86       114
   macro avg       0.90      0.84      0.85       114
weighted avg       0.89      0.86      0.85       114

fitting SVM grid:
552 ms ± 20.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
SVM best params:
{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


time to predict with optimized SVM:  0.0013129711151123047
[[40 10]
 [ 3 61]]
              precision    recall  f1-score   support

         0.0       0.93      0.80      0.86        50
 

KeyboardInterrupt: 

In [None]:
noisy_df = helpers.bootstrap_with_noise(cancer_df, 1.5)
print(noisy_df.shape)
X_train_scaled, y_train, X_test_scaled, y_test = prep_data(noisy_cancer_df)
build_and_run_rvc(X_train_scaled, y_train, X_test_scaled, y_test)

(854, 31)
