# Performance Tuning the SVC Model

## 1. Import the necessary libraries

In [3]:
%matplotlib inline
from IPython.display import display, clear_output
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, make_scorer, precision_recall_curve, precision_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
import ipywidgets as widgets
import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

plt.style.use("ggplot")

print("The libraries were successfully loaded!")


The libraries were successfully loaded!


## 2. Import the data

In [4]:
# Import Dataset
df = pd.read_csv('data/cleansed-healthcare-dataset-stroke-data.csv',delimiter=',',header='infer')

# Display top rows of the dataset
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,0,79.0,1,0,1,3,0,174.12,24.0,2,1
4,1,81.0,0,0,1,2,1,186.21,29.0,1,1


## 3. Split the data into a training and testing set

In [5]:
# Separate features and labels
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status']
label = 'stroke'
X, y = df[features].values, df[label].values

# Split data 70%-30% into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)


print('Training cases: %d\nTest cases: %d' % (X_train.size, X_test.size))

Training cases: 34360
Test cases: 14730


## 4. Define the SVC parameters to test

In [6]:
# Create model
clf = SVC(random_state=0, probability=True)

# Define models hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
    'degree': [1, 3, 5],
    'gamma' : ['scale', 'auto'],
    'coef0': [0, 1],
    'shrinking': [True, False],
    'tol': [0.00001, 0.001, 0.1],
    'cache_size': [100, 200, 300],
    'verbose': [True, False],
    'max_iter': [1, 3, 5],
    'decision_function_shape': ['ovo', 'ovr'],
    'break_ties': [True, False]
}

# Define scores to test
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

param_grid

{'C': [0.1, 1, 10],
 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
 'degree': [1, 3, 5],
 'gamma': ['scale', 'auto'],
 'coef0': [0, 1],
 'shrinking': [True, False],
 'tol': [1e-05, 0.001, 0.1],
 'cache_size': [100, 200, 300],
 'verbose': [True, False],
 'max_iter': [1, 3, 5],
 'decision_function_shape': ['ovo', 'ovr'],
 'break_ties': [True, False]}

In [9]:
def grid_search_wrapper(refit_score='precision_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    #  Define grid search
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    # Train the models 
    grid_search.fit(X_train.values, y_train.values)

    # Test the models
    y_pred = grid_search.predict(X_test.values)
    
    # Print the best hyperparameters
    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

## 5. Test to see which parameters achieve the largest recall_score

In [13]:
# Perform grid search to identify best hyperparameters for recall_score
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

  return f(*args, **kwargs)


[LibSVM]Best params for recall_score
{'C': 0.1, 'break_ties': True, 'cache_size': 100, 'coef0': 0, 'decision_function_shape': 'ovr', 'degree': 1, 'gamma': 'auto', 'kernel': 'sigmoid', 'max_iter': 1, 'shrinking': True, 'tol': 1e-05, 'verbose': True}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg         0      1418
pos         0        55


In [10]:
# Define new hyperparameters
param_grid = {'C': [0.01, 0.1, 0.5],
 'kernel': ['sigmoid'],
 'degree': [0, 1, 2],
 'gamma': ['auto'],
 'coef0': [0],
 'shrinking': [True],
 'tol': [1.00000005, 1.0000005, 1.000005],
 'cache_size': [50, 100, 150],
 'verbose': [True],
 'max_iter': [0, 1, 2],
 'decision_function_shape': ['ovr'],
 'break_ties': [True]}


In [11]:
# Perform grid search to identify best hyperparameters for recall_score
print("Round 2")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

Round 2
[LibSVM]Best params for recall_score
{'C': 0.01, 'break_ties': True, 'cache_size': 50, 'coef0': 0, 'decision_function_shape': 'ovr', 'degree': 0, 'gamma': 'auto', 'kernel': 'sigmoid', 'max_iter': 0, 'shrinking': True, 'tol': 1.00000005, 'verbose': True}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg         0      1418
pos         0        55


  return f(*args, **kwargs)


In [12]:
# Define models hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
    'degree': [1, 3, 5],
    'gamma' : ['scale', 'auto'],
    'coef0': [0, 1],
    'shrinking': [True, False],
    'tol': [0.00001, 0.001, 0.1],
    'cache_size': [100, 200, 300],
    'verbose': [True, False],
    'max_iter': [1, 3, 5],
    'decision_function_shape': ['ovo', 'ovr'],
    'break_ties': [True, False]
}
# Perform grid search to identify best hyperparameters for precision_score
print("Round 3")
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

Round 3


  return f(*args, **kwargs)


[LibSVM]Best params for precision_score
{'C': 0.1, 'break_ties': True, 'cache_size': 100, 'coef0': 0, 'decision_function_shape': 'ovr', 'degree': 5, 'gamma': 'auto', 'kernel': 'poly', 'max_iter': 5, 'shrinking': True, 'tol': 1e-05, 'verbose': True}

Confusion matrix of Random Forest optimized for precision_score on the test data:
     pred_neg  pred_pos
neg      1352        66
pos        42        13


In [13]:
# Define models hyperparameters
param_grid = {
    'C': [0.001, 0.01, 0.1],
    'kernel': ['poly'], 
    'degree': [5, 10, 15],
    'gamma' : ['auto'],
    'coef0': [0],
    'shrinking': [True],
    'tol': [0.0000001, 0.000001, 0.00001],
    'cache_size': [50, 100, 150],
    'verbose': [True],
    'max_iter': [5, 10, 15],
    'decision_function_shape': ['ovr'],
    'break_ties': [True]
}
# Perform grid search to identify best hyperparameters for precision_score
print("Round 4")
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

Round 4
[LibSVM]Best params for precision_score
{'C': 0.001, 'break_ties': True, 'cache_size': 50, 'coef0': 0, 'decision_function_shape': 'ovr', 'degree': 5, 'gamma': 'auto', 'kernel': 'poly', 'max_iter': 5, 'shrinking': True, 'tol': 1e-07, 'verbose': True}

Confusion matrix of Random Forest optimized for precision_score on the test data:
     pred_neg  pred_pos
neg      1352        66
pos        42        13


  return f(*args, **kwargs)


In [14]:
 # Define models hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'], 
    'degree': [1, 3, 5],
    'gamma' : ['scale', 'auto'],
    'coef0': [0, 1],
    'shrinking': [True, False],
    'tol': [0.00001, 0.001, 0.1],
    'cache_size': [100, 200, 300],
    'verbose': [True, False],
    'max_iter': [1, 3, 5],
    'decision_function_shape': ['ovo', 'ovr'],
    'break_ties': [True, False]
}
print("Round 5")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

Round 5
[LibSVM]Best params for recall_score
{'C': 0.1, 'break_ties': True, 'cache_size': 100, 'coef0': 0, 'decision_function_shape': 'ovr', 'degree': 1, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': 3, 'shrinking': True, 'tol': 1e-05, 'verbose': True}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg       633       785
pos         1        54


  return f(*args, **kwargs)
