# Support Vector Machines Heart Disease Dataset with Python

## Imports

In [235]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

This is my own made up dataset. I have used this with many other algorithms. Although the dataset is not real, the real world scenarios are considered when creating the dataset.
<br />
<br />
The dataset contains 7 features and 1 target which is either 1 or 0 ( Male or Female).

In [236]:
df = pd.read_csv('gender_classification_v7.csv')

In [237]:
df.head(2)

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,Male
0,1,11.8,6.1,1,0,1,1,1
1,0,14.0,5.4,0,0,1,0,0


In [238]:
# 1 being Male and 0 being Female
df['Male'].unique()

array([1, 0], dtype=int64)

In [239]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   Male                       5001 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 312.7 KB


In [240]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
long_hair,5001.0,0.869626,0.336748,0.0,1.0,1.0,1.0,1.0
forehead_width_cm,5001.0,13.181484,1.107128,11.4,12.2,13.1,14.0,15.5
forehead_height_cm,5001.0,5.946311,0.541268,5.1,5.5,5.9,6.4,7.1
nose_wide,5001.0,0.493901,0.500013,0.0,0.0,0.0,1.0,1.0
nose_long,5001.0,0.507898,0.499988,0.0,0.0,1.0,1.0,1.0
lips_thin,5001.0,0.493101,0.500002,0.0,0.0,0.0,1.0,1.0
distance_nose_to_lip_long,5001.0,0.4989,0.500049,0.0,0.0,0.0,1.0,1.0
Male,5001.0,0.4999,0.50005,0.0,0.0,0.0,1.0,1.0


## Train Test Split

In [241]:
from sklearn.model_selection import train_test_split

In [242]:
X = df.drop('Male',axis=1)
y = df['Male']

In [243]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# Train the Support Vector Classifier

In [244]:
from sklearn.svm import SVC

In [245]:
# C=1.0, kernel='rbf', gamma='scale'
model = SVC()

In [246]:
model.fit(X_train,y_train)

SVC()

## Predictions and Evaluations

Now let's predict using the trained model.

In [247]:
predictions = model.predict(X_test)

In [248]:
from sklearn.metrics import classification_report,confusion_matrix

In [249]:
print(confusion_matrix(y_test,predictions))

[[365  11]
 [ 10 365]]


In [250]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       376
           1       0.97      0.97      0.97       375

    accuracy                           0.97       751
   macro avg       0.97      0.97      0.97       751
weighted avg       0.97      0.97      0.97       751



We are classifying everything into a single class! This means our model needs to have it parameters adjusted (it may also help to normalize the data).

We can search for parameters using a GridSearch!

# Gridsearch

Let's find the right parameters like what C or gamma values to use using the grid seaarch.

In [251]:
grid_params = {'C': [0.001,0.01,0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [252]:
from sklearn.model_selection import GridSearchCV

We should add refit=True and choose verbose to whatever number you want, higher the number, the more verbose (verbose just means the text output describing the process).

In [253]:
grid = GridSearchCV(SVC(), grid_params, refit=True, verbose=3)

In [254]:
# May take awhile!
grid.fit(X_train,y_train)

Fitting 5 folds for each of 35 candidates, totalling 175 fits
[CV] C=0.001, gamma=1, kernel=rbf ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ C=0.001, gamma=1, kernel=rbf, score=0.939, total=   1.0s
[CV] C=0.001, gamma=1, kernel=rbf ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ........ C=0.001, gamma=1, kernel=rbf, score=0.942, total=   0.9s
[CV] C=0.001, gamma=1, kernel=rbf ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s remaining:    0.0s


[CV] ........ C=0.001, gamma=1, kernel=rbf, score=0.952, total=   0.9s
[CV] C=0.001, gamma=1, kernel=rbf ....................................
[CV] ........ C=0.001, gamma=1, kernel=rbf, score=0.948, total=   0.9s
[CV] C=0.001, gamma=1, kernel=rbf ....................................
[CV] ........ C=0.001, gamma=1, kernel=rbf, score=0.932, total=   0.9s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] ...... C=0.001, gamma=0.1, kernel=rbf, score=0.949, total=   0.9s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] ...... C=0.001, gamma=0.1, kernel=rbf, score=0.945, total=   0.9s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] ...... C=0.001, gamma=0.1, kernel=rbf, score=0.953, total=   0.9s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] ...... C=0.001, gamma=0.1, kernel=rbf, score=0.956, total=   0.9s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] .

[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.964, total=   0.4s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.965, total=   0.4s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.962, total=   0.4s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.969, total=   0.4s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.962, total=   0.4s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.925, total=   0.9s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.924, total=   0.9s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] .

[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.968, total=   0.2s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.964, total=   0.4s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.965, total=   0.3s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.962, total=   0.3s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.969, total=   0.4s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ...... C=10, gamma=0.0001, kernel=rbf, score=0.965, total=   0.3s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.966, total=   0.2s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .

[Parallel(n_jobs=1)]: Done 175 out of 175 | elapsed:  1.2min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [255]:
# Check the best parameters
grid.best_params_

{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}

In [256]:
grid.best_estimator_

SVC(C=1000, gamma=0.01)

Then you can re-run predictions on this grid object just like you would with a normal model.

In [257]:
grid_predictions = grid.predict(X_test)

In [258]:
print(confusion_matrix(y_test,grid_predictions))

[[368   8]
 [  9 366]]


In [259]:
print(classification_report(y_test,grid_predictions))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       376
           1       0.98      0.98      0.98       375

    accuracy                           0.98       751
   macro avg       0.98      0.98      0.98       751
weighted avg       0.98      0.98      0.98       751



## Test a Sample

In [260]:
test_index = 1

test_dic = {0:'Female', 1: 'Male'}
y_test_np = np.array(y_test)
print(f'Actual     --> {test_dic[y_test_np[test_index]]}\nPrediction --> {test_dic[predictions[test_index]]}')

Actual     --> Female
Prediction --> Female
