# Import Dependencies

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.utils import to_categorical
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.


# Create Dataframe

In [2]:
df = pd.read_csv("../Resources/heart.csv", sep=";")
df["age"] = (df["age"]/365).apply(np.floor).astype(int)
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
cardiodf = df[["age", "gender", "height", "weight", "ap_hi", "ap_lo",
              "cholesterol", "gluc", "smoke", "alco", "active", "cardio"]]
cardiodf.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,47,1,156,56.0,100,60,1,1,0,0,0,0


# Data Preprocessing

In [4]:
X = cardiodf.drop("cardio", axis=1)
y = cardiodf["cardio"]
feature_names = X.columns
print(X.shape, y.shape)

(70000, 11) (70000,)


# Train_Test_Split & MinMaxScaler

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
X_scaler = MinMaxScaler().fit(X_train)

In [7]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# One-Hot Encoding

In [8]:
# one-hot encoding y feature
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

# one-hot encoding X features
X_train_categorical = to_categorical(X_train[["gluc", "smoke", "alco", "active", "cholesterol", "gender"]])
X_test_categorical = to_categorical(X_test[["gluc", "smoke", "alco", "active", "cholesterol", "gender"]])

# Support Vector Machine

In [9]:
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# R2 Score

In [10]:
trainSVM = model.score(X_train_scaled, y_train)
testSVM = model.score(X_test_scaled, y_test)

print(f"Training Score: {trainSVM}")
print(f"Testing Score: {testSVM}")

Training Score: 0.6489333333333334
Testing Score: 0.6462857142857142


# Hyperparameter Tuning with GridSearch

In [11]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=2)

In [12]:
grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................ C=1, gamma=0.0001, total=  34.7s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   34.7s remaining:    0.0s


[CV] ................................ C=1, gamma=0.0001, total=  34.7s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=  34.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................................ C=1, gamma=0.0005, total=  35.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................................ C=1, gamma=0.0005, total=  34.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................................ C=1, gamma=0.0005, total=  34.5s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=  38.1s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=  39.9s
[CV] C=1, gamma=0.001 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 36.9min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

# Best Score:

In [13]:
print(grid.best_score_)

0.6925904761904762


In [14]:
trainCV = grid.score(X_train_scaled, y_train)
testCV = grid.score(X_test_scaled, y_test)

print(f"Training Score BEFORE GridSearchCV: {trainSVM}")
print(f"Testing Score BEFORE GridSearchCV: {testSVM}")
print("--------------------------------------------------------------")
print(f"Training Score AFTER GridSearchCV: {trainCV}")
print(f"Testing Score AFTER GridSearchCV: {testCV}")

Training Score BEFORE GridSearchCV: 0.6489333333333334
Testing Score BEFORE GridSearchCV: 0.6462857142857142
--------------------------------------------------------------
Training Score AFTER GridSearchCV: 0.7007809523809524
Testing Score AFTER GridSearchCV: 0.6972


# Predictions

In [15]:
predictions = grid.predict(X_test_scaled)

In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.66      0.78      0.72      8609
           1       0.74      0.62      0.67      8891

    accuracy                           0.70     17500
   macro avg       0.70      0.70      0.70     17500
weighted avg       0.70      0.70      0.70     17500



# This is not a viable model

# Save the Model

In [17]:
import pickle

filename = 'svm_minmax_gridsearch.sav'
pickle.dump(model, open(filename, 'wb'))