# Objective
- base model without oversampling and feature selection & fine tune it
- base model with oversampling and feature selection & fine tune it

In [47]:

import warnings

import pandas as pd
from numpy import arange
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
from sklearn.feature_selection import chi2,SelectKBest
from imblearn.over_sampling import SMOTE

import models.uci_heart_disease_dataset as uci

warnings.filterwarnings("ignore")

In [36]:
data = pd.read_csv(uci.UCIHeartDiseaseDataFile.cleveland_preprocessed, names = uci.get_standard_features())
print(f'Data shape: {data.shape}.')
data.head(3)

Data shape: (299, 14).


Unnamed: 0,Age,Gender,Chest Pain,BP Systolic,Cholesterol,Blood Sugar,Rest ECG,Exe. Max Heartrate,Exe. Induced Angina,Exe. ST Depression,Exe. ST Segment Slope,Major Vessels,Thalassemia,Target
0,63,1,1,145,233.0,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286.0,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229.0,0,2,129,1,2.6,2,2,7,1


In [37]:
# A function to build base model.
def build_base_model(x_set, y_set):
    scaler = StandardScaler()
    x_set = scaler.fit_transform(x_set)
    x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size = 0.20, random_state=0)

    svc= SVC()
    svc.fit(x_train, y_train)
    y_pred = svc.predict(x_test)
    acc=accuracy_score(y_test, y_pred)*100
    print("SVM - Accuracy: {:.3f}.".format(acc))
    print("\nClassification Report")
    print(classification_report(y_test, y_pred))

In [38]:
# A function to build and fine tune model using GridSearchCV.
def fine_tune_model_with_grid_search_cv(x_set, y_set):
    scaler = StandardScaler()
    x_set = scaler.fit_transform(x_set)
    x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size = 0.20, random_state=0)

    grid_svc= SVC()
    parameters = dict()
    parameters['kernel'] = ['rbf', 'poly', 'linear', 'sigmoid']
    parameters['C'] = arange(1, 10, 1)
    parameters['gamma'] = ['scale', 'auto']
    parameters['class_weight'] = ['dict', 'balanced']

    ## Building Grid Search algorithm with cross-validation and acc score.

    # grid_search_svc = GridSearchCV(grid_svc, parameters, scoring='accuracy', cv=5, n_jobs=-1)
    grid_search_svc = GridSearchCV(grid_svc, parameters, scoring='accuracy', cv=5)

    grid_search_svc.fit(x_train,y_train)
    best_parameters_svc = grid_search_svc.best_params_
    best_score_svc = grid_search_svc.best_score_
    print(best_parameters_svc)
    print(best_score_svc)

    y_pred = grid_search_svc.predict(x_test)

    # Get the accuracy score
    svc_acc=accuracy_score(y_test, y_pred)*100
    svc_pre=precision_score(y_test, y_pred, average='micro')
    svc_recall=recall_score(y_test, y_pred, average='micro')
    svc_f1_=f1_score(y_test, y_pred, average='micro')

    print("\nSVM - Accuracy: {:.3f}.".format(svc_acc))
    print("SVM - Precision: {:.3f}.".format(svc_pre))
    print("SVM - Recall: {:.3f}.".format(svc_recall))
    print("SVM - F1_Score: {:.3f}.".format(svc_f1_))
    print("\nClassification Report")
    print(classification_report(y_test, y_pred))

In [39]:
# A function to build and fine tune model using RandomSearchCV.
def fine_tune_model_with_random_search_cv(x_set, y_set):
    scaler = StandardScaler()
    x_set = scaler.fit_transform(x_set)
    x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size = 0.20, random_state=0)

    rand_svc= SVC()

    parameters = dict()
    parameters['kernel'] = ['rbf', 'poly', 'linear', 'sigmoid']
    parameters['C'] = arange(1, 10, 1)
    parameters['gamma'] = ['scale', 'auto']
    parameters['class_weight'] = ['dict', 'balanced']

    ## Building Grid Search algorithm with cross-validation and acc score.

    rand_search_svc = RandomizedSearchCV(rand_svc, parameters, scoring='accuracy', cv=5)

    rand_search_svc.fit(x_train,y_train)
    best_parameters_svc = rand_search_svc.best_params_
    best_score_svc = rand_search_svc.best_score_
    print(best_parameters_svc)
    print(best_score_svc)

    y_pred = rand_search_svc.predict(x_test)

    # Get the accuracy score
    svc_acc=accuracy_score(y_test, y_pred)*100
    svc_pre=precision_score(y_test, y_pred, average='micro')
    svc_recall=recall_score(y_test, y_pred, average='micro')
    svc_f1_=f1_score(y_test, y_pred, average='micro')

    print("\nSVM - Accuracy: {:.3f}.".format(svc_acc))
    print("SVM - Precision: {:.3f}.".format(svc_pre))
    print("SVM - Recall: {:.3f}.".format(svc_recall))
    print("SVM - F1_Score: {:.3f}.".format(svc_f1_))
    print("\nClassification Report")
    print(classification_report(y_test, y_pred))

### Data without over-sampling and feature selection

In [40]:
X = data.drop(uci.UCIHeartDiseaseData.target, axis=1)
y = data[uci.UCIHeartDiseaseData.target]
build_base_model(X, y)

SVM - Accuracy: 85.000.

Classification Report
              precision    recall  f1-score   support

           0       0.78      0.97      0.86        29
           1       0.96      0.74      0.84        31

    accuracy                           0.85        60
   macro avg       0.87      0.85      0.85        60
weighted avg       0.87      0.85      0.85        60



In [41]:
X = data.drop(uci.UCIHeartDiseaseData.target, axis=1)
y = data[uci.UCIHeartDiseaseData.target]
fine_tune_model_with_random_search_cv(X, y)

{'kernel': 'rbf', 'gamma': 'auto', 'class_weight': 'balanced', 'C': 1}
0.799290780141844

SVM - Accuracy: 85.000.
SVM - Precision: 0.850.
SVM - Recall: 0.850.
SVM - F1_Score: 0.850.

Classification Report
              precision    recall  f1-score   support

           0       0.78      0.97      0.86        29
           1       0.96      0.74      0.84        31

    accuracy                           0.85        60
   macro avg       0.87      0.85      0.85        60
weighted avg       0.87      0.85      0.85        60



In [42]:
X = data.drop(uci.UCIHeartDiseaseData.target, axis=1)
y = data[uci.UCIHeartDiseaseData.target]
fine_tune_model_with_grid_search_cv(X, y)

{'C': 1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'sigmoid'}
0.8284574468085106

SVM - Accuracy: 85.000.
SVM - Precision: 0.850.
SVM - Recall: 0.850.
SVM - F1_Score: 0.850.

Classification Report
              precision    recall  f1-score   support

           0       0.79      0.93      0.86        29
           1       0.92      0.77      0.84        31

    accuracy                           0.85        60
   macro avg       0.86      0.85      0.85        60
weighted avg       0.86      0.85      0.85        60



### Data with over-sampling and feature selection

In [54]:
chi_squared = SelectKBest(score_func=chi2, k=13)
chi_squared.fit(X, y)
feature_score = pd.DataFrame({"Score":chi_squared.scores_, "P_Value":np.round(chi_squared.pvalues_,3)}, index=X.columns)
feature_score.nlargest(n=13,columns="Score")

Unnamed: 0,Score,P_Value
Exe. Max Heartrate,197.354493,0.0
Major Vessels,80.790297,0.0
Exe. ST Depression,68.733906,0.0
Thalassemia,66.601762,0.0
Exe. Induced Angina,36.486887,0.0
Age,22.263468,0.0
BP Systolic,16.867552,0.0
Cholesterol,15.115732,0.0
Chest Pain,14.902669,0.0
Exe. ST Segment Slope,7.973535,0.005


In [57]:
chi_squared = SelectKBest(score_func=chi2, k=9)
X_fs = chi_squared.fit_transform(X, y)

In [58]:
X_b, y_b = SMOTE().fit_resample(X_fs, y)
X_b.shape

build_base_model(X_b, y_b)

SVM - Accuracy: 78.125.

Classification Report
              precision    recall  f1-score   support

           0       0.71      0.86      0.77        28
           1       0.87      0.72      0.79        36

    accuracy                           0.78        64
   macro avg       0.79      0.79      0.78        64
weighted avg       0.80      0.78      0.78        64



In [59]:
fine_tune_model_with_random_search_cv(X_b, y_b)

{'kernel': 'rbf', 'gamma': 'auto', 'class_weight': 'balanced', 'C': 4}
0.8279034690799396

SVM - Accuracy: 78.125.
SVM - Precision: 0.781.
SVM - Recall: 0.781.
SVM - F1_Score: 0.781.

Classification Report
              precision    recall  f1-score   support

           0       0.72      0.82      0.77        28
           1       0.84      0.75      0.79        36

    accuracy                           0.78        64
   macro avg       0.78      0.79      0.78        64
weighted avg       0.79      0.78      0.78        64



In [60]:
fine_tune_model_with_random_search_cv(X_b,y_b)

{'kernel': 'rbf', 'gamma': 'auto', 'class_weight': 'balanced', 'C': 3}
0.8200603318250377

SVM - Accuracy: 76.562.
SVM - Precision: 0.766.
SVM - Recall: 0.766.
SVM - F1_Score: 0.766.

Classification Report
              precision    recall  f1-score   support

           0       0.70      0.82      0.75        28
           1       0.84      0.72      0.78        36

    accuracy                           0.77        64
   macro avg       0.77      0.77      0.77        64
weighted avg       0.78      0.77      0.77        64

