# Objective
- base model without oversampling and feature selection & fine tune it
- base model with oversampling and feature selection & fine tune it

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC

import models.uci_heart_disease_dataset as uci

warnings.filterwarnings("ignore")

In [None]:
data = pd.read_csv(uci.UCIHeartDiseaseDataFile.cleveland_preprocessed, names = uci.get_standard_features())
print(f'Data shape: {data.shape}.')
data.head(3)

In [None]:
X = data.drop(uci.UCIHeartDiseaseData.target, axis=1)
y = data[uci.UCIHeartDiseaseData.target]

columns = X.columns
X

In [None]:
y

In [None]:
def build_base_model():
    svc= SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    acc=accuracy_score(y_test, y_pred)*100
    print("SVM - Accuracy: {:.3f}.".format(acc))
    print("\nClassification Report")
    print(classification_report(y_test, y_pred))

## Feature Selection

In [None]:
from sklearn.feature_selection import chi2,SelectKBest

chi_squared = SelectKBest(score_func=chi2, k=13)
chi_squared.fit(X, y)
feature_score = pd.DataFrame({"Score":chi_squared.scores_, "P_Value":np.round(chi_squared.pvalues_,3)}, index=X.columns)
feature_score.nlargest(n=13,columns="Score")

## Cross Balancing

In [None]:
from collections import Counter
# Need to install imbalanced-learn
from imblearn.over_sampling import SMOTE
X_b, y_b = SMOTE().fit_resample(X, y)

plt.subplots(figsize=(3,3))
sns.countplot(x=y_b)
print(Counter(y_b))
X_b.shape

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_b, y_b, test_size = 0.20, random_state=0)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
X[0:5]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)

In [None]:
X_test[0:5]

In [None]:
y_test.head()

# Base model
- classification report
- confusion matrix

In [None]:
svc= SVC(verbose=True)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc=accuracy_score(y_test, y_pred)*100
print("SVM - Accuracy: {:.3f}.".format(acc))
print("\nClassification Report")
print(classification_report(y_test, y_pred))
print(svc.get_params())

In [None]:
# There were 60 records in test with equal class (1 and 0).
# Meanwhile, the prediction was 24:36
pd.DataFrame(y_pred).value_counts()

- <b>Precision</b> - 92% (22/24) prediction was correct in overall disease prediction
- <b>Recall</b> - 73% (22/30) prediction for disease was correct

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
plt.figure(figsize=(5, 5))
sns.heatmap(cm, annot=True, linewidths=.5, square=True, cmap='Blues', cbar=False)
# plt.ylabel('Actual');
# plt.xlabel('Predicted');
plt.ylabel('Truth');
plt.xlabel('Predicted');
plt.title("Confusion Matrix for Predicting Heart Disease (True=1, False=0)", size=12);

### GridSerachCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from numpy import arange
grid_svc= SVC()

parameters = dict()
parameters['kernel'] = ['rbf', 'poly', 'linear', 'sigmoid']
parameters['C'] = arange(1, 10, 1)
parameters['gamma'] = ['scale', 'auto']
parameters['class_weight'] = ['dict', 'balanced']

## Building Grid Search algorithm with cross-validation and acc score.

# grid_search_svc = GridSearchCV(grid_svc, parameters, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_svc = GridSearchCV(grid_svc, parameters, scoring='accuracy', cv=5)

grid_search_svc.fit(X_train,y_train)
best_parameters_svc = grid_search_svc.best_params_
best_score_svc = grid_search_svc.best_score_
print(best_parameters_svc)
print(best_score_svc)

y_pred = grid_search_svc.predict(X_test)

# Get the accuracy score
svc_acc=accuracy_score(y_test, y_pred)*100
svc_pre=precision_score(y_test, y_pred, average='micro')
svc_recall=recall_score(y_test, y_pred, average='micro')
svc_f1_=f1_score(y_test, y_pred, average='micro')

print("\nSVM - Accuracy: {:.3f}.".format(svc_acc))
print("SVM - Precision: {:.3f}.".format(svc_pre))
print("SVM - Recall: {:.3f}.".format(svc_recall))
print("SVM - F1_Score: {:.3f}.".format(svc_f1_))
print("\nClassification Report")
print(classification_report(y_test, y_pred))

### RandomSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from numpy import arange
rand_svc= SVC()

parameters = dict()
parameters['kernel'] = ['rbf', 'poly', 'linear', 'sigmoid']
parameters['C'] = arange(1, 10, 1)
parameters['gamma'] = ['scale', 'auto']
parameters['class_weight'] = ['dict', 'balanced']

## Building Grid Search algorithm with cross-validation and acc score.

rand_search_svc = RandomizedSearchCV(rand_svc, parameters, scoring='accuracy', cv=5)

rand_search_svc.fit(X_train,y_train)
best_parameters_svc = rand_search_svc.best_params_
best_score_svc = rand_search_svc.best_score_
print(best_parameters_svc)
print(best_score_svc)

y_pred = rand_search_svc.predict(X_test)

# Get the accuracy score
svc_acc=accuracy_score(y_test, y_pred)*100
svc_pre=precision_score(y_test, y_pred, average='micro')
svc_recall=recall_score(y_test, y_pred, average='micro')
svc_f1_=f1_score(y_test, y_pred, average='micro')

print("\nSVM - Accuracy: {:.3f}.".format(svc_acc))
print("SVM - Precision: {:.3f}.".format(svc_pre))
print("SVM - Recall: {:.3f}.".format(svc_recall))
print("SVM - F1_Score: {:.3f}.".format(svc_f1_))
print("\nClassification Report")
print(classification_report(y_test, y_pred))

In [24]:
# From another book.
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import warnings
import models.uci_heart_disease_dataset as uci
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the preprocessed dataset (299 records; with 14 features).
data = pd.read_csv(uci.UCIHeartDiseaseDataFile.cleveland_preprocessed, names = uci.get_standard_features())
print(f'Data shape: {data.shape}.')
data.head(3)

X = data.drop(uci.UCIHeartDiseaseData.target, axis=1)
y = data[uci.UCIHeartDiseaseData.target]

# X_train, X_test, y_train, y_test = scale_and_split(X, y)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Do the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)

svc= SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc=accuracy_score(y_test, y_pred)*100
print("SVM - Accuracy: {:.3f}.".format(acc))
print("\nClassification Report")
print(classification_report(y_test, y_pred))
print(svc.get_params())

Data shape: (299, 14).
SVM - Accuracy: 85.000.

Classification Report
              precision    recall  f1-score   support

           0       0.78      0.97      0.86        29
           1       0.96      0.74      0.84        31

    accuracy                           0.85        60
   macro avg       0.87      0.85      0.85        60
weighted avg       0.87      0.85      0.85        60

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify=y, random_state=0)

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from numpy import arange

# Load the preprocessed dataset (299 records; with 14 features).
data2 = pd.read_csv(uci.UCIHeartDiseaseDataFile.cleveland_preprocessed, names = uci.get_standard_features())
print(f'Data shape: {data.shape}.')
# data.head(3)

X = data.drop(uci.UCIHeartDiseaseData.target, axis=1)
y = data[uci.UCIHeartDiseaseData.target]

# X_train, X_test, y_train, y_test = scale_and_split(X, y)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Do the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)


grid_svc= SVC()

parameters = dict()
parameters['kernel'] = ['rbf', 'poly', 'linear', 'sigmoid']
parameters['C'] = arange(1, 10, 1)
parameters['gamma'] = ['scale', 'auto']
parameters['class_weight'] = ['dict', 'balanced']

## Building Grid Search algorithm with cross-validation and acc score.

# grid_search_svc = GridSearchCV(grid_svc, parameters, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_svc = GridSearchCV(grid_svc, parameters, scoring='accuracy', cv=5)

grid_search_svc.fit(X_train,y_train)
best_parameters_svc = grid_search_svc.best_params_
best_score_svc = grid_search_svc.best_score_
print(best_parameters_svc)
print(best_score_svc)

y_pred = grid_search_svc.predict(X_test)

# Get the accuracy score
svc_acc=accuracy_score(y_test, y_pred)*100
svc_pre=precision_score(y_test, y_pred, average='micro')
svc_recall=recall_score(y_test, y_pred, average='micro')
svc_f1_=f1_score(y_test, y_pred, average='micro')

print("\nSVM - Accuracy: {:.3f}.".format(svc_acc))
print("SVM - Precision: {:.3f}.".format(svc_pre))
print("SVM - Recall: {:.3f}.".format(svc_recall))
print("SVM - F1_Score: {:.3f}.".format(svc_f1_))
print("\nClassification Report")
print(classification_report(y_test, y_pred))
print(grid_search_svc.get_params())

Data shape: (299, 14).
{'C': 1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'sigmoid'}
0.8284574468085106

SVM - Accuracy: 85.000.
SVM - Precision: 0.850.
SVM - Recall: 0.850.
SVM - F1_Score: 0.850.

Classification Report
              precision    recall  f1-score   support

           0       0.79      0.93      0.86        29
           1       0.92      0.77      0.84        31

    accuracy                           0.85        60
   macro avg       0.86      0.85      0.85        60
weighted avg       0.86      0.85      0.85        60

{'cv': 5, 'error_score': nan, 'estimator__C': 1.0, 'estimator__break_ties': False, 'estimator__cache_size': 200, 'estimator__class_weight': None, 'estimator__coef0': 0.0, 'estimator__decision_function_shape': 'ovr', 'estimator__degree': 3, 'estimator__gamma': 'scale', 'estimator__kernel': 'rbf', 'estimator__max_iter': -1, 'estimator__probability': False, 'estimator__random_state': None, 'estimator__shrinking': True, 'estimator__tol': 0.