In [2]:
#Importing necessary python libraries
import sqlite3
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, fbeta_score
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [3]:
#Read in the cleaned Coronary Artery Disease (CAD) dataset.
file_path = '../Data/Cleaned Data/complete_cleaned_cad.csv'
cad_df = pd.read_csv(file_path)
cad_df.head()

Unnamed: 0,Age,Gender,BMI,Race,Pulse,Arrhythmia,Glu,SBP,DBP,FH_MI,DTS,Smoker,Non_HDL,DM,MI,CAD
0,69,1,26.7,4,86.0,0,554.0,113.0,74.0,0,600.0,0,102.0,0,0,0
1,54,1,28.6,3,74.0,0,219.0,157.0,61.0,0,540.0,1,120.0,0,0,0
2,72,1,28.9,3,68.0,0,183.0,142.0,82.0,1,300.0,0,66.0,0,0,0
3,73,0,19.7,3,92.0,0,104.0,137.0,87.0,1,480.0,0,116.0,0,0,0
4,56,1,41.7,1,60.0,0,104.0,157.0,82.0,0,360.0,0,188.0,0,1,1


In [4]:
#Drop MI (Miocardial Infarction (Heart Attack)) Column
cad_df.drop(columns=(['MI']), inplace=True)
cad_df2 = cad_df.drop(columns=(['FH_MI', 'Arrhythmia']))
cad_df.head()

Unnamed: 0,Age,Gender,BMI,Race,Pulse,Arrhythmia,Glu,SBP,DBP,FH_MI,DTS,Smoker,Non_HDL,DM,CAD
0,69,1,26.7,4,86.0,0,554.0,113.0,74.0,0,600.0,0,102.0,0,0
1,54,1,28.6,3,74.0,0,219.0,157.0,61.0,0,540.0,1,120.0,0,0
2,72,1,28.9,3,68.0,0,183.0,142.0,82.0,1,300.0,0,66.0,0,0
3,73,0,19.7,3,92.0,0,104.0,137.0,87.0,1,480.0,0,116.0,0,0
4,56,1,41.7,1,60.0,0,104.0,157.0,82.0,0,360.0,0,188.0,0,1


In [5]:
#Encoding race categories to binary categories
cad_df = pd.get_dummies(cad_df, columns=(['Race']))
cad_df.head()

Unnamed: 0,Age,Gender,BMI,Pulse,Arrhythmia,Glu,SBP,DBP,FH_MI,DTS,Smoker,Non_HDL,DM,CAD,Race_1,Race_2,Race_3,Race_4,Race_5
0,69,1,26.7,86.0,0,554.0,113.0,74.0,0,600.0,0,102.0,0,0,0,0,0,1,0
1,54,1,28.6,74.0,0,219.0,157.0,61.0,0,540.0,1,120.0,0,0,0,0,1,0,0
2,72,1,28.9,68.0,0,183.0,142.0,82.0,1,300.0,0,66.0,0,0,0,0,1,0,0
3,73,0,19.7,92.0,0,104.0,137.0,87.0,1,480.0,0,116.0,0,0,0,0,1,0,0
4,56,1,41.7,60.0,0,104.0,157.0,82.0,0,360.0,0,188.0,0,1,1,0,0,0,0


In [6]:
# Define Target Variable
y = cad_df.CAD

# Drop Target Variable From Data
X = cad_df.drop(columns=('CAD'))
X.head()

Unnamed: 0,Age,Gender,BMI,Pulse,Arrhythmia,Glu,SBP,DBP,FH_MI,DTS,Smoker,Non_HDL,DM,Race_1,Race_2,Race_3,Race_4,Race_5
0,69,1,26.7,86.0,0,554.0,113.0,74.0,0,600.0,0,102.0,0,0,0,0,1,0
1,54,1,28.6,74.0,0,219.0,157.0,61.0,0,540.0,1,120.0,0,0,0,1,0,0
2,72,1,28.9,68.0,0,183.0,142.0,82.0,1,300.0,0,66.0,0,0,0,1,0,0
3,73,0,19.7,92.0,0,104.0,137.0,87.0,1,480.0,0,116.0,0,0,0,1,0,0
4,56,1,41.7,60.0,0,104.0,157.0,82.0,0,360.0,0,188.0,0,1,0,0,0,0


In [7]:
#Split Data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

smote = SMOTE()
X_train_smote, y_train_smote,  = smote.fit_resample(X_train, y_train)

print(f'Minority Class before smote: {y_train.value_counts().iloc[1]}')
print(f'Majority Class before smote: {y_train.value_counts().iloc[0]}')
print(f'Minority Class after smote: {y_train_smote.value_counts().iloc[1]}')
print(f'Majority Class after smote: {y_train_smote.value_counts().iloc[0]}')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
xgb_clf = XGBClassifier()

In [None]:
def clf(classifier):
    model = classifier
    model.fit(X_train_smote, y_train_smote)
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train_smote)
    acc_train = accuracy_score(y_train_smote, y_train_pred)*100
    acc_test = accuracy_score(y_test, y_test_pred)*100
    f2_train = fbeta_score(y_train_smote, y_train_pred, beta=2.0)*100
    f2_test = fbeta_score(y_test, y_test_pred, beta=2.0)*100
    cm = confusion_matrix(y_test, y_test_pred)
    print(f'''Accuracy of Predictions on Training Data: {round(acc_train, 2)}
Accuracy of Predictions on Test Data: {round(acc_test, 2)}
                                 ''')
    print(f'''F2 Score for Training Data: {round(f2_train, 2)}
F2 Score for Test Data: {round(f2_test, 2)}
                    ''')
    print('Confusion Matrix Numbers for Test Data')
    print(' ')
#     print(f'''True Negatives: {cm[0][0]}
# False Negatives: {cm[1][0]}
# False Positives: {cm[0][1]}
# True Positives: {cm[1][1]}
#  ''')
    cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    cm_disp.plot()
    plt.title(f'Confusion Matrix')
    plt.show()

In [None]:
clf(dt_clf)

In [None]:
def clf_compare(clf_dict):
    classifiers = clf_dict
    for i in range(len(clf_dict)):
        print('---------------------------')
        print(list(clf_dict.keys())[i])
        print('---------------------------')
        clf(list(clf_dict.values())[i])

In [None]:
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
xgb_clf = XGBClassifier()
clf_dict = {'Decision Tree Classifier': dt_clf, 'Random Forest Classifier': rf_clf, 'XGBoost Classifier': xgb_clf}
clf_compare(clf_dict)

In [None]:
def ftr_imp(model):
    n_features = X_train.shape[1]
    plt.figure(figsize=(8,8))
    plt.barh(range(n_features), model.feature_importances_, align='center') 
    plt.yticks(np.arange(n_features), X_train.columns.values) 
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    plt.show()

In [None]:
ftr_imp(dt_clf)

In [None]:
def ftr_imp_comp(clf_dict):
    for i in range(len(clf_dict)):
        print(f'                          {list(clf_dict.keys())[i]}')
        ftr_imp(list(clf_dict.values())[i])
        print('''          ----------------------------------------------------------
        ''');

In [None]:
ftr_imp_comp(clf_dict)

In [None]:
# Define Target Variable
y = cad_df2.CAD

# Drop Target Variable From Data
X = cad_df2.drop(columns=('CAD'))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

smote = SMOTE()
X_train_smote, y_train_smote,  = smote.fit_resample(X_train, y_train)

In [1]:
clf_compare(clf_dict)

NameError: name 'clf_compare' is not defined