In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('dataset_grade_2.csv').drop(columns= ['Unnamed: 0', 'SEQN'])
print(df.shape)
print(df.columns)

(560, 13)
Index(['angina', 'DPQ020', 'DPQ090', 'OHQ850', 'OHQ835', 'OHQ620', 'SMQ020',
       'WHD020', 'PAQ650', 'BPQ020', 'RIAGENDR', 'RIDAGEYR', 'DBQ700'],
      dtype='object')


## Data Preprocessing

In [3]:
# first we'll one hot encode categorical variables
dummy_depr = pd.get_dummies(df.DPQ020.astype(int), prefix='depr')
dummy_death = pd.get_dummies(df.DPQ090.astype(int), prefix = "death")
dummy_diet = pd.get_dummies(df.DBQ700, prefix = "diet")
# print(dummy_death)

df.drop(columns = ['DPQ020', 'DPQ090', 'DBQ700'])

diet_columns = ['diet_1.0', 'diet_2.0', 'diet_3.0', 'diet_4.0', 'diet_5.0' ]
depr_columns = ['depr_0', 'depr_1', 'depr_2', 'depr_3']
death_columns = ['death_0', 'death_1', 'death_2', 'death_3']

for c in diet_columns:
    df[c] = dummy_diet[c]
for c in depr_columns:
    df[c] = dummy_depr[c]
for c in death_columns:
    df[c] = dummy_death[c]
    
print(df.shape)

# #standardize the continuous columns
# age_m, age_sd = df['RIDAGEYR'].mean(), df['RIDAGEYR'].std()
# weight_m, weight_sd = df['WHD020'].mean(), df['WHD020'].std()
# mouth_m, mouth_sd = df['OHQ620'].mean(), df['OHQ620'].std()
# df['WHD020'] = (df['WHD020'] - weight_m)/weight_sd
# df['RIDAGEYR'] = (df['RIDAGEYR']-age_m)/age_sd
# df['OHQ620'] = (df['OHQ620']-mouth_m)/mouth_sd
# print(df['RIDAGEYR'].mean(), df['RIDAGEYR'].std())
# print(df['WHD020'].mean(), df['WHD020'].std())
# print(df['OHQ620'].mean(), df['OHQ620'].std())

(560, 26)


## Upsample

In [4]:
# from sklearn.utils import resample
# #create two different dataframe of majority and minority class 
# df_majority = df[(df['angina']==False)] 
# df_minority = df[(df['angina']==True)] 
# # upsample minority class
# df_minority_upsampled = resample(df_minority, 
#                                  replace=True,    # sample with replacement
#                                  n_samples= 392, # to match majority class
#                                  random_state=42)  # reproducible results
# # Combine majority class with upsampled minority class
# df = pd.concat([df_minority_upsampled, df_majority])
# print(df.shape)

In [5]:
# from imblearn.over_sampling import SMOTE
# # Resampling the minority class. The strategy can be changed as required.
# sm = SMOTE(sampling_strategy='minority', random_state=42)
# # Fit the model to generate the data.
# oversampled_X, oversampled_Y = sm.fit_sample(df_train.drop('Is_Lead', axis=1), df_train['Is_Lead'])
# oversampled = pd.concat([pd.DataFrame(oversampled_Y), pd.DataFrame(oversampled_X)], axis=1)

## Split dataset into X, y (and convert to NumPy Ndarray)

In [6]:
"""
Split dataset into X, y
Converted to NumPy Ndarray
"""
X = df.iloc[:, 1:]
X = X.drop(columns = ['OHQ850', 'OHQ835', 'OHQ620'])
print(X.shape)
cols = X.columns
print(cols)
X = X.to_numpy()
y = df['angina'].to_numpy()
print(pd.DataFrame(y).value_counts())


(560, 22)
Index(['DPQ020', 'DPQ090', 'SMQ020', 'WHD020', 'PAQ650', 'BPQ020', 'RIAGENDR',
       'RIDAGEYR', 'DBQ700', 'diet_1.0', 'diet_2.0', 'diet_3.0', 'diet_4.0',
       'diet_5.0', 'depr_0', 'depr_1', 'depr_2', 'depr_3', 'death_0',
       'death_1', 'death_2', 'death_3'],
      dtype='object')
False    430
True     130
dtype: int64


## Split total dataset dataset into 80:20 shuffled split (train/test)

In [7]:
"""
Split total dataset into 80:20 split (train/test)
Shuffled
"""
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.20, random_state=59, shuffle=True, stratify=y)
# print(X_train)
# print(X_test)
# print(y_train)
# print(y_test)
print(X_train_validation.shape)
print(X_test.shape)
print(y_train_validation.shape)
print(y_test.shape)

(448, 22)
(112, 22)
(448,)
(112,)


## Upsample use SMOTE

In [8]:
from imblearn.over_sampling import SMOTE
# Resampling the minority class. The strategy can be changed as required.
# sm = SMOTE(sampling_strategy='minority', random_state=42)
# # Fit the model to generate the data.
# oversampled_X, oversampled_Y = sm.fit_resample(X_train_validation, y_train_validation)
# X_train_validation = oversampled_X
# y_train_validation = oversampled_Y
# print(pd.DataFrame(y_train_validation).value_counts())

## Hyperparameter Tuning (k-fold validation)

In [10]:
def hyperparam_tune(clf, alphas, testing, n_splits = 4, prnt=False, cols = cols):
    N_MODELS = len(alphas)
    accuracy_scores = np.zeros((N_MODELS,))
    f1_scores = np.zeros((N_MODELS,))
    ROC_scores = np.zeros((N_MODELS,))
    kf = StratifiedKFold(n_splits=n_splits)
    
    for i, alpha in enumerate(alphas):
        average_accuracy = 0
        average_f1_score = 0
        average_roc_score = 0
        # run k_fold validation and sum performance metrics
        for train_index, test_index in kf.split(X_train_validation, y_train_validation):
            X_train, X_validation = X_train_validation[train_index], X_train_validation[test_index]
            y_train, y_validation = y_train_validation[train_index], y_train_validation[test_index]
            
            sm = SMOTE(sampling_strategy='minority', random_state=42)
            # Fit the model to generate the data.
            oversampled_X, oversampled_Y = sm.fit_resample(X_train, y_train)
            X_train = pd.DataFrame(oversampled_X)
            X_train.columns = cols
            y_train = oversampled_Y
#             print(pd.DataFrame(y_train_validation).value_counts())

            #standardize the continuous columns
            age_m, age_sd = X_train['RIDAGEYR'].mean(), X_train['RIDAGEYR'].std()
            weight_m, weight_sd = X_train['WHD020'].mean(), X_train['WHD020'].std()
#             mouth_m, mouth_sd = X_train['OHQ620'].mean(), X_train['OHQ620'].std()
            X_train['WHD020'] = (X_train['WHD020'] - weight_m)/weight_sd
            X_train['RIDAGEYR'] = (X_train['RIDAGEYR']-age_m)/age_sd
#             X_train['OHQ620'] = (X_train['OHQ620']-mouth_m)/mouth_sd
#             print(X_train['RIDAGEYR'].mean(), X_train['RIDAGEYR'].std())
#             print(X_train['WHD020'].mean(), X_train['WHD020'].std())
#             print(X_train['OHQ620'].mean(), X_train['OHQ620'].std())
            X_train = X_train.to_numpy()
    
    
            if testing == 'logistic':
                clf.C = alpha 
            elif testing == 'ridge':
                clf.alpha = alpha
            clf.fit(X_train, y_train)
            y_predictions = clf.predict(X_validation)
            average_accuracy = average_accuracy + accuracy_score(y_validation, y_predictions)
            average_f1_score = average_f1_score + f1_score(y_validation, y_predictions)
            average_roc_score = average_roc_score + roc_auc_score(y_validation, y_predictions)
          # divide performance metrics by n_splits to get averages
        accuracy_scores[i] = average_accuracy / n_splits
        f1_scores[i] = average_f1_score / n_splits
        ROC_scores[i] = average_roc_score / n_splits
        
        #TODO: Write code to print all results
        
    """
    Evalute best hyperparameter
    """
    alpha_with_max_accuracy = alphas[np.where(accuracy_scores == max(accuracy_scores))]
    alpha_with_max_f1_score = alphas[np.where(f1_scores == max(f1_scores))]
    alpha_with_max_ROC_score = alphas[np.where(ROC_scores == max(ROC_scores))]
    
        
    return {"max_acc": (alpha_with_max_accuracy[0], max(accuracy_scores)), 
            "max_f1": (alpha_with_max_f1_score[0], max(f1_scores)), 
            "max_roc": (alpha_with_max_ROC_score[0], max(ROC_scores))}
        

In [11]:
N_MODELS = 100
alphas = np.logspace(-3, 6, N_MODELS)
model = LogisticRegression(max_iter=1000000)

print(hyperparam_tune(model, alphas, 'logistic', n_splits = 4, prnt=False))

{'max_acc': (0.03511191734215131, 0.6339285714285714), 'max_f1': (0.001, 0.18840579710144928), 'max_roc': (0.01873817422860384, 0.5031305903398926)}


In [12]:
from sklearn.linear_model import RidgeClassifier
alphas = np.logspace(-3, 6, N_MODELS)
model = RidgeClassifier()

print(hyperparam_tune(model, alphas, 'ridge', n_splits = 4, prnt=False))

{'max_acc': (0.001, 0.6339285714285714), 'max_f1': (284.8035868435805, 0.18840579710144928), 'max_roc': (187.3817422860383, 0.506484794275492)}


In [13]:
from sklearn.svm import SVC

alphas = np.logspace(-6, 2, N_MODELS)
model = SVC()

print(hyperparam_tune(model, alphas, 'logistic', n_splits = 4, prnt=False))

{'max_acc': (1e-06, 0.7678571428571429), 'max_f1': (1e-06, 0.0), 'max_roc': (1e-06, 0.5)}


In [14]:
from sklearn.tree import DecisionTreeClassifier

alphas = np.logspace(0, 0, 1)
model = SVC()

print(hyperparam_tune(model, alphas, 'tree', n_splits = 4, prnt=False))

{'max_acc': (1.0, 0.7678571428571429), 'max_f1': (1.0, 0.0), 'max_roc': (1.0, 0.5)}


In [15]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

alphas = np.logspace(0, 0, 1)
model = AdaBoostClassifier()

print(hyperparam_tune(model, alphas, 'tree', n_splits = 4, prnt=False))

{'max_acc': (1.0, 0.7678571428571429), 'max_f1': (1.0, 0.03333333333333334), 'max_roc': (1.0, 0.5067084078711985)}


In [16]:
alphas = np.logspace(0, 0, 1)
model = RandomForestClassifier(max_features='log2', max_depth = 100)

print(hyperparam_tune(model, alphas, 'tree', n_splits = 4, prnt=False))

{'max_acc': (1.0, 0.734375), 'max_f1': (1.0, 0.19121113594797806), 'max_roc': (1.0, 0.5251565295169945)}


## TODO: Make a list of classifiers with their parameters we wish to hyper-tune

In [18]:
# X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.2, random_state=59, shuffle=True, stratify=y)
# Resampling the minority class. The strategy can be changed as required.
sm = SMOTE(sampling_strategy='minority', random_state=42)
# Fit the model to generate the data.
oversampled_X, oversampled_Y = sm.fit_resample(X_train_validation, y_train_validation)
X_train_validation = oversampled_X
X_train_validation = pd.DataFrame(oversampled_X)
X_train_validation.columns = cols
y_train_validation = oversampled_Y
print(pd.DataFrame(y_train_validation).value_counts())
print(X_train_validation.shape)

#standardize the continuous columns
age_m, age_sd = X_train_validation['RIDAGEYR'].mean(), X_train_validation['RIDAGEYR'].std()
weight_m, weight_sd = X_train_validation['WHD020'].mean(), X_train_validation['WHD020'].std()
# mouth_m, mouth_sd = X_train_validation['OHQ620'].mean(), X_train_validation['OHQ620'].std()
X_train_validation['WHD020'] = (X_train_validation['WHD020'] - weight_m)/weight_sd
X_train_validation['RIDAGEYR'] = (X_train_validation['RIDAGEYR']-age_m)/age_sd
# X_train_validation['OHQ620'] = (X_train_validation['OHQ620']-mouth_m)/mouth_sd
#             print(X_train['RIDAGEYR'].mean(), X_train['RIDAGEYR'].std())
#             print(X_train['WHD020'].mean(), X_train['WHD020'].std())
#             print(X_train['OHQ620'].mean(), X_train['OHQ620'].std())
X_train_validation = X_train_validation.to_numpy()

model = RandomForestClassifier(max_features='log2', max_depth = 5)
model.fit(X_train_validation, y_train_validation)
rf_predictions = model.predict(X_test)
rf_predictions2 = model.predict(X_train_validation)
print("TESTING RESULTS")
print(accuracy_score(y_test, rf_predictions))
print(f1_score(y_test, rf_predictions))
print(roc_auc_score(y_test, rf_predictions))

print("TRAINING RESULTS")
print(accuracy_score(y_train_validation, rf_predictions2))
print(f1_score(y_train_validation, rf_predictions2))
print(roc_auc_score(y_train_validation, rf_predictions2))

False    344
True     344
dtype: int64
(688, 22)
TESTING RESULTS
0.6964285714285714
0.15
0.4937388193202147
TRAINING RESULTS
0.8691860465116279
0.8538961038961039
0.8691860465116279


In [21]:
model = LogisticRegression(max_iter=1000000, C=0.01873817422860384)
model.fit(X_train_validation, y_train_validation)
lr_predictions = model.predict(X_test)
lr_predictions2 = model.predict(X_train_validation)
print("TESTING RESULTS")
print(accuracy_score(y_test, lr_predictions))
print(f1_score(y_test, lr_predictions))
print(roc_auc_score(y_test, lr_predictions))

print("TRAINING RESULTS")
print(accuracy_score(y_train_validation, lr_predictions2))
print(f1_score(y_train_validation, lr_predictions2))
print(roc_auc_score(y_train_validation, lr_predictions2))

TESTING RESULTS
0.23214285714285715
0.37681159420289856
0.5
TRAINING RESULTS
0.6017441860465116
0.6085714285714285
0.6017441860465116


In [22]:
model = AdaBoostClassifier()
model.fit(X_train_validation, y_train_validation)
ab_predictions = model.predict(X_test)
ab_predictions2 = model.predict(X_train_validation)
print("TESTING RESULTS")
print(accuracy_score(y_test, ab_predictions))
print(f1_score(y_test, ab_predictions))
print(roc_auc_score(y_test, ab_predictions))

print("TRAINING RESULTS")
print(accuracy_score(y_train_validation, ab_predictions2))
print(f1_score(y_train_validation, ab_predictions2))
print(roc_auc_score(y_train_validation, ab_predictions2))

TESTING RESULTS
0.7410714285714286
0.06451612903225806
0.4959749552772809
TRAINING RESULTS
0.8401162790697675
0.8231511254019293
0.8401162790697674


## Calculate Precision + Recall

In [None]:
def create_truth(row):
    if row['truth'] == 1 and row['pred'] == 1:
        return "TP"
    if row['truth'] == 1 and row['pred'] == 0:
        return "FN"
    if row['truth'] ==0  and row['pred'] == 1:
        return "FP"
    if row['truth'] == 0 and row['pred'] == 0:
        return "TN"
    

def precision_recall(dct):
    if 'FN' not in dct.keys():
        recall = dct['TP'] / (dct['TP']+0)
    else:
        recall = dct['TP'] / (dct['TP'] + dct['FN'])
        
    precision = dct['TP'] / (dct['TP'] + dct['FP'])
    return precision, recall

def create_dict(pred, truth, verbose= False):
    df = pd.DataFrame()
    df['truth'] = truth
    df['pred'] = pred
    df['result'] = df.apply(lambda row: create_truth(row), axis = 1)
    if verbose == True:
        print(df['result'].value_counts())
    return dict(df['result'].value_counts())

rf_dct = create_dict(rf_predictions, y_test)
rf_prec, rf_rec = precision_recall(rf_dct)
print(f"RandomForest precision: {rf_prec}, recall: {rf_rec}")
# print((2*(rf_prec * rf_rec))/(rf_prec + rf_rec))

svm_dct = create_dict(lr_predictions, y_test)
svm_prec, svm_rec = precision_recall(svm_dct)
print(f"LR precision: {svm_prec}, recall: {svm_rec}")

ab_dct = create_dict(ab_predictions, y_test)
ab_prec, ab_rec = precision_recall(ab_dct)
print(f"AdaBoost precision: {ab_prec}, recall: {ab_rec}")


print("\nTraining results:")
rf_dct = create_dict(rf_predictions2, y_train_validation)
rf_prec, rf_rec = precision_recall(rf_dct)
print(f"RandomForest precision: {rf_prec}, recall: {rf_rec}")
# print((2*(rf_prec * rf_rec))/(rf_prec + rf_rec))

svm_dct = create_dict(lr_predictions2,  y_train_validation)
svm_prec, svm_rec = precision_recall(svm_dct)
print(f"LR precision: {svm_prec}, recall: {svm_rec}")

ab_dct = create_dict(ab_predictions2,  y_train_validation)
ab_prec, ab_rec = precision_recall(ab_dct)
print(f"AdaBoost precision: {ab_prec}, recall: {ab_rec}")