In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('dataset.csv').drop(columns= ['Unnamed: 0', 'SEQN'])
print(df.shape)
print(df.columns)

(560, 13)
Index(['angina', 'DPQ020', 'DPQ090', 'OHQ850', 'OHQ835', 'OHQ620', 'SMQ020',
       'WHD020', 'PAQ650', 'BPQ020', 'RIAGENDR', 'RIDAGEYR', 'DBQ700'],
      dtype='object')


## Data Preprocessing

In [3]:
# first we'll one hot encode categorical variables
dummy_depr = pd.get_dummies(df.DPQ020.astype(int), prefix='depr')
dummy_death = pd.get_dummies(df.DPQ090.astype(int), prefix = "death")
dummy_diet = pd.get_dummies(df.DBQ700, prefix = "diet")
# print(dummy_death)

df.drop(columns = ['DPQ020', 'DPQ090', 'DBQ700'])

diet_columns = ['diet_1.0', 'diet_2.0', 'diet_3.0', 'diet_4.0', 'diet_5.0' ]
depr_columns = ['depr_0', 'depr_1', 'depr_2', 'depr_3']
death_columns = ['death_0', 'death_1', 'death_2', 'death_3']

for c in diet_columns:
    df[c] = dummy_diet[c]
for c in depr_columns:
    df[c] = dummy_depr[c]
for c in death_columns:
    df[c] = dummy_death[c]
    
print(df.shape)

#standardize the continuous columns
age_m, age_sd = df['RIDAGEYR'].mean(), df['RIDAGEYR'].std()
weight_m, weight_sd = df['WHD020'].mean(), df['WHD020'].std()
mouth_m, mouth_sd = df['OHQ620'].mean(), df['OHQ620'].std()
df['WHD020'] = (df['WHD020'] - weight_m)/weight_sd
df['RIDAGEYR'] = (df['RIDAGEYR']-age_m)/age_sd
df['OHQ620'] = (df['OHQ620']-mouth_m)/mouth_sd
print(df['RIDAGEYR'].mean(), df['RIDAGEYR'].std())
print(df['WHD020'].mean(), df['WHD020'].std())
print(df['OHQ620'].mean(), df['OHQ620'].std())

(560, 26)
1.9349601286324157e-16 0.9999999999999999
-1.8080774972466835e-16 1.0
6.026924990822278e-17 1.0


## Split dataset into X, y (and convert to NumPy Ndarray)

In [4]:
"""
Split dataset into X, y
Converted to NumPy Ndarray
"""
X = df.iloc[:, 1:]
# X = X.drop(columns = ['OHQ850', 'OHQ835'])
print(X.shape)
print(X.columns)
X = X.to_numpy()
y = df['angina'].to_numpy()
print(y.shape)


(560, 25)
Index(['DPQ020', 'DPQ090', 'OHQ850', 'OHQ835', 'OHQ620', 'SMQ020', 'WHD020',
       'PAQ650', 'BPQ020', 'RIAGENDR', 'RIDAGEYR', 'DBQ700', 'diet_1.0',
       'diet_2.0', 'diet_3.0', 'diet_4.0', 'diet_5.0', 'depr_0', 'depr_1',
       'depr_2', 'depr_3', 'death_0', 'death_1', 'death_2', 'death_3'],
      dtype='object')
(560,)


## Split total dataset dataset into 80:20 shuffled split (train/test)

In [5]:
"""
Split total dataset into 80:20 split (train/test)
Shuffled
"""
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.2, random_state=59, shuffle=True, stratify=None)
# print(X_train)
# print(X_test)
# print(y_train)
# print(y_test)
print(X_train_validation.shape)
print(X_test.shape)
print(y_train_validation.shape)
print(y_test.shape)

(448, 25)
(112, 25)
(448,)
(112,)


## Hyperparameter Tuning (k-fold validation)

In [16]:
def hyperparam_tune(clf, alphas, testing, n_splits = 4, prnt=False):
    N_MODELS = len(alphas)
    accuracy_scores = np.zeros((N_MODELS,))
    f1_scores = np.zeros((N_MODELS,))
    ROC_scores = np.zeros((N_MODELS,))
    kf = KFold(n_splits=n_splits)
    
    for i, alpha in enumerate(alphas):
        average_accuracy = 0
        average_f1_score = 0
        average_roc_score = 0
        # run k_fold validation and sum performance metrics
        for train_index, test_index in kf.split(X_train_validation):
            X_train, X_validation = X_train_validation[train_index], X_train_validation[test_index]
            y_train, y_validation = y_train_validation[train_index], y_train_validation[test_index]
            if testing == 'logistic':
                clf.C = alpha 
            elif testing == 'ridge':
                clf.alpha = alpha
            clf.fit(X_train, y_train)
            y_predictions = clf.predict(X_validation)
            average_accuracy = average_accuracy + accuracy_score(y_validation, y_predictions)
            average_f1_score = average_f1_score + f1_score(y_validation, y_predictions)
            average_roc_score = average_roc_score + roc_auc_score(y_validation, y_predictions)
          # divide performance metrics by n_splits to get averages
        accuracy_scores[i] = average_accuracy / n_splits
        f1_scores[i] = average_f1_score / n_splits
        ROC_scores[i] = average_roc_score / n_splits
        
        #TODO: Write code to print all results
        
        """
        Evalute best hyperparameter
        """
        alpha_with_max_accuracy = alphas[np.where(accuracy_scores == max(accuracy_scores))]
        alpha_with_max_f1_score = alphas[np.where(f1_scores == max(f1_scores))]
        alpha_with_max_ROC_score = alphas[np.where(ROC_scores == max(ROC_scores))]
        
    return {"max_acc": (alpha_with_max_accuracy[0], max(accuracy_scores)), 
            "max_f1": (alpha_with_max_f1_score[0], max(f1_scores)), 
            "max_roc": (alpha_with_max_ROC_score[0], max(ROC_scores))}
        

In [11]:
N_MODELS = 100
alphas = np.logspace(-3, 6, N_MODELS)
model = LogisticRegression(max_iter=1000000)

print(hyperparam_tune(model, alphas, 'logistic', n_splits = 4, prnt=False))

{'max_acc': (0.1873817422860385, 0.5379464285714286), 'max_f1': (0.23101297000831605, 0.5069529013859295), 'max_roc': (0.23101297000831605, 0.5376977109735731)}


In [12]:
from sklearn.linear_model import RidgeClassifier
alphas = np.logspace(-3, 6, N_MODELS)
model = RidgeClassifier()

print(hyperparam_tune(model, alphas, 'ridge', n_splits = 4, prnt=False))

{'max_acc': (10.0, 0.5357142857142857), 'max_f1': (15.199110829529332, 0.5080171181170071), 'max_roc': (15.199110829529332, 0.5358630513802927)}


In [15]:
from sklearn.svm import SVC

alphas = np.logspace(-3, 6, N_MODELS)
model = SVC()

print(hyperparam_tune(model, alphas, 'logistic', n_splits = 4, prnt=False))

{'max_acc': (3511.1917342151346, 0.5446428571428571), 'max_f1': (3511.1917342151346, 0.52188740684325), 'max_roc': (3511.1917342151346, 0.5445408438942922)}


In [18]:
from sklearn.tree import DecisionTreeClassifier

alphas = np.logspace(0, 0, 1)
model = SVC()

print(hyperparam_tune(model, alphas, 'tree', n_splits = 4, prnt=False))

{'max_acc': (1.0, 0.4977678571428572), 'max_f1': (1.0, 0.45425571109263496), 'max_roc': (1.0, 0.4996641369486198)}


In [20]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

alphas = np.logspace(0, 0, 1)
model = AdaBoostClassifier()

print(hyperparam_tune(model, alphas, 'tree', n_splits = 4, prnt=False))

{'max_acc': (1.0, 0.5602678571428571), 'max_f1': (1.0, 0.5388935914171494), 'max_roc': (1.0, 0.5599033549464584)}


## TODO: Make a list of classifiers with their parameters we wish to hyper-tune