In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [2]:
df = pd.read_csv('dataset.csv').drop(columns= ['Unnamed: 0', 'SEQN'])
print(df.shape)
print(df.columns)

(562, 11)
Index(['angina', 'DPQ020', 'OHQ850', 'OHQ835', 'SMQ020', 'WHD020', 'PAQ650',
       'BPQ020', 'RIAGENDR', 'RIDAGEYR', 'DBQ700'],
      dtype='object')


## Split dataset into X, y (and convert to NumPy Ndarray)

In [3]:
"""
Split dataset into X, y
Converted to NumPy Ndarray
"""
X = df.iloc[:, 1:]
X = X.drop(columns = ['OHQ850', 'OHQ835'])
print(X.shape)
print(X.columns)
X = X.to_numpy()
y = df['angina'].to_numpy()
print(y.shape)


(562, 8)
Index(['DPQ020', 'SMQ020', 'WHD020', 'PAQ650', 'BPQ020', 'RIAGENDR',
       'RIDAGEYR', 'DBQ700'],
      dtype='object')
(562,)


## Split total dataset dataset into 80:20 shuffled split (train/test)

In [4]:
"""
Split total dataset into 80:20 split (train/test)
Shuffled
"""
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.2, random_state=59, shuffle=True, stratify=None)
# print(X_train)
# print(X_test)
# print(y_train)
# print(y_test)
print(X_train_validation.shape)
print(X_test.shape)
print(y_train_validation.shape)
print(y_test.shape)

(449, 8)
(113, 8)
(449,)
(113,)


## Hyperparameter Tuning (k-fold validation)

In [5]:
def hyperparam_tune(clf, alphas, n_splits = 4, prnt=False):
    N_MODELS = len(alphas)
    accuracy_scores = np.zeros((N_MODELS,))
    f1_scores = np.zeros((N_MODELS,))
    ROC_scores = np.zeros((N_MODELS,))
    kf = KFold(n_splits=n_splits)
    
    for i, alpha in enumerate(alphas):
        average_accuracy = 0
        average_f1_score = 0
        average_roc_score = 0
        # run k_fold validation and sum performance metrics
        for train_index, test_index in kf.split(X_train_validation):
            X_train, X_validation = X[train_index], X[test_index]
            y_train, y_validation = y[train_index], y[test_index]
            clf.C = alpha 
            clf.fit(X_train, y_train)
            y_predictions = clf.predict(X_validation)
            average_accuracy = average_accuracy + accuracy_score(y_validation, y_predictions)
            average_f1_score = average_f1_score + f1_score(y_validation, y_predictions)
            average_roc_score = average_roc_score + roc_auc_score(y_validation, y_predictions)
          # divide performance metrics by n_splits to get averages
        accuracy_scores[i] = average_accuracy / n_splits
        f1_scores[i] = average_f1_score / n_splits
        ROC_scores[i] = average_roc_score / n_splits
        
        #TODO: Write code to print all results
        
        """
        Evalute best hyperparameter
        """
        alpha_with_max_accuracy = alphas[np.where(accuracy_scores == max(accuracy_scores))]
        alpha_with_max_f1_score = alphas[np.where(f1_scores == max(f1_scores))]
        alpha_with_max_ROC_score = alphas[np.where(ROC_scores == max(ROC_scores))]
        
    return {"max_acc": (alpha_with_max_accuracy[0], max(accuracy_scores)), 
            "max_f1": (alpha_with_max_f1_score[0], max(f1_scores)), 
            "max_roc": (alpha_with_max_ROC_score[0], max(ROC_scores))}
        

In [6]:
N_MODELS = 100
alphas = np.logspace(-3, 6, N_MODELS)
model = LogisticRegression(max_iter=1000000)

print(hyperparam_tune(model, alphas, n_splits = 4, prnt=False))

{'max_acc': (0.001873817422860383, 0.5590826485461442), 'max_f1': (0.001873817422860383, 0.6257470261615841), 'max_roc': (0.001873817422860383, 0.5627155763539882)}


## TODO: Make a list of classifiers with their parameters we wish to hyper-tune