In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [2]:
df = pd.read_csv('dataset.csv').drop(columns= ['Unnamed: 0', 'SEQN'])
print(df.shape)
print(df.columns)

(560, 13)
Index(['angina', 'DPQ020', 'DPQ090', 'OHQ850', 'OHQ835', 'OHQ620', 'SMQ020',
       'WHD020', 'PAQ650', 'BPQ020', 'RIAGENDR', 'RIDAGEYR', 'DBQ700'],
      dtype='object')


## Split dataset into X, y (and convert to NumPy Ndarray)

In [3]:
"""
Split dataset into X, y
Converted to NumPy Ndarray
"""
X = df.iloc[:, 1:]
# X = X.drop(columns = ['OHQ850', 'OHQ835'])
print(X.shape)
print(X.columns)
X = X.to_numpy()
y = df['angina'].to_numpy()
print(y.shape)


(560, 12)
Index(['DPQ020', 'DPQ090', 'OHQ850', 'OHQ835', 'OHQ620', 'SMQ020', 'WHD020',
       'PAQ650', 'BPQ020', 'RIAGENDR', 'RIDAGEYR', 'DBQ700'],
      dtype='object')
(560,)


## Split total dataset dataset into 80:20 shuffled split (train/test)

In [9]:
"""
Split total dataset into 80:20 split (train/test)
Shuffled
"""
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.2, random_state=59, shuffle=True, stratify=None)
# print(X_train)
# print(X_test)
# print(y_train)
# print(y_test)
print(X_train_validation.shape)
print(X_test.shape)
print(y_train_validation.shape)
print(y_test.shape)

(448, 12)
(112, 12)
(448,)
(112,)


## Hyperparameter Tuning (k-fold validation)

In [10]:
"""
Used for hyperparameter tuning:
Take training data and split into 75:25 splits for k-fold (use sklearn k-fold here) (train/ validation)
4-5 folds 
Will use logistic regression in this example of code
"""
N_MODELS = 100
alphas = np.logspace(-3, 6, N_MODELS)

n_splits = 4
kf = KFold(n_splits=n_splits)

accuracy_scores = np.zeros((N_MODELS,))
f1_scores = np.zeros((N_MODELS,))
ROC_scores = np.zeros((N_MODELS,))
 
for i, alpha in enumerate(alphas):
  average_accuracy = 0
  average_f1_score = 0
  average_roc_score = 0
  
  # run k_fold validation and sum performance metrics
  for train_index, test_index in kf.split(X_train_validation):
    """
      # print("TRAIN:", len(train_index), "TEST:", len(test_index))
      TRAIN: 336 TEST: 113
      TRAIN: 337 TEST: 112
      TRAIN: 337 TEST: 112
      TRAIN: 337 TEST: 112
    """
    X_train, X_validation = X_train_validation[train_index], X_train_validation[test_index]
    y_train, y_validation = y_train_validation[train_index], y_train_validation[test_index]
    clf = LogisticRegression(C=alpha, max_iter=1000000).fit(X_train, y_train)
    y_predictions = clf.predict(X_validation)
    average_accuracy = average_accuracy + accuracy_score(y_validation, y_predictions)
    average_f1_score = average_f1_score + f1_score(y_validation, y_predictions)
    average_roc_score = average_roc_score + roc_auc_score(y_validation, y_predictions)
    
  # divide performance metrics by n_splits to get averages
  accuracy_scores[i] = average_accuracy / n_splits
  f1_scores[i] = average_f1_score / n_splits
  ROC_scores[i] = average_roc_score / n_splits

# print(accuracy_scores)
# print(f1_scores)
# print(ROC_scores)
    
    


## Evalute best hyperparameter (Needs to be changed)

In [11]:
"""
Evalute best hyperparameter
"""
alpha_with_max_accuracy = alphas[np.where(accuracy_scores == max(accuracy_scores))]
alpha_with_max_f1_score = alphas[np.where(f1_scores == max(f1_scores))]
alpha_with_max_ROC_score = alphas[np.where(ROC_scores == max(ROC_scores))]
print(max(accuracy_scores), max(f1_scores), max(ROC_scores))
print(alpha_with_max_accuracy, alpha_with_max_f1_score, alpha_with_max_ROC_score)

0.5379464285714286 0.48908827658827664 0.5361160231849886
[3.51119173] [0.15199111] [3.51119173]


## Evalute model on test set using selected hyperparameter


In [13]:
"""
Evalute model on test set using selected hyperparameter
"""
alpha = 3.51119173
clf = LogisticRegression(C=alpha, max_iter=1000000).fit(X_train_validation, y_train_validation)
y_predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predictions)
f1 = f1_score(y_test, y_predictions)
roc_score = roc_auc_score(y_test, y_predictions)
print(accuracy, f1, roc_score)

0.5178571428571429 0.509090909090909 0.5191570881226054
