## Explore the results of the 24 month data model hyperparameter tuning on Max Categorical UPDRS

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

In [112]:
# import the data
updrs_df = pd.read_csv("~/parkinsons_proj_1/parkinsons_project/parkinsons_1/data/processed/train_24month_protein_data.csv")

In [113]:
# import the hyperparameter tuning results
lgb_hyperparams = pd.read_csv('~/parkinsons_proj_1/parkinsons_project/parkinsons_1/data/processed/lgboost_future_cat_24m_hyperparam_results.csv', index_col=0)
xgb_hyperparams = pd.read_csv('~/parkinsons_proj_1/parkinsons_project/parkinsons_1/data/processed/xgboost_future_cat_24m_hyperparam_results.csv', index_col=0)

#### boosting: "gbdt", "dart", "rf"
#### is_unbalance: False, True
#### tree_learner: "serial", "feature", "data", "voting"

In [101]:
# convert the categoricals parameter to correct names by mapping the values
lgb_hyperparams.loc['tree_learner'] = lgb_hyperparams.loc['tree_learner'].map({0: 'serial', 1: 'feature', 2: 'data', 3: 'voting'})
lgb_hyperparams.loc['boosting'] = lgb_hyperparams.loc['boosting'].map({0: 'gbdt', 1: 'dart', 2: 'rf'})
lgb_hyperparams.loc['is_unbalance'] = lgb_hyperparams.loc['is_unbalance'].map({0: "true", 1: "false"})

# convert floats to ints
lgb_hyperparams.loc['max_depth'] = lgb_hyperparams.loc['max_depth'].astype(int)
lgb_hyperparams.loc['max_delta_step'] = lgb_hyperparams.loc['max_delta_step'].astype(int)
lgb_hyperparams.loc['min_data_in_leaf'] = lgb_hyperparams.loc['min_data_in_leaf'].astype(int)
lgb_hyperparams.loc['bagging_freq'] = lgb_hyperparams.loc['bagging_freq'].astype(int)

In [102]:

lgb_hyperparams

Unnamed: 0,updrs_1,updrs_2,updrs_3
bagging_fraction,0.397911,0.58424,0.397045
bagging_freq,6,9,7
boosting,gbdt,rf,gbdt
feature_fraction,0.862267,0.84325,0.865509
is_unbalance,true,true,true
lambda_l1,4.658813,6.55777,2.908397
lambda_l2,2.440111,4.311875,3.075566
learning_rate,0.509808,0.132102,0.010848
max_delta_step,5,5,4
max_depth,18,1,13


In [103]:
auc_results = {}

for updrs in ['updrs_1', 'updrs_2', 'updrs_3']:
    updrs_lgb = lgb_hyperparams[updrs]
    
    # create a dictionary of the hyperparameters with the index as the key and the hyperparameters as the value
    updrs_lgb_dict = updrs_lgb.to_dict()
    
    # create the LGBMClassifier with the hyperparameters
    model = LGBMClassifier(**updrs_lgb_dict, n_estimators=500, metric='auc')
    # get only the updrs of interest data
    if updrs == 'updrs_1':
        model_updrs_df = updrs_df.drop(columns=['updrs_2_max', 'updrs_3_max'])
    if updrs == 'updrs_2':
        model_updrs_df = updrs_df.drop(columns=['updrs_1_max', 'updrs_3_max'])
    if updrs == 'updrs_3':
        model_updrs_df = updrs_df.drop(columns=['updrs_1_max', 'updrs_2_max'])
    X = model_updrs_df.drop(columns=['patient_id', f'{updrs}_max']).values
    y = model_updrs_df[f'{updrs}_max'].values
    
    # perform the stratified k-fold cross validation
    kf = model_selection.StratifiedKFold(n_splits=5)
    auc = []
    for idx in kf.split(X=X, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = X[train_idx]
        ytrain = y[train_idx]
        xtest = X[test_idx]
        ytest = y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_auc = roc_auc_score(ytest, preds)
        auc.append(fold_auc)
    print(f'{updrs} LGBMClassifier AUC: {np.mean(auc)}')
    auc_results[updrs] = np.mean(auc)
    

updrs_1 LGBMClassifier AUC: 0.5711190476190476
updrs_2 LGBMClassifier AUC: 0.618269144648455
updrs_3 LGBMClassifier AUC: 0.5416923076923077


In [104]:
auc_results

{'updrs_1': 0.5711190476190476,
 'updrs_2': 0.618269144648455,
 'updrs_3': 0.5416923076923077}

## Look at the XGBoost Results

In [118]:
xgb_hyperparams

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.750063,0.304094,0.560559
gamma,0.013479,5.4e-05,0.013055
learning_rate,0.001338,0.874748,0.002278
max_delta_step,4.0,8.0,5.0
max_depth,15.0,19.0,1.0
min_child_weight,10.142062,9.599838,2.363675
reg_alpha,3.523685,7.38495,7.481439
reg_lambda,6.62519,1.041579,9.368603
scale_pos_weight,2.048692,2.372691,2.095664
subsample,0.521727,0.487426,0.646992


In [120]:
auc_results = {}

for updrs in ['updrs_1', 'updrs_2', 'updrs_3']:
    updrs_xgb = xgb_hyperparams[updrs]
    
    # create a dictionary of the hyperparameters with the index as the key and the hyperparameters as the value
    updrs_xgb_dict = updrs_xgb.to_dict()
    updrs_xgb_dict['max_depth'] = int(updrs_xgb_dict['max_depth'])
    updrs_xgb_dict['max_delta_step'] = int(updrs_xgb_dict['max_delta_step'])
    
    # create the XGBClassifier with the hyperparameters
    model = XGBClassifier(**updrs_xgb_dict, n_estimators=500, eval_metric='auc')
    # get only the updrs of interest data
    if updrs == 'updrs_1':
        model_updrs_df = updrs_df.drop(columns=['updrs_2_max', 'updrs_3_max'])
    if updrs == 'updrs_2':
        model_updrs_df = updrs_df.drop(columns=['updrs_1_max', 'updrs_3_max'])
    if updrs == 'updrs_3':
        model_updrs_df = updrs_df.drop(columns=['updrs_1_max', 'updrs_2_max'])
    X = model_updrs_df.drop(columns=['patient_id', f'{updrs}_max']).values
    y = model_updrs_df[f'{updrs}_max'].values
    
    # perform the stratified k-fold cross validation
    kf = model_selection.StratifiedKFold(n_splits=5)
    auc = []
    for idx in kf.split(X=X, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = X[train_idx]
        ytrain = y[train_idx]
        xtest = X[test_idx]
        ytest = y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_auc = roc_auc_score(ytest, preds)
        auc.append(fold_auc)
    print(f'{updrs} XGBClassifier AUC: {np.mean(auc)}')
    auc_results[updrs] = np.mean(auc)



updrs_1 XGBClassifier AUC: 0.5600714285714286




updrs_2 XGBClassifier AUC: 0.4884639498432602




updrs_3 XGBClassifier AUC: 0.5564615384615383


In [121]:
auc_results

{'updrs_1': 0.5600714285714286,
 'updrs_2': 0.4884639498432602,
 'updrs_3': 0.5564615384615383}