## Explore the results of the 12 month data model hyperparameter tuning on Max Categorical UPDRS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

In [2]:
def convert_df_to_1yr(df, updrs):
    # get the max category for each patient
    max_df = df.groupby(['patient_id'])[f'{updrs}_cat'].max().reset_index()
    max_df = max_df.rename(columns={f'{updrs}_cat': f'{updrs}_max_cat'})
    # merge the max category with the original dataframe
    updrs_df = df.merge(max_df, on=['patient_id'], how='left')
    # take only the visit months that are 12 or less
    updrs_yr_df = updrs_df[updrs_df['visit_month'] <= 12]
    updrs_yr_df = updrs_yr_df.drop(columns=[f'{updrs}_cat'])
    updrs_yr_df.rename(columns={f'{updrs}_max_cat': f'{updrs}_cat'}, inplace=True)
    
    return updrs_yr_df

In [3]:
# read the training data
# read in the protein and updrs data
updrs1_df = pd.read_csv('../data/processed/train_updrs_1_cat.csv')
updrs2_df = pd.read_csv('../data/processed/train_updrs_2_cat.csv')
updrs3_df = pd.read_csv('../data/processed/train_updrs_3_cat.csv')

# replace the categorical updrs scores with numerical for mild, moderate and severe
## combine the moderate and severe categories since there are very few severe observations
updrs1_df['updrs_1_cat'] = updrs1_df['updrs_1_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})
updrs2_df['updrs_2_cat'] = updrs2_df['updrs_2_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})
updrs3_df['updrs_3_cat'] = updrs3_df['updrs_3_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})

updrs1_df = convert_df_to_1yr(updrs1_df, 'updrs_1')
updrs2_df = convert_df_to_1yr(updrs2_df, 'updrs_2')
updrs3_df = convert_df_to_1yr(updrs3_df, 'updrs_3')

In [5]:
# import the hyperparameter tuning results
lgb_hyperparams = pd.read_csv('../data/processed/lgboost_future_cat_12m_hyperparam_results.csv', index_col=0)
xgb_hyperparams = pd.read_csv('../data/processed/xgboost_future_cat_12m_hyperparam_results.csv', index_col=0)

#### boosting: "gbdt", "dart", "rf"
#### is_unbalance: False, True
#### tree_learner: "serial", "feature", "data", "voting"

In [6]:
# convert the categoricals parameter to correct names by mapping the values
lgb_hyperparams.loc['tree_learner'] = lgb_hyperparams.loc['tree_learner'].map({0: 'serial', 1: 'feature', 2: 'data', 3: 'voting'})
lgb_hyperparams.loc['boosting'] = lgb_hyperparams.loc['boosting'].map({0: 'gbdt', 1: 'dart', 2: 'rf'})
lgb_hyperparams.loc['is_unbalance'] = lgb_hyperparams.loc['is_unbalance'].map({0: "true", 1: "false"})

# convert floats to ints
lgb_hyperparams.loc['max_depth'] = lgb_hyperparams.loc['max_depth'].astype(int)
lgb_hyperparams.loc['max_delta_step'] = lgb_hyperparams.loc['max_delta_step'].astype(int)
lgb_hyperparams.loc['min_data_in_leaf'] = lgb_hyperparams.loc['min_data_in_leaf'].astype(int)
lgb_hyperparams.loc['bagging_freq'] = lgb_hyperparams.loc['bagging_freq'].astype(int)

In [7]:

lgb_hyperparams

Unnamed: 0,updrs_1,updrs_2,updrs_3
bagging_fraction,0.766248,0.306936,0.820672
bagging_freq,4,3,8
boosting,dart,gbdt,dart
feature_fraction,0.736872,0.820175,0.696855
is_unbalance,true,true,true
lambda_l1,4.924833,1.72621,1.335757
lambda_l2,7.728147,9.968733,9.155462
learning_rate,0.229834,0.178539,0.866817
max_delta_step,1,5,9
max_depth,8,7,19


In [19]:
lgb_hyperopt_results = {"updrs_1":{"AUC":0, "Accuracy":0, "Recall":0, "Precision":0},
                        "updrs_2":{"AUC":0, "Accuracy":0, "Recall":0, "Precision":0},
                        "updrs_3":{"AUC":0, "Accuracy":0, "Recall":0, "Precision":0}}

for updrs in ['updrs_1', 'updrs_2', 'updrs_3']:
    updrs_lgb = lgb_hyperparams[updrs]
    
    # create a dictionary of the hyperparameters with the index as the key and the hyperparameters as the value
    updrs_lgb_dict = updrs_lgb.to_dict()
    
    # create the LGBMClassifier with the hyperparameters
    model = LGBMClassifier(**updrs_lgb_dict, metric='auc')
    # get only the updrs of interest data
    if updrs == 'updrs_1':
        X = updrs1_df.drop(columns=['patient_id', f'{updrs}_cat', 'visit_id', f'{updrs}']).values
        y = updrs1_df[f'{updrs}_cat'].values
    elif updrs == 'updrs_2':
        X = updrs2_df.drop(columns=['patient_id', f'{updrs}_cat', 'visit_id', f'{updrs}']).values
        y = updrs2_df[f'{updrs}_cat'].values
    elif updrs == 'updrs_3':
        X = updrs3_df.drop(columns=['patient_id', f'{updrs}_cat', 'visit_id', f'{updrs}']).values
        y = updrs3_df[f'{updrs}_cat'].values
    
    # perform the stratified k-fold cross validation
    kf = model_selection.StratifiedKFold(n_splits=5)
    auc, rec, acc, prec = [], [], [], []
    for idx in kf.split(X=X, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = X[train_idx]
        ytrain = y[train_idx]
        xtest = X[test_idx]
        ytest = y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_auc = roc_auc_score(ytest, preds)
        fold_prec = precision_score(ytest, preds)
        fold_acc = accuracy_score(ytest, preds)
        fold_rec = recall_score(ytest, preds)
        auc.append(fold_auc)
        rec.append(fold_rec)
        acc.append(fold_acc)
        prec.append(fold_prec)
        
    print(f'{updrs} LGBMClassifier AUC: {np.mean(auc)}')
    lgb_hyperopt_results[updrs]['AUC'] = np.mean(auc)
    lgb_hyperopt_results[updrs]['Accuracy'] = np.mean(acc)
    lgb_hyperopt_results[updrs]['Recall'] = np.mean(rec)
    lgb_hyperopt_results[updrs]['Precision'] = np.mean(prec)
    

[LightGBM] [Info] Number of positive: 146, number of negative: 222
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135000
[LightGBM] [Info] Number of data points in the train set: 368, number of used features: 1200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.396739 -> initscore=-0.419071
[LightGBM] [Info] Start training from score -0.419071
[LightGBM] [Info] Number of positive: 146, number of negative: 222
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 134831
[LightGBM] [Info] Number of data points in the train set: 368, number of used features: 1200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.396739 -> initscore=-0.419071
[LightGBM] [Info] Start training from score -0.419071
[LightGBM] [Info] Number of positive: 146, number of negative: 222
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135261
[LightGBM] [Info] Number of data points in the train set: 368, 

In [20]:
lgb_hyperopt_results

{'updrs_1': {'Accuracy': 0.6956521739130435,
  'AUC': 0.6725908700908702,
  'Recall': 0.5609609609609609,
  'Precision': 0.6282723325062034},
 'updrs_2': {'Accuracy': 0.7,
  'AUC': 0.6658285440613027,
  'Recall': 0.5770935960591133,
  'Precision': 0.5280074963795893},
 'updrs_3': {'Accuracy': 0.7243669374104157,
  'AUC': 0.6969444444444445,
  'Recall': 0.5760317460317459,
  'Precision': 0.6737746481992006}}

## Look at the XGBoost Results

In [15]:
xgb_hyperparams

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.512766,0.993532,0.606491
gamma,0.000416,0.014997,0.0737
learning_rate,0.047764,0.160403,0.040297
max_delta_step,9.0,6.0,8.0
max_depth,16.0,14.0,3.0
min_child_weight,0.466116,1.047142,2.008656
reg_alpha,2.214063,0.674621,0.103136
reg_lambda,6.423071,8.380209,1.346014
scale_pos_weight,2.186045,2.051922,2.346251
subsample,0.729392,0.687759,0.64926


In [21]:
xgb_hyperopt_results = {"updrs_1":{"AUC":0, "Accuracy":0, "Recall":0, "Precision":0},
                        "updrs_2":{"AUC":0, "Accuracy":0, "Recall":0, "Precision":0},
                        "updrs_3":{"AUC":0, "Accuracy":0, "Recall":0, "Precision":0}}

for updrs in ['updrs_1', 'updrs_2', 'updrs_3']:
    updrs_xgb = xgb_hyperparams[updrs]
    
    # create a dictionary of the hyperparameters with the index as the key and the hyperparameters as the value
    updrs_xgb_dict = updrs_xgb.to_dict()
    updrs_xgb_dict['max_depth'] = int(updrs_xgb_dict['max_depth'])
    updrs_xgb_dict['max_delta_step'] = int(updrs_xgb_dict['max_delta_step'])
    
    # create the XGBClassifier with the hyperparameters
    model = XGBClassifier(**updrs_xgb_dict, n_estimators=500, eval_metric='auc')
    # get only the updrs of interest data
    if updrs == 'updrs_1':
        X = updrs1_df.drop(columns=['patient_id', f'{updrs}_cat', 'visit_id', f'{updrs}']).values
        y = updrs1_df[f'{updrs}_cat'].values
    elif updrs == 'updrs_2':
        X = updrs2_df.drop(columns=['patient_id', f'{updrs}_cat', 'visit_id', f'{updrs}']).values
        y = updrs2_df[f'{updrs}_cat'].values
    elif updrs == 'updrs_3':
        X = updrs3_df.drop(columns=['patient_id', f'{updrs}_cat', 'visit_id', f'{updrs}']).values
        y = updrs3_df[f'{updrs}_cat'].values
    
    # perform the stratified k-fold cross validation
    kf = model_selection.StratifiedKFold(n_splits=5)
    auc = []
    for idx in kf.split(X=X, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = X[train_idx]
        ytrain = y[train_idx]
        xtest = X[test_idx]
        ytest = y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_auc = roc_auc_score(ytest, preds)
        fold_prec = precision_score(ytest, preds)
        fold_acc = accuracy_score(ytest, preds)
        fold_rec = recall_score(ytest, preds)
        auc.append(fold_auc)
        rec.append(fold_rec)
        acc.append(fold_acc)
        prec.append(fold_prec)
    print(f'{updrs} XGBClassifier AUC: {np.mean(auc)}')
    auc_results[updrs] = np.mean(auc)
    xgb_hyperopt_results[updrs]['AUC'] = np.mean(auc)
    xgb_hyperopt_results[updrs]['Accuracy'] = np.mean(acc)
    xgb_hyperopt_results[updrs]['Recall'] = np.mean(rec)
    xgb_hyperopt_results[updrs]['Precision'] = np.mean(prec)

updrs_1 XGBClassifier AUC: 0.7012337662337662
updrs_2 XGBClassifier AUC: 0.7231390257252326
updrs_3 XGBClassifier AUC: 0.7021031746031746


In [22]:
xgb_hyperopt_results

{'updrs_1': {'Accuracy': 0.7284878165312948,
  'AUC': 0.7012337662337662,
  'Recall': 0.563015873015873,
  'Precision': 0.6911721888356912},
 'updrs_2': {'Accuracy': 0.7523252110208631,
  'AUC': 0.7231390257252326,
  'Recall': 0.5492355409596789,
  'Precision': 0.7124886684232633},
 'updrs_3': {'Accuracy': 0.7459030100334448,
  'AUC': 0.7021031746031746,
  'Recall': 0.560299671592775,
  'Precision': 0.7016165013174473}}