In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score


## View Results from Hyperparameter Tuning For Current UPDRS
Using the protein and peptide data as well as the visit month, predict the UPDRS value as either Mild, Moderate, or Severe

In [11]:
# read in the data from the csv file for xgboost hyperparameter tuning
xgb1_hyperparams_df = pd.read_csv("~/parkinsons_proj_1/parkinsons_project/parkinsons_1/src/models/xgboost_24m_hyperparam_finetune_results_updrs_1.csv", index_col=0)
xgb2_hyperparams_df = pd.read_csv("~/parkinsons_proj_1/parkinsons_project/parkinsons_1/src/models/xgboost_24m_hyperparam_finetune_results_updrs_2.csv", index_col=0)
xgb3_hyperparams_df = pd.read_csv("~/parkinsons_proj_1/parkinsons_project/parkinsons_1/src/models/xgboost_24m_hyperparam_finetune_results_updrs_3.csv", index_col=0)

lgb1_hyperparams_df = pd.read_csv("~/parkinsons_proj_1/parkinsons_project/parkinsons_1/src/models/lgboost_24m_hyperparam_finetune_results_updrs_1.csv", index_col=0)
lgb2_hyperparams_df = pd.read_csv("~/parkinsons_proj_1/parkinsons_project/parkinsons_1/src/models/lgboost_24m_hyperparam_finetune_results_updrs_2.csv", index_col=0)
lgb3_hyperparams_df = pd.read_csv("~/parkinsons_proj_1/parkinsons_project/parkinsons_1/src/models/lgboost_24m_hyperparam_finetune_results_updrs_3.csv", index_col=0)


In [43]:
lgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.927765,0.861307,0.909645
learning_rate,0.302803,0.461413,0.004638
max_depth,6.0,3.0,2.0
min_child_weight,0.403494,1.084966,18.93466
min_split_gain,0.000537,0.009833,5.956643
reg_alpha,4.600247,0.887957,9.177749
reg_lambda,7.594063,9.787396,2.906254
subsample,0.836524,0.674584,0.590861


In [44]:
xgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.608034,0.699842,0.975992
gamma,0.023834,0.026007,0.92226
learning_rate,0.002434,0.780489,0.281493
max_depth,6.0,3.0,1.0
min_child_weight,5.925806,2.245957,7.755283
reg_alpha,3.182263,1.47304,4.193993
reg_lambda,1.619103,3.618716,5.388605
subsample,0.997353,0.97568,0.60965


In [45]:
# read in the protein and updrs data
updrs_df = pd.read_csv(
        "~/parkinsons_proj_1/parkinsons_project/parkinsons_1/data/processed/train_24month_protein_data.csv"
    )

In [46]:
updrs_df.head()

Unnamed: 0,patient_id,O00391_24m_max_diff,O00391_24m_min_diff,O00533_24m_max_diff,O00533_24m_min_diff,O00584_24m_max_diff,O00584_24m_min_diff,O14498_24m_max_diff,O14498_24m_min_diff,O14773_24m_max_diff,...,YSLTYIYTGLSK_P25311_max,YTTEIIK_P00736_max,YVGGQEHFAHLLILR_P02763_max,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR_P00738_max,YVMLPVADQDQC(UniMod_4)IR_P00738_max,YVNKEIQNAVNGVK_P10909_max,YWGVASFLQK_P02753_max,YYC(UniMod_4)FQGNQFLR_P02790_max,YYTYLIMNK_P01024_max,YYWGGQYTWDMAK_P02675_max
0,7832,0.0,0.0,13937.0,13937.0,15188.8,15188.8,10337.2,10337.2,11159.76,...,264123.0,10135.8,4308190.0,287228.0,1188190.0,105262.0,119051.0,530247.0,59186.5,41160.9
1,40874,138.9,138.9,-24110.0,-24110.0,-5059.1,-5059.1,-1893.1,-1893.1,-8046.59,...,225145.0,14107.7,5319090.0,5555.8,1764240.0,89180.0,160973.0,395954.0,52582.4,20639.5
2,23636,-9656.96,-9656.96,-138626.0,-138626.0,2473.7,2473.7,653.6,653.6,10038.74,...,195196.0,12993.9,5184580.0,89648.1,883685.0,62224.2,170098.0,647961.0,64110.0,12694.0
3,30119,0.0,0.0,182841.0,112577.0,9552.7,-1674.2,15416.4,12478.9,10746.38,...,185536.0,6870.4,4749580.0,166512.0,921488.0,52793.0,118695.0,501415.0,40288.8,50618.6
4,29417,0.0,0.0,154844.0,117353.0,4852.1,-10615.7,7993.2,6153.3,12549.3,...,195454.0,0.0,0.0,235477.0,428694.0,94570.5,106663.0,624249.0,34506.0,0.0


In [47]:
# get only the updrs of interest
# the updrs_1_max is the target, which means the highest categorical value
# 0 is mild parkinsons, and 1 is moderate to severe parkinsons as the max
updrs1_df = updrs_df.drop(columns=["updrs_2_max", "updrs_3_max"])
updrs2_df = updrs_df.drop(columns=["updrs_1_max", "updrs_3_max"])
updrs3_df = updrs_df.drop(columns=["updrs_1_max", "updrs_2_max"])

In [48]:
updrs1_df['updrs_1_max'].value_counts()

0.0    122
1.0     72
Name: updrs_1_max, dtype: int64

In [49]:
updrs2_df['updrs_2_max'].value_counts()

0.0    142
1.0     52
Name: updrs_2_max, dtype: int64

In [50]:
updrs3_df['updrs_3_max'].value_counts()

0.0    129
1.0     65
Name: updrs_3_max, dtype: int64

In [51]:
def create_kfolds(df, updrs):
    # create a new column for kfold and fill it with -1
    df["kfold"] = -1

    # randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)

    # calculate the number of bins using Sturge's rule
    # I am using the max here to ensure that the number of bins is at least 5
    # and at most 12
    num_bins = int(np.floor(1 + np.log2(len(df))))

    # bin targets
    df.loc[:, "bins"] = pd.cut(df[f"{updrs}_max"], bins=num_bins, labels=False)

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column
    # note that instead of targets we are using bins!
    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.bins.values)):
        df.loc[val_idx, "kfold"] = fold

    # drop the bins column
    df = df.drop("bins", axis=1)

    # return dataframe with folds
    return df

In [52]:
# add kfolds to the df
updrs1_df = create_kfolds(updrs1_df, 'updrs_1')
updrs2_df = create_kfolds(updrs2_df, 'updrs_2')
updrs3_df = create_kfolds(updrs3_df, 'updrs_3')

In [53]:

def cross_fold_validation(df, model, target):

    updrs_results = dict()
    
    for fold in range(0, 5):
        # get the train and test data for the current fold
        train = df[df['kfold'] != fold].reset_index(drop=True)
        test = df[df['kfold'] == fold].reset_index(drop=True)

        # get the train and test data for the current fold
        drop_cols = ['patient_id', f'{target}_max', 'kfold']
        X_train = train.drop(columns=drop_cols)
        y_train = train[f'{target}_max']
        X_test = test.drop(columns=drop_cols)
        y_test = test[f'{target}_max']

        # train the model
        model.fit(X_train, y_train)

        # make predictions
        preds = model.predict(X_test)


        # save the results
        updrs_results[f'{target}_fold_{fold}'] = {
            'auc_score': roc_auc_score(y_test, preds),
            'acc_score': accuracy_score(y_test, preds),
            'precision_score': precision_score(y_test, preds),
            'recall_score': recall_score(y_test, preds),
        }
        
    mean_auc = np.mean([updrs_results[f'{target}_fold_{fold}']['auc_score'] for fold in range(0, 5)])
    mean_acc = np.mean([updrs_results[f'{target}_fold_{fold}']['acc_score'] for fold in range(0, 5)])
    mean_precision = np.mean([updrs_results[f'{target}_fold_{fold}']['precision_score'] for fold in range(0, 5)])
    mean_recall = np.mean([updrs_results[f'{target}_fold_{fold}']['recall_score'] for fold in range(0, 5)])
    
    return mean_auc, mean_acc, mean_precision, mean_recall
        
    
    

In [54]:
xgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.608034,0.699842,0.975992
gamma,0.023834,0.026007,0.92226
learning_rate,0.002434,0.780489,0.281493
max_depth,6.0,3.0,1.0
min_child_weight,5.925806,2.245957,7.755283
reg_alpha,3.182263,1.47304,4.193993
reg_lambda,1.619103,3.618716,5.388605
subsample,0.997353,0.97568,0.60965


In [55]:
def prepare_xgboost_model(xgb_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = xgb_hyperparams_df[target].to_dict()
    updrs_hp['max_depth'] = int(updrs_hp['max_depth'])
    updrs_hp['eval_metric'] = 'auc'
    model = XGBClassifier(**updrs_hp)
    return model

In [56]:
# test the model function
model = prepare_xgboost_model(xgb_hyperparams_df, 'updrs_1')
model.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.6080342999653765,
 'gamma': 0.0238343456844193,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': 0.0024335042387472,
 'max_delta_step': None,
 'max_depth': 6,
 'min_child_weight': 5.925806478040236,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': 3.1822625312009545,
 'reg_lambda': 1.6191029232254417,
 'scale_pos_weight': None,
 'subsample': 0.9973528183597652,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'eval_metric': 'auc'}

In [57]:
xgb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_xgboost_model(xgb_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    xgb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}



In [58]:
xgb_results

{'updrs_1': {'auc': 0.588452380952381,
  'acc': 0.6491228070175439,
  'prec': 0.5775757575757575,
  'recall': 0.3485714285714286},
 'updrs_2': {'auc': 0.5489341692789969,
  'acc': 0.7008097165991903,
  'prec': 0.3129004329004329,
  'recall': 0.22545454545454544},
 'updrs_3': {'auc': 0.5264615384615385,
  'acc': 0.6442645074224022,
  'prec': 0.3704761904761904,
  'recall': 0.16923076923076924}}

## LightGBM Classifier Results

In [59]:
def prepare_lgboost_model(lgb_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = lgb_hyperparams_df[target].to_dict()
    updrs_hp['max_depth'] = int(updrs_hp['max_depth'])
    updrs_hp['metric'] = 'AUC'
    model = LGBMClassifier(**updrs_hp)
    return model

In [60]:
lgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.927765,0.861307,0.909645
learning_rate,0.302803,0.461413,0.004638
max_depth,6.0,3.0,2.0
min_child_weight,0.403494,1.084966,18.93466
min_split_gain,0.000537,0.009833,5.956643
reg_alpha,4.600247,0.887957,9.177749
reg_lambda,7.594063,9.787396,2.906254
subsample,0.836524,0.674584,0.590861


In [61]:
lgb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_lgboost_model(lgb_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    lgb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
lgb_results

{'updrs_1': {'auc': 0.581,
  'acc': 0.6545209176788125,
  'prec': 0.5666666666666667,
  'recall': 0.29333333333333333},
 'updrs_2': {'auc': 0.4962651141961487,
  'acc': 0.6747638326585694,
  'prec': 0.24000000000000005,
  'recall': 0.11272727272727275},
 'updrs_3': {'auc': 0.5,
  'acc': 0.6649122807017543,
  'prec': 0.0,
  'recall': 0.0}}

In [40]:
train_df = updrs1_df[updrs1_df['kfold'] != 4].reset_index(drop=True)
test_df = updrs1_df[updrs1_df['kfold'] == 4].reset_index(drop=True)
X_train = train_df.drop(columns=['patient_id', 'kfold', 'updrs_1_max'])
y_train = train_df['updrs_1_max']
X_test = test_df.drop(columns=['patient_id', 'kfold', 'updrs_1_max'])
y_test = test_df['updrs_1_max']

model = prepare_lgboost_model(lgb_hyperparams_df, 'updrs_1')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

test_df['preds'] = y_pred


In [41]:
test_df[['updrs_1_max', 'preds']]

Unnamed: 0,updrs_1_max,preds
0,1.0,0.0
1,1.0,0.0
2,1.0,1.0
3,0.0,0.0
4,1.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,1.0,1.0
