In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score


## View Results from Hyperparameter Tuning For Current UPDRS
Using the protein and peptide data as well as the visit month, predict the UPDRS value as either Mild, Moderate, or Severe

In [2]:
# read in the data from the csv file for xgboost hyperparameter tuning
xgb_hyperparams_df = pd.read_csv("../data/processed/xgboost_future_cat_24m_hyperparam_results.csv", index_col=0)
lgb_hyperparams_df = pd.read_csv("../data/processed/lgboost_future_cat_24m_hyperparam_results.csv", index_col=0)

In [3]:
lgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.788359,0.707755,0.96449
learning_rate,0.280928,0.882243,0.149543
max_depth,3.0,8.0,5.0
min_child_weight,11.736254,0.1459,0.346503
min_split_gain,6.4e-05,11.265839,19413.98576
reg_alpha,1.063561,3.480579,4.746389
reg_lambda,4.071927,7.083994,3.50809
subsample,0.885743,0.988104,0.906154


In [4]:
xgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.684247,0.584096,0.996983
gamma,0.00016,0.095862,0.004415
learning_rate,0.009385,0.001702,0.077006
max_depth,3.0,3.0,4.0
min_child_weight,0.59533,0.137404,1.717018
reg_alpha,1.665443,2.589235,0.729288
reg_lambda,2.213086,3.162691,4.090971
subsample,0.818436,0.760515,0.813584


In [5]:
# read in the protein and updrs data
updrs_df = pd.read_csv(
        "~/parkinsons_proj_1/parkinsons_project/parkinsons_1/data/processed/train_24month_protein_data.csv"
    )

In [6]:
updrs_df.head()

Unnamed: 0,patient_id,O00391_24m_max_diff,O00391_24m_min_diff,O00533_24m_max_diff,O00533_24m_min_diff,O00584_24m_max_diff,O00584_24m_min_diff,O14498_24m_max_diff,O14498_24m_min_diff,O14773_24m_max_diff,...,YSLTYIYTGLSK_P25311_max,YTTEIIK_P00736_max,YVGGQEHFAHLLILR_P02763_max,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR_P00738_max,YVMLPVADQDQC(UniMod_4)IR_P00738_max,YVNKEIQNAVNGVK_P10909_max,YWGVASFLQK_P02753_max,YYC(UniMod_4)FQGNQFLR_P02790_max,YYTYLIMNK_P01024_max,YYWGGQYTWDMAK_P02675_max
0,7832,0.0,0.0,13937.0,13937.0,15188.8,15188.8,10337.2,10337.2,11159.76,...,264123.0,10135.8,4308190.0,287228.0,1188190.0,105262.0,119051.0,530247.0,59186.5,41160.9
1,40874,138.9,138.9,-24110.0,-24110.0,-5059.1,-5059.1,-1893.1,-1893.1,-8046.59,...,225145.0,14107.7,5319090.0,5555.8,1764240.0,89180.0,160973.0,395954.0,52582.4,20639.5
2,23636,-9656.96,-9656.96,-138626.0,-138626.0,2473.7,2473.7,653.6,653.6,10038.74,...,195196.0,12993.9,5184580.0,89648.1,883685.0,62224.2,170098.0,647961.0,64110.0,12694.0
3,30119,0.0,0.0,182841.0,112577.0,9552.7,-1674.2,15416.4,12478.9,10746.38,...,185536.0,6870.4,4749580.0,166512.0,921488.0,52793.0,118695.0,501415.0,40288.8,50618.6
4,29417,0.0,0.0,154844.0,117353.0,4852.1,-10615.7,7993.2,6153.3,12549.3,...,195454.0,0.0,0.0,235477.0,428694.0,94570.5,106663.0,624249.0,34506.0,0.0


In [8]:
# get only the updrs of interest
# the updrs_1_max is the target, which means the highest categorical value
# 0 is mild parkinsons, and 1 is moderate to severe parkinsons as the max
updrs1_df = updrs_df.drop(columns=["updrs_2_max", "updrs_3_max"])
updrs2_df = updrs_df.drop(columns=["updrs_1_max", "updrs_3_max"])
updrs3_df = updrs_df.drop(columns=["updrs_1_max", "updrs_2_max"])

In [9]:
updrs1_df['updrs_1_max'].value_counts()

0.0    122
1.0     72
Name: updrs_1_max, dtype: int64

In [10]:
updrs2_df['updrs_2_max'].value_counts()

0.0    142
1.0     52
Name: updrs_2_max, dtype: int64

In [11]:
updrs3_df['updrs_3_max'].value_counts()

0.0    129
1.0     65
Name: updrs_3_max, dtype: int64

In [12]:
def create_kfolds(df, updrs):
    # create a new column for kfold and fill it with -1
    df["kfold"] = -1

    # randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)

    # calculate the number of bins using Sturge's rule
    # I am using the max here to ensure that the number of bins is at least 5
    # and at most 12
    num_bins = int(np.floor(1 + np.log2(len(df))))

    # bin targets
    df.loc[:, "bins"] = pd.cut(df[f"{updrs}_max"], bins=num_bins, labels=False)

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column
    # note that instead of targets we are using bins!
    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.bins.values)):
        df.loc[val_idx, "kfold"] = fold

    # drop the bins column
    df = df.drop("bins", axis=1)

    # return dataframe with folds
    return df

In [15]:
# add kfolds to the df
updrs1_df = create_kfolds(updrs1_df, 'updrs_1')
updrs2_df = create_kfolds(updrs2_df, 'updrs_2')
updrs3_df = create_kfolds(updrs3_df, 'updrs_3')

In [16]:

def cross_fold_validation(df, model, target):

    updrs_results = dict()
    
    for fold in range(0, 5):
        # get the train and test data for the current fold
        train = df[df['kfold'] != fold].reset_index(drop=True)
        test = df[df['kfold'] == fold].reset_index(drop=True)

        # get the train and test data for the current fold
        drop_cols = ['patient_id', f'{target}_max', 'kfold']
        X_train = train.drop(columns=drop_cols)
        y_train = train[f'{target}_max']
        X_test = test.drop(columns=drop_cols)
        y_test = test[f'{target}_max']

        # train the model
        model.fit(X_train, y_train)

        # make predictions
        preds = model.predict(X_test)


        # save the results
        updrs_results[f'{target}_fold_{fold}'] = {
            'auc_score': roc_auc_score(y_test, preds),
            'acc_score': accuracy_score(y_test, preds),
            'precision_score': precision_score(y_test, preds),
            'recall_score': recall_score(y_test, preds),
        }
        
    mean_auc = np.mean([updrs_results[f'{target}_fold_{fold}']['auc_score'] for fold in range(0, 5)])
    mean_acc = np.mean([updrs_results[f'{target}_fold_{fold}']['acc_score'] for fold in range(0, 5)])
    mean_precision = np.mean([updrs_results[f'{target}_fold_{fold}']['precision_score'] for fold in range(0, 5)])
    mean_recall = np.mean([updrs_results[f'{target}_fold_{fold}']['recall_score'] for fold in range(0, 5)])
    
    return mean_auc, mean_acc, mean_precision, mean_recall
        
    
    

In [29]:
xgb_hyperparams_df.head()

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.684247,0.584096,0.996983
gamma,0.00016,0.095862,0.004415
learning_rate,0.009385,0.001702,0.077006
max_depth,3.0,3.0,4.0
min_child_weight,0.59533,0.137404,1.717018


In [17]:
def prepare_xgboost_model(xgb_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = xgb_hyperparams_df[target].to_dict()
    updrs_hp['max_depth'] = int(updrs_hp['max_depth'])
    model = XGBClassifier(**updrs_hp)
    return model

In [19]:
# test the model function
model = prepare_xgboost_model(xgb_hyperparams_df, 'updrs_1')
model.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.6842473934580278,
 'gamma': 0.0001595641595819,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': 0.0093846672717342,
 'max_delta_step': None,
 'max_depth': 3,
 'min_child_weight': 0.5953295257830465,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': 1.6654430506226268,
 'reg_lambda': 2.213085505829708,
 'scale_pos_weight': None,
 'subsample': 0.8184363274620908,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [20]:
xgb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_xgboost_model(xgb_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    xgb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}





























  _warn_prf(average, modifier, msg_start, len(result))
































In [21]:
xgb_results

{'updrs_1': {'auc': 0.536952380952381,
  'acc': 0.6183535762483131,
  'prec': 0.475,
  'recall': 0.22190476190476188},
 'updrs_2': {'auc': 0.5184012539184953,
  'acc': 0.7218623481781377,
  'prec': 0.5066666666666666,
  'recall': 0.07818181818181817},
 'updrs_3': {'auc': 0.5226153846153846,
  'acc': 0.639136302294197,
  'prec': 0.43272727272727274,
  'recall': 0.16923076923076924}}

## LightGBM Classifier Results

In [22]:
def prepare_lgboost_model(lgb_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = lgb_hyperparams_df[target].to_dict()
    updrs_hp['max_depth'] = int(updrs_hp['max_depth'])
    model = LGBMClassifier(**updrs_hp)
    return model

In [23]:
lgb_hyperparams_df.head()

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.788359,0.707755,0.96449
learning_rate,0.280928,0.882243,0.149543
max_depth,3.0,8.0,5.0
min_child_weight,11.736254,0.1459,0.346503
min_split_gain,6.4e-05,11.265839,19413.98576


In [24]:
lgb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_lgboost_model(lgb_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    lgb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
lgb_results

{'updrs_1': {'auc': 0.5789285714285715,
  'acc': 0.6340080971659919,
  'prec': 0.5111111111111111,
  'recall': 0.3628571428571429},
 'updrs_2': {'auc': 0.7084684281236007,
  'acc': 0.6183535762483131,
  'prec': 0.40543290043290037,
  'recall': 0.9036363636363637},
 'updrs_3': {'auc': 0.5,
  'acc': 0.6649122807017543,
  'prec': 0.0,
  'recall': 0.0}}

In [26]:
train_df = updrs1_df[updrs1_df['kfold'] != 4].reset_index(drop=True)
test_df = updrs1_df[updrs1_df['kfold'] == 4].reset_index(drop=True)
X_train = train_df.drop(columns=['patient_id', 'kfold', 'updrs_1_max'])
y_train = train_df['updrs_1_max']
X_test = test_df.drop(columns=['patient_id', 'kfold', 'updrs_1_max'])
y_test = test_df['updrs_1_max']

model = prepare_lgboost_model(lgb_hyperparams_df, 'updrs_1')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

test_df['preds'] = y_pred


In [28]:
test_df[['updrs_1_max', 'preds']]

Unnamed: 0,updrs_1_max,preds
0,1.0,1.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,1.0,0.0
6,1.0,1.0
7,1.0,1.0
8,0.0,1.0
9,0.0,0.0
