In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score


## View Results from Hyperparameter Tuning For Current UPDRS
Using the protein and peptide data as well as the visit month, predict the UPDRS value as either Mild, Moderate, or Severe

In [29]:
# read in the data from the csv file for xgboost hyperparameter tuning
xgb_hyperparams_df = pd.read_csv('../data/processed/xgboost_cat_hyperparam_results.csv', index_col=0)
lgb_hyperparams_df = pd.read_csv('../data/processed/lgboost_cat_hyperparam_results.csv', index_col=0)

In [30]:
lgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.873455,0.885309,0.556294
min_split_gain,1.025932,0.000952,0.073092
learning_rate,0.940496,0.574412,0.684648
max_depth,1.0,1.0,2.0
min_child_weight,0.894024,2.040004,2.496106
reg_alpha,0.222292,2.131621,4.798424
reg_lambda,6.000214,5.622438,4.85703
subsample,0.606967,0.895645,0.961862


In [31]:
xgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.796869,0.703083,0.888702
gamma,0.020274,0.005378,0.00023
learning_rate,0.920227,0.756913,0.637066
max_depth,7.0,4.0,3.0
min_child_weight,0.283017,2.460482,2.294683
reg_alpha,1.582413,5.804418,0.829745
reg_lambda,2.446018,7.883427,2.323503
subsample,0.757228,0.691198,0.846702


In [60]:
# read in the protein and updrs data
updrs1_df = pd.read_csv('../data/processed/train_updrs_1_cat.csv')
updrs2_df = pd.read_csv('../data/processed/train_updrs_2_cat.csv')
updrs3_df = pd.read_csv('../data/processed/train_updrs_3_cat.csv')

In [61]:
updrs1_df['updrs_1_cat'].value_counts()

mild        854
moderate    199
severe       15
Name: updrs_1_cat, dtype: int64

In [62]:
# replace the categorical updrs scores with numerical for mild, moderate and severe
## combine the moderate and severe categories since there are very few severe observations
updrs1_df['updrs_1_cat'] = updrs1_df['updrs_1_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})
updrs2_df['updrs_2_cat'] = updrs2_df['updrs_2_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})
updrs3_df['updrs_3_cat'] = updrs3_df['updrs_3_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})

In [63]:

def cross_fold_validation(df, model, target):

    updrs_results = dict()
    
    for fold in range(0, 5):
        # get the train and test data for the current fold
        train = df[df['kfold'] != fold].reset_index(drop=True)
        test = df[df['kfold'] == fold].reset_index(drop=True)

        # get the train and test data for the current fold
        X_train = train.drop(columns=['visit_id', 'patient_id', f'{target}', 'kfold', f'{target}_cat'])
        y_train = train[f'{target}_cat']
        X_test = test.drop(columns=['visit_id', 'patient_id', f'{target}', 'kfold', f'{target}_cat'])
        y_test = test[f'{target}_cat']

        # train the model
        model.fit(X_train, y_train)

        # make predictions
        preds = model.predict(X_test)


        # save the results
        updrs_results[f'{target}_fold_{fold}'] = {
            'auc_score': roc_auc_score(y_test, preds),
            'acc_score': accuracy_score(y_test, preds),
            'precision_score': precision_score(y_test, preds),
            'recall_score': recall_score(y_test, preds),
        }
        
    mean_auc = np.mean([updrs_results[f'{target}_fold_{fold}']['auc_score'] for fold in range(0, 5)])
    mean_acc = np.mean([updrs_results[f'{target}_fold_{fold}']['acc_score'] for fold in range(0, 5)])
    mean_precision = np.mean([updrs_results[f'{target}_fold_{fold}']['precision_score'] for fold in range(0, 5)])
    mean_recall = np.mean([updrs_results[f'{target}_fold_{fold}']['recall_score'] for fold in range(0, 5)])
    
    return mean_auc, mean_acc, mean_precision, mean_recall
        
    
    

In [64]:
def prepare_xgboost_model(xgb_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = xgb_hyperparams_df[target].to_dict()
    updrs_hp['max_depth'] = int(updrs_hp['max_depth'])
    model = XGBClassifier(**updrs_hp)
    return model

In [65]:
xgb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_xgboost_model(xgb_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    xgb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}





























































In [66]:
xgb_results

{'updrs_1': {'auc': 0.6093154487852749,
  'acc': 0.8052125839147031,
  'prec': 0.5327690700104493,
  'recall': 0.2820071412099511},
 'updrs_2': {'auc': 0.5974217666205179,
  'acc': 0.8557851783598789,
  'prec': 0.536969696969697,
  'recall': 0.22995267072346176},
 'updrs_3': {'auc': 0.6031350592526609,
  'acc': 0.8393186086023429,
  'prec': 0.557085020242915,
  'recall': 0.2470530802449308}}

## LightGBM Classifier Results

In [67]:
def prepare_lgboost_model(lgb_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = lgb_hyperparams_df[target].to_dict()
    updrs_hp['max_depth'] = int(updrs_hp['max_depth'])
    model = LGBMClassifier(**updrs_hp)
    return model

In [68]:
lgb_hyperparams_df.head()

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.873455,0.885309,0.556294
min_split_gain,1.025932,0.000952,0.073092
learning_rate,0.940496,0.574412,0.684648
max_depth,1.0,1.0,2.0
min_child_weight,0.894024,2.040004,2.496106


In [69]:
lgb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_lgboost_model(lgb_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    lgb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

In [70]:
lgb_results

{'updrs_1': {'auc': 0.6151252296757459,
  'acc': 0.8033390373393006,
  'prec': 0.520641309155241,
  'recall': 0.30190212639789565},
 'updrs_2': {'auc': 0.5859883723833501,
  'acc': 0.8661182045544293,
  'prec': 0.6645021645021645,
  'recall': 0.1884504271928207},
 'updrs_3': {'auc': 0.6041906052528299,
  'acc': 0.8506706608244656,
  'prec': 0.6663461538461538,
  'recall': 0.23215966390839057}}

In [79]:
train_df = updrs1_df[updrs1_df['kfold'] != 4].reset_index(drop=True)
test_df = updrs1_df[updrs1_df['kfold'] == 4].reset_index(drop=True)
X_train = train_df.drop(columns=['visit_id', 'patient_id', 'updrs_1', 'kfold', 'updrs_1_cat'])
y_train = train_df['updrs_1_cat']
X_test = test_df.drop(columns=['visit_id', 'patient_id', 'updrs_1', 'kfold', 'updrs_1_cat'])
y_test = test_df['updrs_1_cat']

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

test_df['preds'] = y_pred


In [80]:
test_df['preds'].value_counts()

0    173
1     40
Name: preds, dtype: int64

In [81]:
wrong_preds = test_df[test_df['updrs_1_cat'] != test_df['preds']]
wrong_preds.shape

(2, 1205)

In [82]:
wrong_preds['updrs_1_cat'].value_counts()

1    2
Name: updrs_1_cat, dtype: int64

## View XGBoost Results with Forecasting the UPDRS

In [43]:
# get the max category for each patient
max_df = updrs1_df.groupby(['patient_id'])['updrs_1_cat'].max().reset_index()
max_df = max_df.rename(columns={'updrs_1_cat': 'updrs_1_max_cat'})
# merge the max category with the original dataframe
updrs1_df = updrs1_df.merge(max_df, on=['patient_id'], how='left')
# take only the visit months that are 12 or less
updrs1_yr_df = updrs1_df[updrs1_df['visit_month'] <= 12]
updrs1_yr_df = updrs1_yr_df.drop(columns=['updrs_1_cat'])
updrs1_yr_df.rename(columns={'updrs_1_max_cat': 'updrs_1_cat'}, inplace=True)

In [44]:
# get the max category for each patient
max_df = updrs2_df.groupby(['patient_id'])['updrs_2_cat'].max().reset_index()
max_df = max_df.rename(columns={'updrs_2_cat': 'updrs_2_max_cat'})
# merge the max category with the original dataframe
updrs2_df = updrs2_df.merge(max_df, on=['patient_id'], how='left')
# take only the visit months that are 12 or less
updrs2_yr_df = updrs2_df[updrs2_df['visit_month'] <= 12]
updrs2_yr_df = updrs2_yr_df.drop(columns=['updrs_2_cat'])
updrs2_yr_df.rename(columns={'updrs_2_max_cat': 'updrs_2_cat'}, inplace=True)

In [45]:
# get the max category for each patient
max_df = updrs3_df.groupby(['patient_id'])['updrs_3_cat'].max().reset_index()
max_df = max_df.rename(columns={'updrs_3_cat': 'updrs_3_max_cat'})
# merge the max category with the original dataframe
updrs3_df = updrs3_df.merge(max_df, on=['patient_id'], how='left')
# take only the visit months that are 12 or less
updrs3_yr_df = updrs3_df[updrs3_df['visit_month'] <= 12]
updrs3_yr_df = updrs3_yr_df.drop(columns=['updrs_3_cat'])
updrs3_yr_df.rename(columns={'updrs_3_max_cat': 'updrs_3_cat'}, inplace=True)

In [46]:
xgb_forecast_hyperparams_df = pd.read_csv('../data/processed/xgboost_future_cat_hyperparam_results.csv', index_col=0)
lgb_forecast_hyperparams_df = pd.read_csv('../data/processed/lgboost_future_cat_hyperparam_results.csv', index_col=0)

In [47]:
lgb_forecast_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.811327,0.98416,0.895658
min_split_gain,9.6e-05,1.272034,0.001123
learning_rate,0.999408,0.993501,0.515028
max_depth,8.0,8.0,2.0
min_child_weight,2.174744,2.421318,0.218445
reg_alpha,1.434613,0.133531,0.750725
reg_lambda,1.360761,5.750833,2.372409
subsample,0.650306,0.857437,0.637569


In [48]:
xgb_forecast_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.654963,0.537749,0.671407
gamma,0.019979,2.102313,0.274657
learning_rate,0.95537,0.381512,0.984645
max_depth,7.0,1.0,8.0
min_child_weight,0.954945,0.922874,0.38124
reg_alpha,2.555897,2.143399,0.016335
reg_lambda,2.67818,2.99109,2.369731
subsample,0.857239,0.960498,0.841739


In [49]:
xgb_forecast_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_yr_df, updrs2_yr_df, updrs3_yr_df]):
    model = prepare_xgboost_model(xgb_forecast_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    xgb_forecast_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}





























































In [50]:
xgb_forecast_results

{'updrs_1': {'auc': 0.627263605237623,
  'acc': 0.6597958631565353,
  'prec': 0.6092886111709641,
  'recall': 0.44451518234007975},
 'updrs_2': {'auc': 0.6038171736597545,
  'acc': 0.7180423742923743,
  'prec': 0.5844444444444444,
  'recall': 0.3060383963862225},
 'updrs_3': {'auc': 0.6472226542235492,
  'acc': 0.6761404407023932,
  'prec': 0.5981565275682923,
  'recall': 0.518812769010043}}

## LGBoost Future Categorical Predictions

In [51]:
lgb_forecast_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_yr_df, updrs2_yr_df, updrs3_yr_df]):
    model = prepare_lgboost_model(lgb_forecast_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    lgb_forecast_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

In [52]:
lgb_forecast_results

{'updrs_1': {'auc': 0.639971567890737,
  'acc': 0.672581877009455,
  'prec': 0.6115300127713921,
  'recall': 0.47299858524208727},
 'updrs_2': {'auc': 0.6270216415015286,
  'acc': 0.7222527472527472,
  'prec': 0.5772689075630252,
  'recall': 0.3796517974778844},
 'updrs_3': {'auc': 0.6714000151485306,
  'acc': 0.7035588045663139,
  'prec': 0.6505525846702318,
  'recall': 0.5241950194712032}}