In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score


## View Results from Hyperparameter Tuning For Current UPDRS
Using the protein and peptide data as well as the visit month, predict the UPDRS value as either Mild, Moderate, or Severe

In [44]:
# read in the data from the csv file for xgboost hyperparameter tuning
xgb_hyperparams_df = pd.read_csv('../data/processed/xgboost_cat_hyperparam_results.csv', index_col=0)
lgb_hyperparams_df = pd.read_csv('../data/processed/lgboost_cat_hyperparam_results.csv', index_col=0)
cboost_hyperparams_df = pd.read_csv('../data/processed/catboost_future_cat_hyperparam_results.csv', index_col=0)

In [45]:
lgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.629116,0.856101,0.731011
learning_rate,0.628567,0.750618,0.556761
max_depth,5.0,8.0,3.0
min_child_weight,3.578848,16.437261,5.512251
min_split_gain,0.005489,4.6e-05,0.000197
reg_alpha,7.73583,6.684875,1.484969
reg_lambda,3.493592,3.002659,3.025202
subsample,0.916682,0.882994,0.556561


In [46]:
xgb_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.708264,0.694449,0.643855
gamma,1.070175,0.03239,1.949759
learning_rate,0.969325,0.993978,0.50235
max_depth,2.0,3.0,2.0
min_child_weight,0.3015,0.926601,2.980635
reg_alpha,1.641284,5.318046,3.232656
reg_lambda,6.011928,6.999249,7.487927
subsample,0.900018,0.710181,0.885401


In [47]:
cboost_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
bagging_temperature,4.429427,4.095758,1.162359
depth,7.0,6.0,4.0
l2_leaf_reg,5.641367,5.757078,7.35671
learning_rate,0.366781,0.954836,0.973876
min_data_in_leaf,8.0,8.0,4.0


In [48]:
# read in the protein and updrs data
updrs1_df = pd.read_csv('../data/processed/train_updrs_1_cat.csv')
updrs2_df = pd.read_csv('../data/processed/train_updrs_2_cat.csv')
updrs3_df = pd.read_csv('../data/processed/train_updrs_3_cat.csv')

In [49]:
updrs1_df['updrs_1_cat'].value_counts()

mild        854
moderate    199
severe       15
Name: updrs_1_cat, dtype: int64

In [50]:
updrs2_df['updrs_2_cat'].value_counts()

mild        910
moderate    158
Name: updrs_2_cat, dtype: int64

In [51]:
updrs3_df['updrs_3_cat'].value_counts()

mild        880
moderate    168
severe       10
Name: updrs_3_cat, dtype: int64

In [52]:
# replace the categorical updrs scores with numerical for mild, moderate and severe
## combine the moderate and severe categories since there are very few severe observations
updrs1_df['updrs_1_cat'] = updrs1_df['updrs_1_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})
updrs2_df['updrs_2_cat'] = updrs2_df['updrs_2_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})
updrs3_df['updrs_3_cat'] = updrs3_df['updrs_3_cat'].map({'mild': 0, 'moderate': 1, 'severe': 1})

In [53]:
updrs3_df['updrs_3_cat'].value_counts()

0    880
1    178
Name: updrs_3_cat, dtype: int64

In [54]:
updrs3_df.columns

Index(['visit_id', 'patient_id', 'visit_month', 'updrs_3', 'O00391', 'O00533',
       'O00584', 'O14498', 'O14773', 'O14791',
       ...
       'YVNKEIQNAVNGVK_P10909', 'YWGVASFLQK_P02753',
       'YYC(UniMod_4)FQGNQFLR_P02790', 'YYTYLIMNK_P01024',
       'YYWGGQYTWDMAK_P02675', 'kfold', 'num_prot_pep', 'num_prot', 'num_pept',
       'updrs_3_cat'],
      dtype='object', length=1204)

In [55]:

def cross_fold_validation(df, model, target):

    updrs_results = dict()
    
    for fold in range(0, 5):
        # get the train and test data for the current fold
        train = df[df['kfold'] != fold].reset_index(drop=True)
        test = df[df['kfold'] == fold].reset_index(drop=True)

        # get the train and test data for the current fold
        drop_cols = ['visit_id', 'patient_id', f'{target}', 'kfold', f'{target}_cat']
        X_train = train.drop(columns=drop_cols)
        y_train = train[f'{target}_cat']
        X_test = test.drop(columns=drop_cols)
        y_test = test[f'{target}_cat']

        # train the model
        model.fit(X_train, y_train)

        # make predictions
        preds = model.predict(X_test)


        # save the results
        updrs_results[f'{target}_fold_{fold}'] = {
            'auc_score': roc_auc_score(y_test, preds),
            'acc_score': accuracy_score(y_test, preds),
            'precision_score': precision_score(y_test, preds),
            'recall_score': recall_score(y_test, preds),
        }
        
    mean_auc = np.mean([updrs_results[f'{target}_fold_{fold}']['auc_score'] for fold in range(0, 5)])
    mean_acc = np.mean([updrs_results[f'{target}_fold_{fold}']['acc_score'] for fold in range(0, 5)])
    mean_precision = np.mean([updrs_results[f'{target}_fold_{fold}']['precision_score'] for fold in range(0, 5)])
    mean_recall = np.mean([updrs_results[f'{target}_fold_{fold}']['recall_score'] for fold in range(0, 5)])
    
    return mean_auc, mean_acc, mean_precision, mean_recall
        
    
    

In [56]:
def prepare_xgboost_model(xgb_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = xgb_hyperparams_df[target].to_dict()
    updrs_hp['max_depth'] = int(updrs_hp['max_depth'])
    model = XGBClassifier(**updrs_hp)
    return model

In [57]:
# test the model function
# model = prepare_xgboost_model(xgb_hyperparams_df, 'updrs_1')
# model.get_params()

In [58]:
xgb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_xgboost_model(xgb_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    xgb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}





























































In [59]:
xgb_results

{'updrs_1': {'auc': 0.6228153495890046,
  'acc': 0.8033916897020754,
  'prec': 0.5157675273775034,
  'recall': 0.3231265220685688},
 'updrs_2': {'auc': 0.6172550362778326,
  'acc': 0.8642183317976395,
  'prec': 0.5985714285714285,
  'recall': 0.26631630708709814},
 'updrs_3': {'auc': 0.6185961118239277,
  'acc': 0.8591969954395065,
  'prec': 0.7281562881562882,
  'recall': 0.25652528589710255}}

## LightGBM Classifier Results

In [60]:
def prepare_lgboost_model(lgb_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = lgb_hyperparams_df[target].to_dict()
    updrs_hp['max_depth'] = int(updrs_hp['max_depth'])
    model = LGBMClassifier(**updrs_hp)
    return model

In [61]:
lgb_hyperparams_df.head()

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.629116,0.856101,0.731011
learning_rate,0.628567,0.750618,0.556761
max_depth,5.0,8.0,3.0
min_child_weight,3.578848,16.437261,5.512251
min_split_gain,0.005489,4.6e-05,0.000197


In [62]:
lgb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_lgboost_model(lgb_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    lgb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

In [63]:
lgb_results

{'updrs_1': {'auc': 0.5658502432545193,
  'acc': 0.79680575665833,
  'prec': 0.48523125996810207,
  'recall': 0.1797514852179319},
 'updrs_2': {'auc': 0.5692116705945804,
  'acc': 0.8511078934667193,
  'prec': 0.5183566433566433,
  'recall': 0.1690761571086115},
 'updrs_3': {'auc': 0.6167585795325474,
  'acc': 0.8535097916480373,
  'prec': 0.6657518951636598,
  'recall': 0.2596450440253496}}

In [64]:
train_df = updrs1_df[updrs1_df['kfold'] != 4].reset_index(drop=True)
test_df = updrs1_df[updrs1_df['kfold'] == 4].reset_index(drop=True)
X_train = train_df.drop(columns=['visit_id', 'patient_id', 'updrs_1', 'kfold', 'updrs_1_cat'])
y_train = train_df['updrs_1_cat']
X_test = test_df.drop(columns=['visit_id', 'patient_id', 'updrs_1', 'kfold', 'updrs_1_cat'])
y_test = test_df['updrs_1_cat']

model = prepare_lgboost_model(lgb_hyperparams_df, 'updrs_1')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

test_df['preds'] = y_pred


In [65]:
model = prepare_lgboost_model(lgb_hyperparams_df, 'updrs_3')
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.7310113546368466,
 'importance_type': 'split',
 'learning_rate': 0.5567614720759515,
 'max_depth': 3,
 'min_child_samples': 20,
 'min_child_weight': 5.512250721546523,
 'min_split_gain': 0.0001967260227772,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 1.4849686714281682,
 'reg_lambda': 3.025201814436393,
 'silent': 'warn',
 'subsample': 0.5565613110107488,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [66]:
test_df['preds'].value_counts()

0    202
1     11
Name: preds, dtype: int64

In [67]:
wrong_preds = test_df[test_df['updrs_1_cat'] != test_df['preds']]
wrong_preds.shape

(41, 1205)

In [68]:
wrong_preds['updrs_1_cat'].value_counts()

1    36
0     5
Name: updrs_1_cat, dtype: int64

## Try the Catboost model

In [69]:
def prepare_catboost_model(cboost_hyperparams_df, target):
    # train the model using the hyperparameters from the hyperparameter tuning
    updrs_hp = cboost_hyperparams_df[target].to_dict()
    updrs_hp['depth'] = int(updrs_hp['depth'])
    updrs_hp['min_data_in_leaf'] = int(updrs_hp['min_data_in_leaf'])
    model = CatBoostClassifier(**updrs_hp)
    return model

In [70]:
cb_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_df, updrs2_df, updrs3_df]):
    model = prepare_catboost_model(cboost_hyperparams_df, updrs)
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    cb_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

0:	learn: 0.5490647	total: 366ms	remaining: 6m 5s
1:	learn: 0.4782074	total: 696ms	remaining: 5m 47s
2:	learn: 0.4375548	total: 1.02s	remaining: 5m 40s
3:	learn: 0.3825585	total: 1.35s	remaining: 5m 37s
4:	learn: 0.3527780	total: 1.69s	remaining: 5m 36s
5:	learn: 0.3361100	total: 2.02s	remaining: 5m 34s
6:	learn: 0.3096034	total: 2.35s	remaining: 5m 33s
7:	learn: 0.2948200	total: 2.69s	remaining: 5m 33s
8:	learn: 0.2653139	total: 3.03s	remaining: 5m 33s
9:	learn: 0.2451644	total: 3.36s	remaining: 5m 32s
10:	learn: 0.2300460	total: 3.69s	remaining: 5m 31s
11:	learn: 0.2137444	total: 4.02s	remaining: 5m 31s
12:	learn: 0.2006707	total: 4.36s	remaining: 5m 30s
13:	learn: 0.1817910	total: 4.68s	remaining: 5m 29s
14:	learn: 0.1790823	total: 5.02s	remaining: 5m 29s
15:	learn: 0.1761048	total: 5.34s	remaining: 5m 28s
16:	learn: 0.1668808	total: 5.68s	remaining: 5m 28s
17:	learn: 0.1551529	total: 6.01s	remaining: 5m 28s
18:	learn: 0.1451144	total: 6.35s	remaining: 5m 27s
19:	learn: 0.1367271	to

21:	learn: 0.1206381	total: 7.33s	remaining: 5m 25s
22:	learn: 0.1187296	total: 7.66s	remaining: 5m 25s
23:	learn: 0.1128837	total: 7.99s	remaining: 5m 25s
24:	learn: 0.0993713	total: 8.33s	remaining: 5m 24s
25:	learn: 0.0924545	total: 8.66s	remaining: 5m 24s
26:	learn: 0.0852478	total: 8.99s	remaining: 5m 23s
27:	learn: 0.0844071	total: 9.32s	remaining: 5m 23s
28:	learn: 0.0779856	total: 9.65s	remaining: 5m 23s
29:	learn: 0.0717202	total: 9.98s	remaining: 5m 22s
30:	learn: 0.0664475	total: 10.3s	remaining: 5m 22s
31:	learn: 0.0626453	total: 10.6s	remaining: 5m 21s
32:	learn: 0.0578912	total: 11s	remaining: 5m 21s
33:	learn: 0.0559568	total: 11.3s	remaining: 5m 21s
34:	learn: 0.0512441	total: 11.6s	remaining: 5m 20s
35:	learn: 0.0483245	total: 12s	remaining: 5m 20s
36:	learn: 0.0464500	total: 12.3s	remaining: 5m 20s
37:	learn: 0.0447889	total: 12.6s	remaining: 5m 19s
38:	learn: 0.0424228	total: 13s	remaining: 5m 19s
39:	learn: 0.0402594	total: 13.3s	remaining: 5m 19s
40:	learn: 0.03863

In [71]:
cb_results

{'updrs_1': {'auc': 0.5707848701863123,
  'acc': 0.8174103812908605,
  'prec': 0.7075901875901875,
  'recall': 0.15921423038553578},
 'updrs_2': {'auc': 0.5854652654452738,
  'acc': 0.8623623360098286,
  'prec': 0.6192207792207792,
  'recall': 0.1917942098469482},
 'updrs_3': {'auc': 0.584152282388305,
  'acc': 0.8327014218009477,
  'prec': 0.5110092521857228,
  'recall': 0.2090492869779797}}

## View XGBoost Results with Forecasting the UPDRS

In [72]:
# get the max category for each patient
max_df = updrs1_df.groupby(['patient_id'])['updrs_1_cat'].max().reset_index()
max_df = max_df.rename(columns={'updrs_1_cat': 'updrs_1_max_cat'})
# merge the max category with the original dataframe
updrs1_df = updrs1_df.merge(max_df, on=['patient_id'], how='left')
# take only the visit months that are 12 or less
updrs1_yr_df = updrs1_df[updrs1_df['visit_month'] <= 12]
updrs1_yr_df = updrs1_yr_df.drop(columns=['updrs_1_cat'])
updrs1_yr_df.rename(columns={'updrs_1_max_cat': 'updrs_1_cat'}, inplace=True)

In [73]:
# get the max category for each patient
max_df = updrs2_df.groupby(['patient_id'])['updrs_2_cat'].max().reset_index()
max_df = max_df.rename(columns={'updrs_2_cat': 'updrs_2_max_cat'})
# merge the max category with the original dataframe
updrs2_df = updrs2_df.merge(max_df, on=['patient_id'], how='left')
# take only the visit months that are 12 or less
updrs2_yr_df = updrs2_df[updrs2_df['visit_month'] <= 12]
updrs2_yr_df = updrs2_yr_df.drop(columns=['updrs_2_cat'])
updrs2_yr_df.rename(columns={'updrs_2_max_cat': 'updrs_2_cat'}, inplace=True)

In [74]:
# get the max category for each patient
max_df = updrs3_df.groupby(['patient_id'])['updrs_3_cat'].max().reset_index()
max_df = max_df.rename(columns={'updrs_3_cat': 'updrs_3_max_cat'})
# merge the max category with the original dataframe
updrs3_df = updrs3_df.merge(max_df, on=['patient_id'], how='left')
# take only the visit months that are 12 or less
updrs3_yr_df = updrs3_df[updrs3_df['visit_month'] <= 12]
updrs3_yr_df = updrs3_yr_df.drop(columns=['updrs_3_cat'])
updrs3_yr_df.rename(columns={'updrs_3_max_cat': 'updrs_3_cat'}, inplace=True)

In [75]:
xgb_forecast_hyperparams_df = pd.read_csv('../data/processed/xgboost_future_cat_hyperparam_results.csv', index_col=0)
lgb_forecast_hyperparams_df = pd.read_csv('../data/processed/lgboost_future_cat_hyperparam_results.csv', index_col=0)
cboost_forecast_hyperparams_df = pd.read_csv('../data/processed/catboost_future_cat_hyperparam_results.csv', index_col=0)

In [76]:
lgb_forecast_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.970757,0.903629,0.686326
learning_rate,0.983567,0.174551,0.910126
max_depth,8.0,2.0,6.0
min_child_weight,1.232586,2.797518,13.3875
min_split_gain,0.007964,0.297876,0.004011
reg_alpha,7.437594,1.543678,4.15971
reg_lambda,7.499285,3.401574,1.926977
subsample,0.728666,0.94891,0.839655


In [77]:
xgb_forecast_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
colsample_bytree,0.999434,0.972336,0.579427
gamma,0.218284,0.10828,7e-05
learning_rate,0.876356,0.65033,0.759742
max_depth,6.0,8.0,8.0
min_child_weight,0.292568,7.076801,0.64285
reg_alpha,6.803953,2.521598,2.426225
reg_lambda,1.268909,1.67716,3.504071
subsample,0.535242,0.682804,0.69348


In [78]:
cboost_hyperparams_df

Unnamed: 0,updrs_1,updrs_2,updrs_3
bagging_temperature,4.429427,4.095758,1.162359
depth,7.0,6.0,4.0
l2_leaf_reg,5.641367,5.757078,7.35671
learning_rate,0.366781,0.954836,0.973876
min_data_in_leaf,8.0,8.0,4.0


In [79]:
xgb_forecast_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_yr_df, updrs2_yr_df, updrs3_yr_df]):
    model = prepare_xgboost_model(xgb_forecast_hyperparams_df, updrs)
    print(f'UPDRS: {updrs}')
    print(f'Hyperparameters: {model.get_params()}')
    print('\n')
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    xgb_forecast_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

UPDRS: updrs_1
Hyperparameters: {'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.9994339895225436, 'gamma': 0.2182840730386759, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': 0.8763564203413519, 'max_delta_step': None, 'max_depth': 6, 'min_child_weight': 0.2925675445213144, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': 6.803952572268468, 'reg_lambda': 1.2689093632909028, 'scale_pos_weight': None, 'subsample': 0.5352418739052822, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}




















UPDRS: updrs_2
Hyperparameters: {'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.9723358597087076, 'gamma': 0.1082797129959026, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': 0.6503304038389104, 'max_delta_step': None, 'max_depth': 8, 'min_child_weight': 7.076800505115973, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': 2.5215977214254304, 'reg_lambda': 1.677159748913182, 'scale_pos_weight': None, 'subsample': 0.6828035084965419, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


















UPDRS: updrs_3
Hyperparameters: {'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.5794273693429098, 'gamma': 6.958373374993177e-05, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': 0.7597417382901697, 'max_delta_step': None, 'max_depth': 8, 'min_child_weight': 0.6428495541542038, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': 2.426225167246236, 'reg_lambda': 3.5040708421969304, 'scale_pos_weight': None, 'subsample': 0.6934802250073431, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}




















In [80]:
xgb_forecast_results

{'updrs_1': {'auc': 0.6493186328197951,
  'acc': 0.6743910764901406,
  'prec': 0.5923254099116168,
  'recall': 0.5364276778093467},
 'updrs_2': {'auc': 0.6548371574182562,
  'acc': 0.7431776556776557,
  'prec': 0.6170812324929973,
  'recall': 0.4286708074534161},
 'updrs_3': {'auc': 0.6416416876381418,
  'acc': 0.6779819471308833,
  'prec': 0.6093808353808353,
  'recall': 0.47740751178520197}}

## LGBoost Future Categorical Predictions

In [81]:
lgb_forecast_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_yr_df, updrs2_yr_df, updrs3_yr_df]):
    model = prepare_lgboost_model(lgb_forecast_hyperparams_df, updrs)
    print(f'UPDRS: {updrs}')
    print(model.get_params())
    print('\n')
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    lgb_forecast_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

UPDRS: updrs_1
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.970757221693757, 'importance_type': 'split', 'learning_rate': 0.9835668264286576, 'max_depth': 8, 'min_child_samples': 20, 'min_child_weight': 1.2325864748203452, 'min_split_gain': 0.007964040585614, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 7.437593735288674, 'reg_lambda': 7.499284502976287, 'silent': 'warn', 'subsample': 0.7286661893753408, 'subsample_for_bin': 200000, 'subsample_freq': 0}


UPDRS: updrs_2
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.9036286590547646, 'importance_type': 'split', 'learning_rate': 0.1745511363640152, 'max_depth': 2, 'min_child_samples': 20, 'min_child_weight': 2.7975180017133723, 'min_split_gain': 0.2978757291694001, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 1.5436778945279928, 'reg_lambda': 3.401574051688944, 'silen

In [82]:
lgb_forecast_results

{'updrs_1': {'auc': 0.59664793015473,
  'acc': 0.6300638403447596,
  'prec': 0.5485747458161251,
  'recall': 0.41962947373344095},
 'updrs_2': {'auc': 0.6251290954000639,
  'acc': 0.7347631535131536,
  'prec': 0.6323177476118652,
  'recall': 0.33857255787690566},
 'updrs_3': {'auc': 0.5969516130707487,
  'acc': 0.6313568096484241,
  'prec': 0.531607724043764,
  'recall': 0.4460611805697889}}

## Compare Catboost Model

In [83]:
cgb_forecast_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_yr_df, updrs2_yr_df, updrs3_yr_df]):
    model = prepare_catboost_model(cboost_forecast_hyperparams_df, updrs)
    print(f'UPDRS: {updrs}')
    print(f'Hyperparameters: {model.get_params()}')
    print('\n')
    auc, acc, prec, recall = cross_fold_validation(df, model, updrs)
    cgb_forecast_results[updrs] = {"auc":auc,
                        "acc":acc,
                        "prec":prec,
                        "recall":recall}

UPDRS: updrs_1
Hyperparameters: {'learning_rate': 0.3667810794764569, 'depth': 7, 'l2_leaf_reg': 5.641366638454, 'bagging_temperature': 4.429427339938372, 'min_data_in_leaf': 8}


0:	learn: 0.5964092	total: 360ms	remaining: 5m 59s
1:	learn: 0.5284874	total: 692ms	remaining: 5m 45s
2:	learn: 0.4746385	total: 1.02s	remaining: 5m 37s
3:	learn: 0.4301078	total: 1.34s	remaining: 5m 34s
4:	learn: 0.3818352	total: 1.67s	remaining: 5m 32s
5:	learn: 0.3581313	total: 2s	remaining: 5m 31s
6:	learn: 0.3207303	total: 2.32s	remaining: 5m 29s
7:	learn: 0.2842559	total: 2.64s	remaining: 5m 27s
8:	learn: 0.2643040	total: 2.96s	remaining: 5m 26s
9:	learn: 0.2496839	total: 3.28s	remaining: 5m 25s
10:	learn: 0.2323576	total: 3.61s	remaining: 5m 24s
11:	learn: 0.2093736	total: 3.94s	remaining: 5m 24s
12:	learn: 0.1869665	total: 4.27s	remaining: 5m 24s
13:	learn: 0.1679286	total: 4.6s	remaining: 5m 24s
14:	learn: 0.1571655	total: 4.93s	remaining: 5m 23s
15:	learn: 0.1523514	total: 5.26s	remaining: 5m 23s
16

In [84]:
cgb_forecast_results

{'updrs_1': {'auc': 0.6875180353730299,
  'acc': 0.7291737868499222,
  'prec': 0.7309163059163059,
  'recall': 0.4883101242882365},
 'updrs_2': {'auc': 0.6656831081016579,
  'acc': 0.7504308191808191,
  'prec': 0.643524896156475,
  'recall': 0.44420327498588363},
 'updrs_3': {'auc': 0.661142439158262,
  'acc': 0.6960405051769257,
  'prec': 0.6438998501498501,
  'recall': 0.503075681492109}}

In [117]:
from sklearn.model_selection import StratifiedKFold

def create_folds(df, target):
    # calculate the number of bins by Sturge's rule
    num_bins = int(np.floor(1 + np.log2(len(df))))
    df.loc[:, "bins"] = pd.cut(df[f'{updrs}_cat'], bins=num_bins, labels=False)

    df = df.dropna().reset_index(drop=True)
        
    # initiate the kfold class from sklearn
    kf = StratifiedKFold(n_splits=5)
        
    # create a kfold column
    df['kfold'] = -1

    # fill the kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df, y=df['bins'].values)):
        df.loc[v_, 'kfold'] = f
            
    # drop the bins column
    df = df.drop('bins', axis=1)
    max_kfold = df['kfold'].max()
    
    print(f'{max_kfold + 1} Kfolds created for {target}_cat')
    return df, max_kfold

In [157]:
def train_catboost(train_df, test_df, updrs):
    features = train_df.drop(f'{updrs}_cat', axis=1)
    target = train_df[f'{updrs}_cat']
    #X_val = val_df.drop(f'{updrs}_cat', axis=1)
    #y_val = val_df[f'{updrs}_cat']

    # Define the CatBoost classifier
    model = CatBoostClassifier(iterations=1000, eval_metric='AUC', random_seed=42)

    # Fit the model on the training data
    model.fit(features, 
              target,
              #eval_set=(X_val, y_val),
                #early_stopping_rounds=50, 
                verbose=100,
                use_best_model=True)

    # Make predictions on the test data
    predictions = model.predict_proba(test_df)[:, 1]

    # Evaluate AUC on the test data
    auc = roc_auc_score(test_df[f'{updrs}_cat'], predictions)
    print("AUC on Test Data:", auc)

    # Save the best iteration of the model
    model.save_model(f'../models/catboost_{updrs}_model.cbm', format='cbm', pool=None)

    return model

In [158]:
# train and save the best catboost model for each updrs
cboost_results = dict()

for updrs, df in zip(['updrs_1', 'updrs_2', 'updrs_3'], [updrs1_yr_df, updrs2_yr_df, updrs3_yr_df]):
    model = prepare_catboost_model(cboost_forecast_hyperparams_df, updrs)
    temp_df, test_kfold = create_folds(df, updrs)
    val_kfold = test_kfold - 1 
    temp_df = temp_df.drop(columns=['visit_id', 'patient_id', f'{updrs}'])
    X_train = temp_df[(temp_df['kfold'] != test_kfold)].reset_index(drop=True)
    X_test = temp_df[temp_df['kfold'] == test_kfold].reset_index(drop=True)
    y_test = X_test[f'{updrs}_cat']
    #X_val = temp_df[temp_df['kfold'] == val_kfold].reset_index(drop=True)
    
    best_model = train_catboost(X_train, X_test, updrs)
    
    #preds = best_model.predict(X_test)
    #test_auc = roc_auc_score(y_test, preds)
    #preds = best_model.predict(X_train.drop(columns=['kfold', f'{updrs}_cat']))
    #train_auc = roc_auc_score(y_train, preds)
    
    #cboost_results[updrs] = {'test_auc':test_auc, 'train_auc':train_auc}



    

5 Kfolds created for updrs_1_cat


You should provide test set for use best model. use_best_model parameter has been switched to false value.


Learning rate set to 0.006723
0:	total: 241ms	remaining: 4m
100:	total: 20.6s	remaining: 3m 2s
200:	total: 39.9s	remaining: 2m 38s
300:	total: 59.1s	remaining: 2m 17s
400:	total: 1m 18s	remaining: 1m 56s
500:	total: 1m 37s	remaining: 1m 37s
600:	total: 1m 56s	remaining: 1m 17s
700:	total: 2m 15s	remaining: 57.9s
800:	total: 2m 35s	remaining: 38.6s
900:	total: 2m 54s	remaining: 19.2s
999:	total: 3m 13s	remaining: 0us
AUC on Test Data: 0.7636363636363637
5 Kfolds created for updrs_2_cat


You should provide test set for use best model. use_best_model parameter has been switched to false value.


Learning rate set to 0.006723
0:	total: 220ms	remaining: 3m 40s
100:	total: 19.5s	remaining: 2m 53s
200:	total: 38.5s	remaining: 2m 32s
300:	total: 57.4s	remaining: 2m 13s
400:	total: 1m 16s	remaining: 1m 54s
500:	total: 1m 35s	remaining: 1m 35s
600:	total: 1m 54s	remaining: 1m 16s
700:	total: 2m 13s	remaining: 57s
800:	total: 2m 32s	remaining: 37.9s
900:	total: 2m 51s	remaining: 18.9s
999:	total: 3m 10s	remaining: 0us
AUC on Test Data: 0.842911877394636
5 Kfolds created for updrs_3_cat


You should provide test set for use best model. use_best_model parameter has been switched to false value.


Learning rate set to 0.006707
0:	total: 231ms	remaining: 3m 50s
100:	total: 19.2s	remaining: 2m 51s
200:	total: 38.2s	remaining: 2m 31s
300:	total: 57.4s	remaining: 2m 13s
400:	total: 1m 16s	remaining: 1m 53s
500:	total: 1m 35s	remaining: 1m 34s
600:	total: 1m 57s	remaining: 1m 17s
700:	total: 2m 16s	remaining: 58.2s
800:	total: 2m 35s	remaining: 38.7s
900:	total: 2m 55s	remaining: 19.3s
999:	total: 3m 14s	remaining: 0us
AUC on Test Data: 0.8596938775510203


In [159]:
# compare the results of the model
test_preds = best_model.predict(X_test)
test_auc = roc_auc_score(y_test, test_preds)
test_auc

0.6482142857142856

## Compare to Logistic Regression Model

In [85]:
model = LogisticRegression()

# run cross fold validation on the updrs 1 data
updrs1_results = cross_fold_validation(updrs1_df, model, 'updrs_1')

model = LogisticRegression()
updrs2_results = cross_fold_validation(updrs2_df, model, 'updrs_2')

model = LogisticRegression()
updrs3_results = cross_fold_validation(updrs3_df, model, 'updrs_3')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [86]:
updrs1_results

(0.511721383287093,
 0.780847703040674,
 0.32282051282051283,
 0.062211088895139766)

In [87]:
updrs2_results

(0.5258909849605227,
 0.8464349962704578,
 0.42333333333333323,
 0.07048988874546684)

In [88]:
updrs3_results

(0.5240749211448716,
 0.8194894035589735,
 0.3485714285714286,
 0.07900396151669495)

## Forecast with Logistic Regression

In [89]:
model = LogisticRegression()

# run cross fold validation on the updrs 1 data
updrs1_forecast_results = cross_fold_validation(updrs1_yr_df, model, 'updrs_1')

model = LogisticRegression()
updrs2_forecast_results = cross_fold_validation(updrs2_yr_df, model, 'updrs_2')

model = LogisticRegression()
updrs3_forecast_results = cross_fold_validation(updrs3_yr_df, model, 'updrs_3')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [90]:
updrs1_forecast_results

(0.5634874999789138,
 0.5994172665293609,
 0.4912421630094044,
 0.3920253253358589)

In [91]:
updrs2_forecast_results

(0.582742150911908, 0.6805340492840493, 0.4779128959276018, 0.3289821193299454)

In [92]:
updrs3_forecast_results

(0.5758538282007636,
 0.6218648310387985,
 0.5047414576826341,
 0.3744481451117032)