PART III

This final Jupyter notebook contains the fine-tuning of the best-performing model: ADABoost Regressor, and a discussion section.

Skip down to Cell 31 for the model training; everything prior I have already demonstrated in the previous notebooks. 

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve, validation_curve


def plot_learning_curve(
    estimator,
    X,
    y,
    title="Learning Curve",
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring=None,
):
    """
    This is a custom modification of the code present here:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    scoring: string, callable or None, optional, default: None
        A string (see model evaluation documentation) or a scorer callable object / function
        with signature scorer(estimator, X, y).
    """

    fig, axes = plt.subplots(1, 1, figsize=(10, 5))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot learning curve
    axes.grid(True)
    axes.fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes.fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes.plot(train_sizes, train_scores_mean, "o-", color="r", label="Training score")
    axes.plot(
        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
    )
    axes.legend(loc="best")
    return fig, axes


def plot_validation_curve(
    estimator,
    X,
    y,
    ylim=None,
    cv=None,
    n_jobs=None,
    param_name=None,
    param_range=None,
    scoring=None,
):
    """
    referred from :
    https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics

    :param estimator: object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    :param X: array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    :param y: array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    :param ylim: tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.
    :param cv: int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    :param n_jobs: int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    :param param_range: array-like, shape (n_values,)
    The values of the parameter that will be evaluated.
    :param param_name: string
    Name of the parameter that will be varied.
    :param scoring: string, callable or None, optional, default: None
        A string (see model evaluation documentation) or a scorer callable object / function
        with signature scorer(estimator, X, y).
    :return: fig


    """
    train_scores, test_scores = validation_curve(
        estimator,
        X,
        y,
        param_name=param_name,
        param_range=param_range,
        scoring=scoring,
        n_jobs=n_jobs,
        cv=cv,
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig, axes = plt.subplots(1, 1, figsize=(10, 5))
    axes.grid(True)
    axes.set_title(f"Validation Curve with {estimator.__class__}")
    axes.set_xlabel(f"{param_name}")
    axes.set_ylabel(f"{scoring}")
    if ylim is not None:
        axes.set_ylim(*ylim)

    lw = 2
    axes.plot(
        param_range,
        train_scores_mean,
        label="Training score",
        color="darkorange",
        lw=lw,
    )
    axes.fill_between(
        param_range,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.2,
        color="darkorange",
        lw=lw,
    )
    axes.plot(
        param_range,
        test_scores_mean,
        label="Cross-validation score",
        color="navy",
        lw=lw,
    )
    axes.fill_between(
        param_range,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.2,
        color="navy",
        lw=lw,
    )
    axes.legend(loc="best")
    return fig, axes


In [2]:
import pandas as pd
import seaborn as sns
sns.set()

from pathlib import Path

# classifiers we will use
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost

#imputers
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

# model selection bits
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, ParameterGrid, ParameterSampler
#from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, GroupShuffleSplit, GroupKFold, StratifiedKFold
from sklearn.model_selection import learning_curve, validation_curve

# evaluation
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error


import scipy

Let's load all the data. 

In [3]:
# Load the competition datasets into Pandas DataFrame
path = Path("/Users/13392/Documents/amp-parkinsons-disease-progression-prediction")
proteins = pd.read_csv(path/"train_proteins.csv")
peptides = pd.read_csv(path/"train_peptides.csv")
clinical = pd.read_csv(path/"train_clinical_data.csv")
supplemental = pd.read_csv(path/"supplemental_clinical_data.csv")

As discussed previously, we are dropping the entire "medication status" column, because:
1) Over 50% values are NaN. 
2) the test dataset will not have this data. 

In [4]:
# drop the "medication status" column (due to over 50% NaN values), keep a copy of the original for later access. 
clinical_copy = clinical.copy()

clinical.drop('upd23b_clinical_state_on_medication', axis=1, inplace=True)

In [5]:
targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
ids = ['patient_id', 'visit_id']
month = ['visit_month']

Let's see how much remains of 'NaN'.

In [6]:
print(f'NaN value count:\n{clinical.isna().sum()}')
clinical

NaN value count:
visit_id          0
patient_id        0
visit_month       0
updrs_1           1
updrs_2           2
updrs_3          25
updrs_4        1038
dtype: int64


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,10.0,6.0,15.0,
1,55_3,55,3,10.0,7.0,25.0,
2,55_6,55,6,8.0,10.0,34.0,
3,55_9,55,9,8.0,9.0,30.0,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0
...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0
2612,65043_60,65043,60,6.0,6.0,16.0,1.0
2613,65043_72,65043,72,3.0,9.0,14.0,1.0


Significant, but manageable with some kind of imputation. Let's count the number of visits each patient has on record. 

In [7]:
cols = ['patient_id', 'num_entries']
patient_list = clinical.patient_id.unique()
n_list = []
p_list = []

for patient in patient_list:
    n=len(clinical[clinical.patient_id==patient].index)

    n_list.append(n)
    p_list.append(patient)

df_visits_by_patient = pd.DataFrame(list(zip(p_list, n_list)), columns=cols)

df_visits_by_patient


Unnamed: 0,patient_id,num_entries
0,55,13
1,942,15
2,1517,10
3,1923,7
4,2660,6
...,...,...
243,63875,9
244,63889,10
245,64669,15
246,64674,16


NaN values in the "Proteins"  and "Peptides" datasets. 

In [8]:
print(f'NaN value count:\n{proteins.isna().sum()}')
proteins

NaN value count:
visit_id       0
visit_month    0
patient_id     0
UniProt        0
NPX            0
dtype: int64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0
...,...,...,...,...,...
232736,58648_108,108,58648,Q9UBX5,27387.8
232737,58648_108,108,58648,Q9UHG2,369437.0
232738,58648_108,108,58648,Q9UKV8,105830.0
232739,58648_108,108,58648,Q9Y646,21257.6


In [9]:
print(f'NaN value count:\n{peptides.isna().sum()}')

peptides

NaN value count:
visit_id            0
visit_month         0
patient_id          0
UniProt             0
Peptide             0
PeptideAbundance    0
dtype: int64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.30
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.00
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.00
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.70
...,...,...,...,...,...,...
981829,58648_108,108,58648,Q9UHG2,ILAGSADSEGVAAPR,202820.00
981830,58648_108,108,58648,Q9UKV8,SGNIPAGTTVDTK,105830.00
981831,58648_108,108,58648,Q9Y646,LALLVDTVGPR,21257.60
981832,58648_108,108,58648,Q9Y6R7,AGC(UniMod_4)VAESTAVC(UniMod_4)R,5127.26


Great! No NaN values at all in them. Let's find out how many of each patient's visits have protein/peptide data. 

In [10]:
cols = ['patient_id', 'num_entries_protein']
patient_list = proteins.patient_id.unique()
n_list = []
p_list = []

for patient in patient_list:
    n=len(proteins[proteins.patient_id==patient].visit_id.unique())

    n_list.append(n)
    p_list.append(patient)

df_recorded_visits_protein = pd.DataFrame(list(zip(p_list, n_list)), columns=cols)

df_recorded_visits_protein


Unnamed: 0,patient_id,num_entries_protein
0,55,4
1,1517,4
2,1923,3
3,2660,5
4,3636,3
...,...,...
243,52998,3
244,54979,3
245,58597,3
246,7508,3


In [11]:
cols = ['patient_id', 'num_entries_peptide']
patient_list = peptides.patient_id.unique()
n_list = []
p_list = []

for patient in patient_list:
    n=len(peptides[peptides.patient_id==patient].visit_id.unique())

    n_list.append(n)
    p_list.append(patient)

df_recorded_visits_peptide = pd.DataFrame(list(zip(p_list, n_list)), columns=cols)

df_recorded_visits_peptide

Unnamed: 0,patient_id,num_entries_peptide
0,55,4
1,1517,4
2,1923,3
3,2660,5
4,3636,3
...,...,...
243,52998,3
244,54979,3
245,58597,3
246,7508,3


In [12]:
df = pd.merge(df_recorded_visits_protein, df_recorded_visits_peptide, on='patient_id', how='left')
df = pd.merge(df_visits_by_patient, df, on='patient_id', how='left')
df.head(10)

Unnamed: 0,patient_id,num_entries,num_entries_protein,num_entries_peptide
0,55,13,4,4
1,942,15,4,4
2,1517,10,4,4
3,1923,7,3,3
4,2660,6,5,5
5,3636,14,3,3
6,3863,9,5,5
7,4161,12,6,6
8,4172,8,7,7
9,4923,11,5,5


It's not looking great - we can clearly see that, for most of the patients, only 1/2 to 1/3 of the visits contain protein and peptide records.

Something to think about for later on...

Now we will "pivot" the datasets so the unique coding for each protein/peptide becomes a feature for the models to learn on. 

In [13]:
df_proteins = proteins.pivot(index=['patient_id', 'visit_month', 'visit_id'], columns='UniProt', values='NPX').rename_axis(columns=None).reset_index()

df_peptides = peptides.pivot(index=['patient_id', 'visit_month', 'visit_id'], columns='Peptide', values='PeptideAbundance').rename_axis(columns=None).reset_index()


In [14]:
df_proteins

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,177775.0,...,365475.0,35528.00,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,165638.0,...,405676.0,30332.60,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,151073.0,...,303953.0,43026.20,114921.0,21860.1,61598.2,318553.0,65762.6,29193.4,28536.1,19290.9
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,101056.0,...,303597.0,48188.40,109794.0,23930.6,70223.5,377550.0,74976.1,31732.6,22186.5,21717.1
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,253373.0,27431.80,93796.7,17450.9,21299.1,306621.0,82335.5,24018.7,18939.5,15251.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,,190487.0,24907.9,18543.1,10124.90,2308.71,62095.4,...,260021.0,7139.93,104277.0,10500.0,21944.2,136725.0,62217.5,,10287.7,13848.2
1109,65043,0,65043_0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,266339.0,...,186414.0,25897.80,,21480.7,57364.0,416142.0,37584.6,,28346.5,35617.5
1110,65043,12,65043_12,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,270575.0,...,301343.0,22343.40,105626.0,20500.8,54011.2,380072.0,40588.9,,17035.7,37064.2
1111,65043,24,65043_24,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,221358.0,...,300439.0,52143.60,139291.0,19449.2,66569.9,300948.0,36150.4,,21286.3,39587.9


In [15]:
df_peptides

Unnamed: 0,patient_id,visit_month,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,8984260.0,53855.6,8579740.0,,19735.4,114400.0,46371.1,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,8279770.0,45251.9,8655890.0,49927.5,23820.4,90539.4,38652.4,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,8382390.0,53000.9,8995640.0,45519.2,17813.5,147312.0,45840.9,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,10671500.0,58108.4,9985420.0,52374.0,19373.3,64356.1,49793.2,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,6177730.0,42682.6,3596660.0,25698.8,17130.6,86471.5,41007.9,...,226314.0,6399.80,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,7083630.0,35656.1,6273100.0,,,15479.2,,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,65043,0,65043_0,7818630.0,95033.0,5119260.0,57483.7,11610.0,270739.0,42527.3,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,65043,12,65043_12,8070390.0,76532.7,8233520.0,54260.6,11631.9,230169.0,42255.5,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,65043,24,65043_24,7608150.0,75401.6,9168030.0,,13313.9,220202.0,46914.1,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


Because not all 1200+ proteins and peptides are measured at each recorded visit, some NaN values should be now expected. 

In [16]:
df_proteins.isna().sum().sort_values(ascending=False)

Q99829        624
Q99832        507
Q562R1        497
P01780        459
Q6UX71        452
             ... 
P02766          0
P02765          0
P02751          0
P02749          0
patient_id      0
Length: 230, dtype: int64

In [17]:
df_peptides.isna().sum().sort_values(ascending=False)

QALPQVR                   624
EPQVYTLPPSRDELTK          550
TPSGLYLGTC(UniMod_4)ER    523
SLEDQVEMLR                514
HYEGSTVPEK                508
                         ... 
visit_id                    0
IPTTFENGR                   0
AIGYLNTGYQR                 0
NILTSNNIDVK                 0
patient_id                  0
Length: 971, dtype: int64

We are going to combine the protein and peptide data, and check once again the status of NaN values. They should remain unchanged because we haven't done anything with them.

In [18]:
prot_pept_df = pd.merge(df_proteins, df_peptides, on=['patient_id','visit_month','visit_id'], how='left')
prot_pept_df

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,177775.0,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,165638.0,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,151073.0,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,101056.0,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,226314.0,6399.80,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,,190487.0,24907.9,18543.1,10124.90,2308.71,62095.4,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,65043,0,65043_0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,266339.0,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,65043,12,65043_12,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,270575.0,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,65043,24,65043_24,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,221358.0,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


In [19]:
prot_pept_df.isna().sum().sort_values(ascending=False)

Q99829                    624
QALPQVR                   624
EPQVYTLPPSRDELTK          550
TPSGLYLGTC(UniMod_4)ER    523
SLEDQVEMLR                514
                         ... 
P41222                      0
P02774                      0
P02787                      0
P02790                      0
patient_id                  0
Length: 1198, dtype: int64

In [20]:
patient_list=prot_pept_df['patient_id'].unique()
patient_list

array([   55,   942,  1517,  1923,  2660,  3636,  3863,  4161,  4172,
        4923,  5027,  5036,  5178,  5645,  5742,  6054,  6211,  6420,
        7051,  7117,  7151,  7265,  7508,  7568,  7832,  7886,  8344,
        8699, 10053, 10138, 10174, 10541, 10715, 10718, 11459, 11686,
       11928, 12516, 12636, 12703, 12755, 12931, 13360, 13368, 13618,
       13804, 13852, 13968, 14035, 14124, 14242, 14270, 14344, 14450,
       14811, 15009, 15245, 15504, 15590, 16238, 16347, 16566, 16574,
       16778, 16931, 17154, 17201, 17414, 17727, 18183, 18204, 18553,
       18560, 19088, 20212, 20216, 20352, 20404, 20460, 20581, 20664,
       20707, 20791, 20792, 21126, 21537, 21729, 22126, 22623, 23175,
       23192, 23244, 23391, 23636, 24278, 24690, 24818, 24820, 24911,
       25562, 25739, 25750, 25827, 25911, 26005, 26104, 26210, 26809,
       27079, 27300, 27464, 27468, 27607, 27715, 27872, 27893, 27971,
       27987, 28327, 28342, 28818, 29313, 29417, 30119, 30155, 30416,
       30894, 30951,

Here there are two ways to impute the data:
1) combine "clinical" and "prot_pept_df" first then impute, or
2) first impute them separately, then combine. 

We'll do both and see how the outcomes differ.

I am choosing to use sklearn's KNN imputer because, with a bit of clever coding, I can use all the availble data for a given patient to impute missing values. 

In [21]:
# Method 1: Combine first, impute next. 
big_data = pd.merge(clinical, prot_pept_df, on=['patient_id','visit_month','visit_id'], how='left')
big_data.head(30)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,55,0,10.0,6.0,15.0,,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55_3,55,3,10.0,7.0,25.0,,,,,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,55_9,55,9,8.0,9.0,30.0,0.0,,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
5,55_18,55,18,7.0,13.0,38.0,0.0,,,,...,,,,,,,,,,
6,55_24,55,24,16.0,9.0,49.0,0.0,,,,...,,,,,,,,,,
7,55_30,55,30,14.0,13.0,49.0,0.0,,,,...,,,,,,,,,,
8,55_36,55,36,17.0,18.0,51.0,0.0,13530.8,753832.0,43048.9,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
9,55_42,55,42,12.0,20.0,41.0,0.0,,,,...,,,,,,,,,,


In [22]:
big_data

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,55,0,10.0,6.0,15.0,,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55_3,55,3,10.0,7.0,25.0,,,,,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,55_9,55,9,8.0,9.0,30.0,0.0,,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,10589.6,902434.0,44890.8,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,,,,...,,,,,,,,,,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,,,,...,,,,,,,,,,
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,,,,...,,,,,,,,,,


In [23]:
big_data.isna().mean().sort_values()

visit_id                  0.000000
patient_id                0.000000
visit_month               0.000000
updrs_1                   0.000382
updrs_2                   0.000765
                            ...   
SLEDQVEMLR                0.781644
TPSGLYLGTC(UniMod_4)ER    0.782409
EPQVYTLPPSRDELTK          0.793881
QALPQVR                   0.821415
Q99829                    0.821415
Length: 1202, dtype: float64

In [24]:
big_data_1 = big_data.loc[:, big_data.isna().mean() < 0.7]
big_data_1

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00533,O00584,O14498,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,55,0,10.0,6.0,15.0,,732430.0,39585.8,41526.9,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55_3,55,3,10.0,7.0,25.0,,,,,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,630465.0,35220.8,41295.0,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,55_9,55,9,8.0,9.0,30.0,0.0,,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,815083.0,41650.9,39763.3,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,902434.0,44890.8,38771.5,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,,,,...,,,,,,,,,,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,,,,...,,,,,,,,,,
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,,,,...,,,,,,,,,,


In [25]:
#imputing
big_data_patient_list = big_data_1.patient_id.unique()
data_imputed_list = []
for patient_id in big_data_patient_list:
    masked_data = big_data[big_data['patient_id']==patient_id]
    num_rows = len(masked_data.index)
    knn = KNNImputer(missing_values=np.nan, keep_empty_features=True, n_neighbors=num_rows-1, weights='distance')
    X_knn = knn.fit_transform(masked_data)
    X_knn_df = pd.DataFrame(X_knn, columns = big_data.columns)
    data_imputed_list.append(X_knn_df)

big_data_imputed_1 = pd.concat(data_imputed_list, ignore_index=True)

In [26]:
big_data_imputed_1.head(30)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,550.0,55.0,0.0,10.0,6.0,15.0,0.0,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,553.0,55.0,3.0,10.0,7.0,25.0,0.0,12334.785768,675214.704118,37137.66053,...,184262.267618,14644.342049,3984725.0,110536.418287,543921.541924,115616.062135,153965.859351,448964.0626,42796.419382,18169.287093
2,556.0,55.0,6.0,8.0,10.0,34.0,0.0,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,559.0,55.0,9.0,8.0,9.0,30.0,0.0,12862.372461,646873.277154,35923.996315,...,175907.727634,13728.866925,4070366.0,112340.558877,525569.281503,107942.05763,148048.968011,454649.313064,41058.34615,19778.422756
4,5512.0,55.0,12.0,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
5,5518.0,55.0,18.0,7.0,13.0,38.0,0.0,14986.051248,805405.447723,41858.782472,...,224527.24418,17977.316686,5037707.0,112368.453483,706490.758504,135621.228578,185093.414028,459390.594547,54406.710646,20621.44696
6,5524.0,55.0,24.0,16.0,9.0,49.0,0.0,14425.806933,785489.93529,42300.996496,...,209508.09544,18199.369402,4130422.0,104209.424406,695717.193668,132903.900331,192093.918346,474288.047929,53765.988135,18085.590942
7,5530.0,55.0,30.0,14.0,13.0,49.0,0.0,14050.064768,772176.09698,42608.884997,...,199373.871407,18355.493866,3516106.0,98672.385379,688670.794502,131089.66778,196898.785238,484425.00637,53349.986393,16368.275124
8,5536.0,55.0,36.0,17.0,18.0,51.0,0.0,13530.8,753832.0,43048.9,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
9,5542.0,55.0,42.0,12.0,20.0,41.0,0.0,13968.772329,769262.341576,42666.746983,...,197228.716914,18383.700115,3387703.0,97524.725463,687007.63388,130690.799535,197853.349919,486505.470464,53249.369332,16009.561976


In [27]:
big_data_imputed_1 = big_data_imputed_1.astype(int)

In [28]:
#relabel patient id, visit id, and visit month.
ls = [ids, month]
for col in ls:
    big_data_imputed_1[col] = big_data_1[col]

In [29]:
big_data_imputed_1

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,55,0,10,6,15,0,11254,732430,39585,...,201158,16492,3810270,106894,580667,131155,165851,437305,46289,14898
1,55_3,55,3,10,7,25,0,12334,675214,37137,...,184262,14644,3984725,110536,543921,115616,153965,448964,42796,18169
2,55_6,55,6,8,10,34,0,13163,630465,35220,...,171079,13198,4119520,113385,514861,103512,144607,457891,40047,20703
3,55_9,55,9,8,9,30,0,12862,646873,35923,...,175907,13728,4070366,112340,525569,107942,148048,454649,41058,19778
4,55_12,55,12,10,10,41,0,15257,815083,41650,...,231772,17873,5474140,116286,711815,136943,181763,452253,54725,21841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7,6,13,0,10589,902434,44890,...,233567,14478,3185530,48793,501159,133992,170146,359045,45780,17370
2611,65043_54,65043,54,4,8,11,1,11577,934826,43335,...,235782,15286,3070952,48562,519938,138924,172444,360055,48541,18052
2612,65043_60,65043,60,6,6,16,1,11964,947219,42647,...,236561,15597,3023890,48444,527081,140916,173208,360198,49539,18312
2613,65043_72,65043,72,3,9,14,1,12369,959960,41847,...,237295,15917,2972324,48294,534382,143065,173880,360103,50508,18579


In [30]:
def smape_score(actual, predicted):
    sum = 0
    for a, p in zip(actual, predicted):
        if a==0 and p==0:
            pass
        else:
            sum += (np.abs(p-a))/(np.abs(p)+np.abs(a))*2
    return sum/len(actual)*100

My previous tests showed that ADABoost Regressor performed equally well in predicting UPDRS_1, _2, _3 scores and significantly better in predicting UPDRS_4 scores. This eliminates the need to have a different model for each score, and simplifies my model training. 

The parameters I am fine-tuning for are "n_estimators" (number of trees), "max_depth", and "learning_rating". Learning Rate is referring to how much the model emphasizes the "mistakes" it makes in the process of training - a h as we can imagine, a higher learning rate would produce a model that trains more specifically on the training data, but it could potentially lead to overfitting. The default learning rate is 1. 

In [31]:

params = {'n_estimators': [100,250,500,750,1000],
                 'max_depth': np.arange(20,50,5), 
                 'learning_rate': np.arange(0.1,3,0.2)}

random_params = list(ParameterSampler(params, n_iter=5, random_state=2))

random_params


[{'n_estimators': 100, 'max_depth': 35, 'learning_rate': 2.1000000000000005},
 {'n_estimators': 500, 'max_depth': 40, 'learning_rate': 2.7000000000000006},
 {'n_estimators': 750, 'max_depth': 45, 'learning_rate': 2.3000000000000007},
 {'n_estimators': 500, 'max_depth': 40, 'learning_rate': 0.9000000000000001},
 {'n_estimators': 250, 'max_depth': 30, 'learning_rate': 1.7000000000000004}]

I'll first save these pseudo randomly generated parameters in a dataframe, for later referencing.

In [32]:
params_df = pd.DataFrame(random_params)
params_df

Unnamed: 0,n_estimators,max_depth,learning_rate
0,100,35,2.1
1,500,40,2.7
2,750,45,2.3
3,500,40,0.9
4,250,30,1.7


In [33]:
#ADABOOST REGRESSOR with decision tree regressor, random parameters, 5 iterations.

X = big_data_imputed_1.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_imputed_1[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    for params in random_params:
        #fitting and testing
        dtr = DecisionTreeRegressor(max_depth=params['max_depth'])
        ada = AdaBoostRegressor(dtr, n_estimators=params['n_estimators'], learning_rate=params['learning_rate'])
        ada.fit(X_train,y_train)    
        y_predict = ada.predict(X_test)

        #scoring
        mae = mean_absolute_error(y_test, y_predict)
        smape = smape_score(y_test, y_predict)
        print(f'Parameter: {params}; MAE: {mae}; SMAPE: {smape}')

        #saving scores in their lists
        mae_list_ada.append(mae)
        smape_list_ada.append(smape)

        max_depth = list()
        for tree in ada.estimators_:
            max_depth.append(tree.tree_.max_depth)
        print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
        print(f"max depth {max(max_depth)}")
        print(f"min depth {min(max_depth)}")
        print()
        actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 2353 samples in training, 262 samples in testing.
Parameter: {'n_estimators': 100, 'max_depth': 35, 'learning_rate': 2.1000000000000005}; MAE: 2.5572519083969465; SMAPE: 48.57136440099941
avg max depth 16.0
max depth 29
min depth 11

Parameter: {'n_estimators': 500, 'max_depth': 40, 'learning_rate': 2.7000000000000006}; MAE: 4.347328244274809; SMAPE: 79.70248768144883
avg max depth 1.4
max depth 33
min depth 0

Parameter: {'n_estimators': 750, 'max_depth': 45, 'learning_rate': 2.3000000000000007}; MAE: 3.5572519083969465; SMAPE: 63.76967661797991
avg max depth 6.3
max depth 23
min depth 0

Parameter: {'n_estimators': 500, 'max_depth': 40, 'learning_rate': 0.9000000000000001}; MAE: 2.366412213740458; SMAPE: 47.84594249999242
avg max depth 24.1
max depth 34
min depth 18

Parameter: {'n_estimators': 250, 'max_depth': 30, 'learning_rate': 1.7000000000000004}; MAE: 2.381679389312977; SMAPE: 46.971838996574476
avg max depth 20.6
max depth 30
min depth 16


updrs_2: 2353 samples in t

In [34]:
ada_df = pd.concat([params_df,params_df,params_df,params_df], ignore_index=True)
ada_df['actual depth reached'] = actual_depth_ada
ada_df['MAE'] = mae_list_ada
ada_df['SMAPE'] = smape_list_ada
ada_df['UPDRS'] = ""
ada_df.loc[:4, 'UPDRS']=1
ada_df.loc[5:9, 'UPDRS']=2
ada_df.loc[10:14, 'UPDRS']=3
ada_df.loc[15:, 'UPDRS']=4

ada_df

Unnamed: 0,n_estimators,max_depth,learning_rate,actual depth reached,MAE,SMAPE,UPDRS
0,100,35,2.1,29,2.557252,48.571364,1
1,500,40,2.7,33,4.347328,79.702488,1
2,750,45,2.3,23,3.557252,63.769677,1
3,500,40,0.9,34,2.366412,47.845942,1
4,250,30,1.7,30,2.381679,46.971839,1
5,100,35,2.1,24,2.435115,62.096859,2
6,500,40,2.7,27,4.652672,98.221842,2
7,750,45,2.3,27,3.576336,85.804198,2
8,500,40,0.9,35,2.148855,47.129512,2
9,250,30,1.7,30,2.312977,54.808246,2


From prior tests, I already know that having more trees yields better results. Here I see that having 500 trees is sufficient, and having more (750) didn't make the scores better. Additionally, the actual maximum depth reached is 45, but that didn't necessarily produce the best results either. On the contrary, some iterations with fewer trees and less depth did better. There seems to be very little difference in scores beyond 500 trees and a max depth of 35-40, except that the models take significantly more time to train with more trees and more depth.

The only other variable here is Learning Rate, as the results are ambiguous to suggest an optimal rate, so I am going to look further into that. 

I will set n_estimators to 500, max_depth to 40, and vary the learning rate between 0.1 and 100. Again, the default is 1.  

In [35]:
learning_rates = [0.1,0.5,1,5,10,20,50,100]

In [37]:
# ADABOOST REGRESSOR with decision tree regressor, testing for learning rate. 
# n_estimators = 500, max_depth = 40

X = big_data_imputed_1.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_imputed_1[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    for learning_rate in learning_rates:
        #fitting and testing
        dtr = DecisionTreeRegressor(max_depth=40)
        ada = AdaBoostRegressor(dtr, n_estimators=500, learning_rate=learning_rate)
        ada.fit(X_train,y_train)    
        y_predict = ada.predict(X_test)

        #scoring
        mae = mean_absolute_error(y_test, y_predict)
        smape = smape_score(y_test, y_predict)
        print(f'Learning Rate: {learning_rate}; MAE: {mae}; SMAPE: {smape}')

        #saving scores in their lists
        mae_list_ada.append(mae)
        smape_list_ada.append(smape)

        max_depth = list()
        for tree in ada.estimators_:
            max_depth.append(tree.tree_.max_depth)
        print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
        print(f"max depth {max(max_depth)}")
        print(f"min depth {min(max_depth)}")
        print()
        actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 2353 samples in training, 262 samples in testing.
Learning Rate: 0.1; MAE: 2.3587786259541983; SMAPE: 48.1939394952882
avg max depth 25.6
max depth 36
min depth 20

Learning Rate: 0.5; MAE: 2.3625954198473282; SMAPE: 47.68574644770978
avg max depth 24.9
max depth 36
min depth 18

Learning Rate: 1; MAE: 2.312977099236641; SMAPE: 47.595727505066314
avg max depth 24.1
max depth 35
min depth 18

Learning Rate: 5; MAE: 4.187022900763359; SMAPE: 71.56875189344737
avg max depth 4.9
max depth 40
min depth 0

Learning Rate: 10; MAE: 23.98854961832061; SMAPE: 132.2623721905669
avg max depth 11.9
max depth 40
min depth 0

Learning Rate: 20; MAE: 25.98854961832061; SMAPE: 135.4376226609052
avg max depth 0.1
max depth 31
min depth 0

Learning Rate: 50; MAE: 7.011450381679389; SMAPE: 187.78625954198475
avg max depth 0.1
max depth 28
min depth 0

Learning Rate: 100; MAE: 7.011450381679389; SMAPE: 187.78625954198475
avg max depth 0.1
max depth 29
min depth 0


updrs_2: 2353 samples in trainin

In [38]:
learning_rates

[0.1, 0.5, 1, 5, 10, 20, 50, 100]

In [39]:
learning_rate_df = pd.DataFrame()
learning_rate_df['Learning Rate'] = learning_rates
learning_rate_df

Unnamed: 0,Learning Rate
0,0.1
1,0.5
2,1.0
3,5.0
4,10.0
5,20.0
6,50.0
7,100.0


In [40]:

ada_df1 = pd.concat([learning_rate_df, learning_rate_df, learning_rate_df, learning_rate_df], ignore_index=True)
ada_df1['n_estimators'] = 500
ada_df1['Max Depth'] = 40
ada_df1['actual depth reached'] = actual_depth_ada
ada_df1['MAE'] = mae_list_ada
ada_df1['SMAPE'] = smape_list_ada
ada_df1['UPDRS'] = ""
ada_df1.loc[:7, 'UPDRS']=1
ada_df1.loc[8:15, 'UPDRS']=2
ada_df1.loc[16:23, 'UPDRS']=3
ada_df1.loc[24:, 'UPDRS']=4


ada_df1

Unnamed: 0,Learning Rate,n_estimators,Max Depth,actual depth reached,MAE,SMAPE,UPDRS
0,0.1,500,40,36,2.358779,48.193939,1
1,0.5,500,40,36,2.362595,47.685746,1
2,1.0,500,40,35,2.312977,47.595728,1
3,5.0,500,40,40,4.187023,71.568752,1
4,10.0,500,40,40,23.98855,132.262372,1
5,20.0,500,40,31,25.98855,135.437623,1
6,50.0,500,40,28,7.01145,187.78626,1
7,100.0,500,40,29,7.01145,187.78626,1
8,0.1,500,40,40,2.114504,43.821501,2
9,0.5,500,40,37,2.10687,45.882301,2


Somewhat surprisingly, for all UPDRS scores, a higher than default learning rate (1) leads to much worse MAE and SMAPE scores. This is actually very good news, because a high learning rate tends to teach a model to overfit. By the same logic, a low learning rate that leads to good scores means the model will more likely do better with independent testing data. I also verified that a max depth of 40 is sufficient, because in the highest scoring parameters, only once did any decision tree reached the depth of 40. This can be verified if needed, but I think there is sufficient evidence to show that there is no need to go beyond max_depth of 40. 

I will now verify if there is significant improvement in the scores with a lot of trees, by setting max depth to 40 and learning rate to 0.1. Previously we have seen some evidence that there is little additional gain after about 500 trees, but there were multiple variables. This time the only parameter we are iterating through is n_estimators. 

In [41]:
n_trees = [250,500,750,1000]
n_trees_df = pd.DataFrame()
n_trees_df['n_estimators'] = n_trees
n_trees_df

Unnamed: 0,n_estimators
0,250
1,500
2,750
3,1000


In [42]:
# ADABOOST REGRESSOR with decision tree regressor, testing for n_estimators. 
# max_depth=40, learning_rate=0.1
X = big_data_imputed_1.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_imputed_1[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    for n_estimators in n_trees:
        #fitting and testing
        dtr = DecisionTreeRegressor(max_depth=40)
        ada = AdaBoostRegressor(dtr, n_estimators=n_estimators, learning_rate=0.1)
        ada.fit(X_train,y_train)    
        y_predict = ada.predict(X_test)

        #scoring
        mae = mean_absolute_error(y_test, y_predict)
        smape = smape_score(y_test, y_predict)
        print(f'n_estimators: {n_estimators}; MAE: {mae}; SMAPE: {smape}')

        #saving scores in their lists
        mae_list_ada.append(mae)
        smape_list_ada.append(smape)

        max_depth = list()
        for tree in ada.estimators_:
            max_depth.append(tree.tree_.max_depth)
        print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
        print(f"max depth {max(max_depth)}")
        print(f"min depth {min(max_depth)}")
        print()
        actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 2353 samples in training, 262 samples in testing.
n_estimators: 250; MAE: 2.366412213740458; SMAPE: 47.50414941028742
avg max depth 25.6
max depth 39
min depth 18

n_estimators: 500; MAE: 2.3435114503816794; SMAPE: 47.689768716552855
avg max depth 25.7
max depth 40
min depth 18

n_estimators: 750; MAE: 2.366412213740458; SMAPE: 47.06648363109143
avg max depth 25.4
max depth 40
min depth 19

n_estimators: 1000; MAE: 2.3702290076335877; SMAPE: 48.14584028485007
avg max depth 25.5
max depth 36
min depth 17


updrs_2: 2353 samples in training, 262 samples in testing.
n_estimators: 250; MAE: 2.1374045801526718; SMAPE: 43.67143765094306
avg max depth 23.9
max depth 40
min depth 18

n_estimators: 500; MAE: 2.1374045801526718; SMAPE: 45.048447029879604
avg max depth 24.1
max depth 40
min depth 18

n_estimators: 750; MAE: 2.1259541984732824; SMAPE: 44.53924435782508
avg max depth 23.9
max depth 36
min depth 18

n_estimators: 1000; MAE: 2.114503816793893; SMAPE: 43.89757008804988
avg ma

In [43]:
ada_df2 = pd.concat([n_trees_df, n_trees_df, n_trees_df, n_trees_df], ignore_index=True)
ada_df2['Learning rate'] = 0.1
ada_df2['Max Depth'] = 40
ada_df2['actual depth reached'] = actual_depth_ada
ada_df2['MAE'] = mae_list_ada
ada_df2['SMAPE'] = smape_list_ada
ada_df2['UPDRS'] = ""
ada_df2.loc[:3, 'UPDRS']=1
ada_df2.loc[4:7, 'UPDRS']=2
ada_df2.loc[8:11, 'UPDRS']=3
ada_df2.loc[12:, 'UPDRS']=4


ada_df2

Unnamed: 0,n_estimators,Learning rate,Max Depth,actual depth reached,MAE,SMAPE,UPDRS
0,250,0.1,40,39,2.366412,47.504149,1
1,500,0.1,40,40,2.343511,47.689769,1
2,750,0.1,40,40,2.366412,47.066484,1
3,1000,0.1,40,36,2.370229,48.14584,1
4,250,0.1,40,40,2.137405,43.671438,2
5,500,0.1,40,40,2.137405,45.048447,2
6,750,0.1,40,36,2.125954,44.539244,2
7,1000,0.1,40,35,2.114504,43.89757,2
8,250,0.1,40,32,4.801527,47.447415,3
9,500,0.1,40,33,4.877863,47.645556,3


We now have a good idea for 2 out of the three parameters: n_estimators = 750 and max_depth = 40 yield good results. I'm going to narrow down my search for an optimal learning rate. For the sake of speed, I am going to use 200 trees and a max depth of 30, but that should not affect how the model behaves differently depending on the one variable: learning rate. I also set the training size to be 80% of the total dataset, again to speed things up a little.

In [66]:
learning_rates = [0.1,0.2,0.3,0.5,0.75,1,1.5]

In [68]:
# ADABOOST REGRESSOR with decision tree regressor, testing for learning rate. 
# n_estimators = 750, max_depth = 40

X = big_data_imputed_1.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_imputed_1[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    for learning_rate in learning_rates:
        #fitting and testing
        dtr = DecisionTreeRegressor(max_depth=30)
        ada = AdaBoostRegressor(dtr, n_estimators=200, learning_rate=learning_rate)
        ada.fit(X_train,y_train)    
        y_predict = ada.predict(X_test)

        #scoring
        mae = mean_absolute_error(y_test, y_predict)
        smape = smape_score(y_test, y_predict)
        print(f'Learning Rate: {learning_rate}; MAE: {mae}; SMAPE: {smape}')

        #saving scores in their lists
        mae_list_ada.append(mae)
        smape_list_ada.append(smape)

        max_depth = list()
        for tree in ada.estimators_:
            max_depth.append(tree.tree_.max_depth)
        print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
        print(f"max depth {max(max_depth)}")
        print(f"min depth {min(max_depth)}")
        print()
        actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 2092 samples in training, 523 samples in testing.
Learning Rate: 0.1; MAE: 2.4684512428298278; SMAPE: 48.678829321586
avg max depth 24.6
max depth 30
min depth 17

Learning Rate: 0.2; MAE: 2.5124282982791586; SMAPE: 50.03472226503675
avg max depth 24.7
max depth 30
min depth 18

Learning Rate: 0.3; MAE: 2.48491619160285; SMAPE: 48.63436522485042
avg max depth 24.6
max depth 30
min depth 19

Learning Rate: 0.5; MAE: 2.5047801147227533; SMAPE: 49.568925709420064
avg max depth 24.2
max depth 30
min depth 18

Learning Rate: 0.75; MAE: 2.4815414031475216; SMAPE: 49.32548347486503
avg max depth 23.8
max depth 30
min depth 17

Learning Rate: 1; MAE: 2.502868068833652; SMAPE: 49.08569002518787
avg max depth 23.5
max depth 30
min depth 17

Learning Rate: 1.5; MAE: 2.478011472275335; SMAPE: 48.21757770359311
avg max depth 21.4
max depth 30
min depth 15


updrs_2: 2092 samples in training, 523 samples in testing.
Learning Rate: 0.1; MAE: 2.170195689634823; SMAPE: 50.78694396009552
avg ma

In [69]:
learning_rate_df = pd.DataFrame()
learning_rate_df['Learning Rate'] = learning_rates
learning_rate_df

Unnamed: 0,Learning Rate
0,0.1
1,0.2
2,0.3
3,0.5
4,0.75
5,1.0
6,1.5


In [70]:

ada_df2 = pd.concat([learning_rate_df, learning_rate_df, learning_rate_df, learning_rate_df], ignore_index=True)
ada_df2['n_estimators'] = 200
ada_df2['Max Depth'] = 30
ada_df2['actual depth reached'] = actual_depth_ada
ada_df2['MAE'] = mae_list_ada
ada_df2['SMAPE'] = smape_list_ada
ada_df2['UPDRS'] = ""
ada_df2.loc[:6, 'UPDRS']=1
ada_df2.loc[7:13, 'UPDRS']=2
ada_df2.loc[14:20, 'UPDRS']=3
ada_df2.loc[21:, 'UPDRS']=4


ada_df2

Unnamed: 0,Learning Rate,n_estimators,Max Depth,actual depth reached,MAE,SMAPE,UPDRS
0,0.1,200,30,30,2.468451,48.678829,1
1,0.2,200,30,30,2.512428,50.034722,1
2,0.3,200,30,30,2.484916,48.634365,1
3,0.5,200,30,30,2.50478,49.568926,1
4,0.75,200,30,30,2.481541,49.325483,1
5,1.0,200,30,30,2.502868,49.08569,1
6,1.5,200,30,30,2.478011,48.217578,1
7,0.1,200,30,30,2.170196,50.786944,2
8,0.2,200,30,30,2.141556,52.10856,2
9,0.3,200,30,30,2.177767,52.126879,2


Because we don't have actual testing data (yet - the estimated public release date for the data is Oct. 1, 2023), and so far my model has been training/testing on imputed data. I will instead go back to the combined dataset prior to imputation (big_data), and randomly select 100 samples that have complete UPDRS 1-4 scores and set them aside for testing. Then I will impute the rest of the dataset, and retrain my model on it and see how it performs against the new testing data.

In [44]:
targets

['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

In [45]:
big_data_drop_all_na = big_data.dropna()
big_data_drop_all_na

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK


There are no rows (i.e. visits) in which all proteins and peptides are analyzed. I'm only going to drop any visits that do not contain complete UPDRS scores, because these are my target values and one could make the arguement that imputing target values for training/test might make the model less reliable. 

In [46]:
test123 = big_data.dropna(subset='updrs_4')
test123.isna().sum().head(10)

visit_id          0
patient_id        0
visit_month       0
updrs_1           0
updrs_2           1
updrs_3          14
updrs_4           0
O00391         1191
O00533         1009
O00584         1021
dtype: int64

In [47]:
big_data_dropna = big_data.dropna(subset=targets)
big_data_dropna

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
3,55_9,55,9,8.0,9.0,30.0,0.0,,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
5,55_18,55,18,7.0,13.0,38.0,0.0,,,,...,,,,,,,,,,
6,55_24,55,24,16.0,9.0,49.0,0.0,,,,...,,,,,,,,,,
7,55_30,55,30,14.0,13.0,49.0,0.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,10589.6,902434.0,44890.8,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,,,,...,,,,,,,,,,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,,,,...,,,,,,,,,,
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,,,,...,,,,,,,,,,


In [48]:
big_data_dropna.isna().sum().head(10)

visit_id          0
patient_id        0
visit_month       0
updrs_1           0
updrs_2           0
updrs_3           0
updrs_4           0
O00391         1181
O00533          999
O00584         1011
dtype: int64

Now that there are no NaN values in UPDRS scores, I can manipulate the rest of the dataset. ADABoost Regressor  cannot handle NaN values - and as we see, there are a lot still. The question is: split first then impute separately, or impute together then split? 
1) Splitting first: 

In [49]:

train, test = train_test_split(big_data_dropna, test_size=0.1, random_state=0)


In [50]:
train.count()

visit_id                 1405
patient_id               1405
visit_month              1405
updrs_1                  1405
updrs_2                  1405
                         ... 
YVNKEIQNAVNGVK            508
YWGVASFLQK                495
YYC(UniMod_4)FQGNQFLR     504
YYTYLIMNK                 459
YYWGGQYTWDMAK             378
Length: 1202, dtype: int64

In [51]:
test.count()

visit_id                 157
patient_id               157
visit_month              157
updrs_1                  157
updrs_2                  157
                        ... 
YVNKEIQNAVNGVK            55
YWGVASFLQK                53
YYC(UniMod_4)FQGNQFLR     55
YYTYLIMNK                 53
YYWGGQYTWDMAK             40
Length: 1202, dtype: int64

THe ADABoost Regressor does not handle NaN values well, so we must first impute the training data. 

In [52]:
#imputing the training data
train_patient_list = train.patient_id.unique()
train_imputed_list = []
for patient_id in train_patient_list:
    masked_data = train[train['patient_id']==patient_id]
    num_rows = len(masked_data.index)
    knn = KNNImputer(missing_values=np.nan, keep_empty_features=True, n_neighbors=num_rows)
    knn_data = knn.fit_transform(masked_data)
    knn_df = pd.DataFrame(knn_data, columns = train.columns)
    train_imputed_list.append(knn_df)

train_imputed = pd.concat(train_imputed_list)

One thing to keep in mind about imputing the testing dataset (which is 10% of the whole dataset), is that imputation becomes more problematic when the dataset is small. 

In [53]:
#imputing the testing data
test_patient_list = test.patient_id.unique()
test_imputed_list = []
for patient_id in test_patient_list:
    masked_data = test[test['patient_id']==patient_id]
    num_rows = len(masked_data.index)
    knn = KNNImputer(missing_values=np.nan, keep_empty_features=True, n_neighbors=num_rows)
    knn_data = knn.fit_transform(masked_data)
    knn_df = pd.DataFrame(knn_data, columns = test.columns)
    test_imputed_list.append(knn_df)

test_imputed = pd.concat(test_imputed_list)

In [54]:
train_imputed.isna().sum()

visit_id                 0
patient_id               0
visit_month              0
updrs_1                  0
updrs_2                  0
                        ..
YVNKEIQNAVNGVK           0
YWGVASFLQK               0
YYC(UniMod_4)FQGNQFLR    0
YYTYLIMNK                0
YYWGGQYTWDMAK            0
Length: 1202, dtype: int64

In [55]:
test_imputed.isna().sum()

visit_id                 0
patient_id               0
visit_month              0
updrs_1                  0
updrs_2                  0
                        ..
YVNKEIQNAVNGVK           0
YWGVASFLQK               0
YYC(UniMod_4)FQGNQFLR    0
YYTYLIMNK                0
YYWGGQYTWDMAK            0
Length: 1202, dtype: int64

I will now train and test the model. Remember that the difference here is this:
This dataset does not contain any imputed UPDRS scores. 

In [56]:
# ADABOOST REGRESSOR
# max_depth=40, learning_rate=0.1, n_estimators=500
X_train = train_imputed.drop(columns=targets, axis=1)
X_test = test_imputed.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y_train = train_imputed[target]
    y_test = test_imputed[target]

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    #fitting and testing
    dtr = DecisionTreeRegressor(max_depth=40)
    ada = AdaBoostRegressor(dtr, n_estimators=500, learning_rate=0.1)
    ada.fit(X_train,y_train)    
    y_predict = ada.predict(X_test)

    #scoring
    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)
    print(f'MAE: {mae}; SMAPE: {smape}')

    #saving scores in their lists
    mae_list_ada.append(mae)
    smape_list_ada.append(smape)

    max_depth = list()
    for tree in ada.estimators_:
        max_depth.append(tree.tree_.max_depth)
    print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
    print(f"max depth {max(max_depth)}")
    print(f"min depth {min(max_depth)}")
    print()
    actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 1405 samples in training, 157 samples in testing.
MAE: 4.350318471337579; SMAPE: 61.12010188510186
avg max depth 22.0
max depth 32
min depth 17


updrs_2: 1405 samples in training, 157 samples in testing.
MAE: 4.3630573248407645; SMAPE: 65.82770251618615
avg max depth 22.4
max depth 30
min depth 17


updrs_3: 1405 samples in training, 157 samples in testing.
MAE: 10.853503184713375; SMAPE: 61.82604982180929
avg max depth 21.6
max depth 29
min depth 17


updrs_4: 1405 samples in training, 157 samples in testing.
MAE: 1.9681528662420382; SMAPE: 94.76960094981825
avg max depth 22.0
max depth 33
min depth 16




In [57]:
df_proteins_dropna = df_proteins.dropna(axis=1)
df_peptides_dropna = df_peptides.dropna(axis=1)

In [58]:
df_prot_pept_dropna = pd.merge(df_proteins_dropna, df_peptides_dropna, on=['patient_id','visit_month','visit_id'], how='left')

In [59]:
df_prot_pept_dropna.isna().sum()

patient_id     0
visit_month    0
visit_id       0
O15240         0
P01009         0
P01011         0
P01023         0
P01024         0
P01042         0
P01834         0
P01876         0
P02647         0
P02649         0
P02749         0
P02751         0
P02765         0
P02766         0
P02768         0
P02774         0
P02787         0
P02790         0
P05090         0
P06396         0
P07602         0
P10909         0
P23142         0
P41222         0
Q12805         0
Q92520         0
Q9UHG2         0
AIGYLNTGYQR    0
IPTTFENGR      0
KYLYEIAR       0
NILTSNNIDVK    0
TLLSNLEEAK     0
dtype: int64

In [60]:
small_data = pd.merge(clinical, df_prot_pept_dropna, on=['patient_id','visit_month','visit_id'], how='left')
small_data

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O15240,P01009,P01011,...,P23142,P41222,Q12805,Q92520,Q9UHG2,AIGYLNTGYQR,IPTTFENGR,KYLYEIAR,NILTSNNIDVK,TLLSNLEEAK
0,55_0,55,0,10.0,6.0,15.0,,177775.0,14415900.0,2025890.0,...,1673460.0,33921600.0,1611130.0,982965.0,408698.0,262946.0,594329.0,1950270.0,467059.0,795652.0
1,55_3,55,3,10.0,7.0,25.0,,,,,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,165638.0,13330800.0,2060930.0,...,1644680.0,34435000.0,1483200.0,860236.0,369870.0,239423.0,510762.0,1850150.0,414741.0,598442.0
3,55_9,55,9,8.0,9.0,30.0,0.0,,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,151073.0,13225500.0,2015710.0,...,1916350.0,37906600.0,1547140.0,969345.0,318553.0,247750.0,569408.0,2226980.0,415667.0,797554.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,203487.0,12675900.0,1424810.0,...,1548510.0,70121800.0,1459980.0,912988.0,320821.0,231068.0,581369.0,1817590.0,441109.0,588262.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,,,,...,,,,,,,,,,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,,,,...,,,,,,,,,,
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,,,,...,,,,,,,,,,


In [61]:
small_data.isna().sum()

visit_id          0
patient_id        0
visit_month       0
updrs_1           1
updrs_2           2
updrs_3          25
updrs_4        1038
O15240         1547
P01009         1547
P01011         1547
P01023         1547
P01024         1547
P01042         1547
P01834         1547
P01876         1547
P02647         1547
P02649         1547
P02749         1547
P02751         1547
P02765         1547
P02766         1547
P02768         1547
P02774         1547
P02787         1547
P02790         1547
P05090         1547
P06396         1547
P07602         1547
P10909         1547
P23142         1547
P41222         1547
Q12805         1547
Q92520         1547
Q9UHG2         1547
AIGYLNTGYQR    1547
IPTTFENGR      1547
KYLYEIAR       1547
NILTSNNIDVK    1547
TLLSNLEEAK     1547
dtype: int64

Let's see how big of a dataset remains if I drop all NaN values.

In [62]:
test111=small_data.dropna()
test111

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O15240,P01009,P01011,...,P23142,P41222,Q12805,Q92520,Q9UHG2,AIGYLNTGYQR,IPTTFENGR,KYLYEIAR,NILTSNNIDVK,TLLSNLEEAK
4,55_12,55,12,10.0,10.0,41.0,0.0,151073.0,13225500.0,2015710.0,...,1916350.0,37906600.0,1547140.0,969345.0,318553.0,247750.0,569408.0,2226980.0,415667.0,797554.0
8,55_36,55,36,17.0,18.0,51.0,0.0,101056.0,13855000.0,2221880.0,...,1879000.0,32773700.0,1725030.0,910437.0,377550.0,303917.0,599722.0,2296340.0,448607.0,784097.0
16,942_12,942,12,5.0,2.0,25.0,0.0,86847.4,10101100.0,1385180.0,...,1154770.0,31070900.0,665474.0,569842.0,234094.0,164415.0,351842.0,2291060.0,279916.0,514283.0
22,942_48,942,48,2.0,6.0,35.0,0.0,82241.9,11988000.0,1665660.0,...,666753.0,26067100.0,492658.0,463228.0,232301.0,169616.0,433893.0,1989780.0,319112.0,487794.0
28,1517_0,1517,0,11.0,6.0,25.0,5.0,118752.0,13234200.0,1575740.0,...,1088530.0,41495900.0,781253.0,699234.0,164826.0,109870.0,265171.0,1520800.0,193636.0,637587.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2581,64669_60,64669,60,15.0,15.0,38.0,0.0,114099.0,10775700.0,1178870.0,...,772781.0,37524400.0,1033600.0,504681.0,163542.0,164130.0,340667.0,1879780.0,295252.0,558800.0
2594,64674_48,64674,48,11.0,17.0,46.0,1.0,39409.3,13043200.0,1554400.0,...,864031.0,20815000.0,514921.0,280482.0,83305.9,99782.9,184222.0,1742720.0,210721.0,332769.0
2598,64674_84,64674,84,11.0,15.0,45.0,4.0,62095.4,11833100.0,1423680.0,...,818154.0,20971200.0,578556.0,374604.0,136725.0,105207.0,186887.0,1302830.0,186301.0,365771.0
2604,65043_12,65043,12,4.0,7.0,14.0,0.0,270575.0,12616100.0,1142230.0,...,1569130.0,75703600.0,1703120.0,1102120.0,380072.0,234383.0,568851.0,1735120.0,450723.0,796742.0


How does the model perform with this significantly smaller dataset, which contains no imputed values? To compensate for the much smaller dataset, I set n_estimators to 1000 instead of 500.

In [63]:
# ADABOOST REGRESSOR
# max_depth=40, learning_rate=0.1, n_estimators=1000
X = test111.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = test111[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    #fitting and testing
    dtr = DecisionTreeRegressor(max_depth=40)
    ada = AdaBoostRegressor(dtr, n_estimators=1000, learning_rate=0.1)
    ada.fit(X_train,y_train)    
    y_predict = ada.predict(X_test)

    #scoring
    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)
    print(f'MAE: {mae}; SMAPE: {smape}')

    #saving scores in their lists
    mae_list_ada.append(mae)
    smape_list_ada.append(smape)

    max_depth = list()
    for tree in ada.estimators_:
        max_depth.append(tree.tree_.max_depth)
    print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
    print(f"max depth {max(max_depth)}")
    print(f"min depth {min(max_depth)}")
    print()
    actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 507 samples in training, 57 samples in testing.
MAE: 4.052631578947368; SMAPE: 58.245629768334275
avg max depth 17.6
max depth 27
min depth 12


updrs_2: 507 samples in training, 57 samples in testing.
MAE: 4.456140350877193; SMAPE: 72.04897440932096
avg max depth 17.9
max depth 27
min depth 13


updrs_3: 507 samples in training, 57 samples in testing.
MAE: 10.473684210526315; SMAPE: 59.32928172562978
avg max depth 17.2
max depth 26
min depth 13


updrs_4: 507 samples in training, 57 samples in testing.
MAE: 1.5964912280701755; SMAPE: 76.7099567099567
avg max depth 16.4
max depth 26
min depth 11




Scores are significantly worse, as expected. Another angle to try would be to try and drop features (columns) which contain more than 50% NaN values, per patient. In other words, it might be worthwhile to salvage some data, where there is sufficient amount to impute the rest for each given patient. 

In [64]:
test_data_list = [big_data[big_data.patient_id==55], big_data[big_data.patient_id==942], big_data[big_data.patient_id==1517]]
test_data_df = pd.concat(test_data_list, ignore_index=True)

test_data_df

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,55,0,10.0,6.0,15.0,,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55_3,55,3,10.0,7.0,25.0,,,,,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,55_9,55,9,8.0,9.0,30.0,0.0,,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
5,55_18,55,18,7.0,13.0,38.0,0.0,,,,...,,,,,,,,,,
6,55_24,55,24,16.0,9.0,49.0,0.0,,,,...,,,,,,,,,,
7,55_30,55,30,14.0,13.0,49.0,0.0,,,,...,,,,,,,,,,
8,55_36,55,36,17.0,18.0,51.0,0.0,13530.8,753832.0,43048.9,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
9,55_42,55,42,12.0,20.0,41.0,0.0,,,,...,,,,,,,,,,


In [65]:
X = big_data_imputed_1.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_imputed_1[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    #fitting and testing
    dtr = DecisionTreeRegressor(max_depth=40)
    ada = AdaBoostRegressor(dtr, n_estimators=500, learning_rate=0.2)
    ada.fit(X_train,y_train)    
    y_predict = ada.predict(X_test)

    #scoring
    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)
    print(f'MAE: {mae}; SMAPE: {smape}')

    #saving scores in their lists
    mae_list_ada.append(mae)
    smape_list_ada.append(smape)

    max_depth = list()
    for tree in ada.estimators_:
        max_depth.append(tree.tree_.max_depth)
    print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
    print(f"max depth {max(max_depth)}")
    print(f"min depth {min(max_depth)}")
    print()
    actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 2092 samples in training, 523 samples in testing.
MAE: 2.51434034416826; SMAPE: 48.514998614689134
avg max depth 24.6
max depth 36
min depth 19


updrs_2: 2092 samples in training, 523 samples in testing.
MAE: 2.1778202676864247; SMAPE: 51.318591870054895
avg max depth 22.7
max depth 35
min depth 17


updrs_3: 2092 samples in training, 523 samples in testing.
MAE: 5.118546845124283; SMAPE: 48.08401919339609
avg max depth 22.5
max depth 32
min depth 17


updrs_4: 2092 samples in training, 523 samples in testing.
MAE: 0.7571837007107038; SMAPE: 31.28632045845789
avg max depth 27.8
max depth 40
min depth 17


