In [3]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve, validation_curve


def plot_learning_curve(
    estimator,
    X,
    y,
    title="Learning Curve",
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring=None,
):
    """
    This is a custom modification of the code present here:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    scoring: string, callable or None, optional, default: None
        A string (see model evaluation documentation) or a scorer callable object / function
        with signature scorer(estimator, X, y).
    """

    fig, axes = plt.subplots(1, 1, figsize=(10, 5))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot learning curve
    axes.grid(True)
    axes.fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes.fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes.plot(train_sizes, train_scores_mean, "o-", color="r", label="Training score")
    axes.plot(
        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
    )
    axes.legend(loc="best")
    return fig, axes


def plot_validation_curve(
    estimator,
    X,
    y,
    ylim=None,
    cv=None,
    n_jobs=None,
    param_name=None,
    param_range=None,
    scoring=None,
):
    """
    referred from :
    https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics

    :param estimator: object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    :param X: array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    :param y: array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    :param ylim: tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.
    :param cv: int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    :param n_jobs: int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    :param param_range: array-like, shape (n_values,)
    The values of the parameter that will be evaluated.
    :param param_name: string
    Name of the parameter that will be varied.
    :param scoring: string, callable or None, optional, default: None
        A string (see model evaluation documentation) or a scorer callable object / function
        with signature scorer(estimator, X, y).
    :return: fig


    """
    train_scores, test_scores = validation_curve(
        estimator,
        X,
        y,
        param_name=param_name,
        param_range=param_range,
        scoring=scoring,
        n_jobs=n_jobs,
        cv=cv,
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig, axes = plt.subplots(1, 1, figsize=(10, 5))
    axes.grid(True)
    axes.set_title(f"Validation Curve with {estimator.__class__}")
    axes.set_xlabel(f"{param_name}")
    axes.set_ylabel(f"{scoring}")
    if ylim is not None:
        axes.set_ylim(*ylim)

    lw = 2
    axes.plot(
        param_range,
        train_scores_mean,
        label="Training score",
        color="darkorange",
        lw=lw,
    )
    axes.fill_between(
        param_range,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.2,
        color="darkorange",
        lw=lw,
    )
    axes.plot(
        param_range,
        test_scores_mean,
        label="Cross-validation score",
        color="navy",
        lw=lw,
    )
    axes.fill_between(
        param_range,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.2,
        color="navy",
        lw=lw,
    )
    axes.legend(loc="best")
    return fig, axes


In [4]:
import pandas as pd
import seaborn as sns
sns.set()

from pathlib import Path

# classifiers we will use
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost

#imputers
from sklearn.impute import SimpleImputer, KNNImputer

# model selection bits
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, ParameterGrid, ParameterSampler
#from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, GroupShuffleSplit, GroupKFold, StratifiedKFold
from sklearn.model_selection import learning_curve, validation_curve

# evaluation
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error


import scipy

Let's load all the data. 

In [5]:
# Load the competition datasets into Pandas DataFrame
path = Path("/Users/13392/Documents/amp-parkinsons-disease-progression-prediction")
proteins = pd.read_csv(path/"train_proteins.csv")
peptides = pd.read_csv(path/"train_peptides.csv")
clinical = pd.read_csv(path/"train_clinical_data.csv")
supplemental = pd.read_csv(path/"supplemental_clinical_data.csv")

As discussed previously, we are dropping the entire "medication status" column, because:
1) Over 50% values are NaN. 
2) the test dataset will not have this data. 

In [6]:
# drop the "medication status" column (due to over 50% NaN values), keep a copy of the original for later access. 
clinical_copy = clinical.copy()

clinical.drop('upd23b_clinical_state_on_medication', axis=1, inplace=True)

In [7]:
targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
ids = ['patient_id', 'visit_id']
month = ['visit_month']

Let's see how much remains of 'NaN'.

In [8]:
print(f'NaN value count:\n{clinical.isna().sum()}')
clinical

NaN value count:
visit_id          0
patient_id        0
visit_month       0
updrs_1           1
updrs_2           2
updrs_3          25
updrs_4        1038
dtype: int64


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,10.0,6.0,15.0,
1,55_3,55,3,10.0,7.0,25.0,
2,55_6,55,6,8.0,10.0,34.0,
3,55_9,55,9,8.0,9.0,30.0,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0
...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0
2612,65043_60,65043,60,6.0,6.0,16.0,1.0
2613,65043_72,65043,72,3.0,9.0,14.0,1.0


Significant, but manageable with some kind of imputation. Let's count the number of visits each patient has on record. 

In [9]:
cols = ['patient_id', 'num_entries']
patient_list = clinical.patient_id.unique()
n_list = []
p_list = []

for patient in patient_list:
    n=len(clinical[clinical.patient_id==patient].index)

    n_list.append(n)
    p_list.append(patient)

df_visits_by_patient = pd.DataFrame(list(zip(p_list, n_list)), columns=cols)

df_visits_by_patient


Unnamed: 0,patient_id,num_entries
0,55,13
1,942,15
2,1517,10
3,1923,7
4,2660,6
...,...,...
243,63875,9
244,63889,10
245,64669,15
246,64674,16


NaN values in the "Proteins"  and "Peptides" datasets. 

In [10]:
print(f'NaN value count:\n{proteins.isna().sum()}')
proteins

NaN value count:
visit_id       0
visit_month    0
patient_id     0
UniProt        0
NPX            0
dtype: int64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0
...,...,...,...,...,...
232736,58648_108,108,58648,Q9UBX5,27387.8
232737,58648_108,108,58648,Q9UHG2,369437.0
232738,58648_108,108,58648,Q9UKV8,105830.0
232739,58648_108,108,58648,Q9Y646,21257.6


In [11]:
print(f'NaN value count:\n{peptides.isna().sum()}')

peptides

NaN value count:
visit_id            0
visit_month         0
patient_id          0
UniProt             0
Peptide             0
PeptideAbundance    0
dtype: int64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.30
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.00
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.00
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.70
...,...,...,...,...,...,...
981829,58648_108,108,58648,Q9UHG2,ILAGSADSEGVAAPR,202820.00
981830,58648_108,108,58648,Q9UKV8,SGNIPAGTTVDTK,105830.00
981831,58648_108,108,58648,Q9Y646,LALLVDTVGPR,21257.60
981832,58648_108,108,58648,Q9Y6R7,AGC(UniMod_4)VAESTAVC(UniMod_4)R,5127.26


Great! No NaN values at all in them. Let's find out how many of each patient's visits have protein/peptide data. 

In [12]:
cols = ['patient_id', 'num_entries_protein']
patient_list = proteins.patient_id.unique()
n_list = []
p_list = []

for patient in patient_list:
    n=len(proteins[proteins.patient_id==patient].visit_id.unique())

    n_list.append(n)
    p_list.append(patient)

df_recorded_visits_protein = pd.DataFrame(list(zip(p_list, n_list)), columns=cols)

df_recorded_visits_protein


Unnamed: 0,patient_id,num_entries_protein
0,55,4
1,1517,4
2,1923,3
3,2660,5
4,3636,3
...,...,...
243,52998,3
244,54979,3
245,58597,3
246,7508,3


In [13]:
cols = ['patient_id', 'num_entries_peptide']
patient_list = peptides.patient_id.unique()
n_list = []
p_list = []

for patient in patient_list:
    n=len(peptides[peptides.patient_id==patient].visit_id.unique())

    n_list.append(n)
    p_list.append(patient)

df_recorded_visits_peptide = pd.DataFrame(list(zip(p_list, n_list)), columns=cols)

df_recorded_visits_peptide

Unnamed: 0,patient_id,num_entries_peptide
0,55,4
1,1517,4
2,1923,3
3,2660,5
4,3636,3
...,...,...
243,52998,3
244,54979,3
245,58597,3
246,7508,3


In [14]:
df = pd.merge(df_recorded_visits_protein, df_recorded_visits_peptide, on='patient_id', how='left')
df = pd.merge(df_visits_by_patient, df, on='patient_id', how='left')
df.head(10)

Unnamed: 0,patient_id,num_entries,num_entries_protein,num_entries_peptide
0,55,13,4,4
1,942,15,4,4
2,1517,10,4,4
3,1923,7,3,3
4,2660,6,5,5
5,3636,14,3,3
6,3863,9,5,5
7,4161,12,6,6
8,4172,8,7,7
9,4923,11,5,5


It's not looking great - we can clearly see that, for most of the patients, only 1/2 to 1/3 of the visits contain protein and peptide records.

Something to think about for later on...

Now we will "pivot" the datasets so the unique coding for each protein/peptide becomes a feature for the models to learn on. 

In [15]:
df_proteins = proteins.pivot(index=['patient_id', 'visit_month', 'visit_id'], columns='UniProt', values='NPX').rename_axis(columns=None).reset_index()

df_peptides = peptides.pivot(index=['patient_id', 'visit_month', 'visit_id'], columns='Peptide', values='PeptideAbundance').rename_axis(columns=None).reset_index()


In [16]:
df_proteins

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,177775.0,...,365475.0,35528.00,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,165638.0,...,405676.0,30332.60,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,151073.0,...,303953.0,43026.20,114921.0,21860.1,61598.2,318553.0,65762.6,29193.4,28536.1,19290.9
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,101056.0,...,303597.0,48188.40,109794.0,23930.6,70223.5,377550.0,74976.1,31732.6,22186.5,21717.1
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,253373.0,27431.80,93796.7,17450.9,21299.1,306621.0,82335.5,24018.7,18939.5,15251.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,,190487.0,24907.9,18543.1,10124.90,2308.71,62095.4,...,260021.0,7139.93,104277.0,10500.0,21944.2,136725.0,62217.5,,10287.7,13848.2
1109,65043,0,65043_0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,266339.0,...,186414.0,25897.80,,21480.7,57364.0,416142.0,37584.6,,28346.5,35617.5
1110,65043,12,65043_12,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,270575.0,...,301343.0,22343.40,105626.0,20500.8,54011.2,380072.0,40588.9,,17035.7,37064.2
1111,65043,24,65043_24,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,221358.0,...,300439.0,52143.60,139291.0,19449.2,66569.9,300948.0,36150.4,,21286.3,39587.9


In [17]:
df_peptides

Unnamed: 0,patient_id,visit_month,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,8984260.0,53855.6,8579740.0,,19735.4,114400.0,46371.1,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,8279770.0,45251.9,8655890.0,49927.5,23820.4,90539.4,38652.4,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,8382390.0,53000.9,8995640.0,45519.2,17813.5,147312.0,45840.9,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,10671500.0,58108.4,9985420.0,52374.0,19373.3,64356.1,49793.2,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,6177730.0,42682.6,3596660.0,25698.8,17130.6,86471.5,41007.9,...,226314.0,6399.80,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,7083630.0,35656.1,6273100.0,,,15479.2,,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,65043,0,65043_0,7818630.0,95033.0,5119260.0,57483.7,11610.0,270739.0,42527.3,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,65043,12,65043_12,8070390.0,76532.7,8233520.0,54260.6,11631.9,230169.0,42255.5,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,65043,24,65043_24,7608150.0,75401.6,9168030.0,,13313.9,220202.0,46914.1,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


Because not all 1000+ proteins and peptides are measured at each recorded visit, some NaN values should be now expected. 

In [18]:
df_proteins.isna().sum().sort_values(ascending=False)

Q99829        624
Q99832        507
Q562R1        497
P01780        459
Q6UX71        452
             ... 
P02766          0
P02765          0
P02751          0
P02749          0
patient_id      0
Length: 230, dtype: int64

In [19]:
df_peptides.isna().sum().sort_values(ascending=False)

QALPQVR                   624
EPQVYTLPPSRDELTK          550
TPSGLYLGTC(UniMod_4)ER    523
SLEDQVEMLR                514
HYEGSTVPEK                508
                         ... 
visit_id                    0
IPTTFENGR                   0
AIGYLNTGYQR                 0
NILTSNNIDVK                 0
patient_id                  0
Length: 971, dtype: int64

We are going to combine the protein and peptide data, and check once again the status of NaN values. They should remain unchanged because we haven't done anything with them.

In [20]:
prot_pept_df = pd.merge(df_proteins, df_peptides, on=['patient_id','visit_month','visit_id'], how='left')
prot_pept_df

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,177775.0,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,165638.0,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,151073.0,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,101056.0,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,226314.0,6399.80,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,,190487.0,24907.9,18543.1,10124.90,2308.71,62095.4,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,65043,0,65043_0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,266339.0,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,65043,12,65043_12,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,270575.0,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,65043,24,65043_24,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,221358.0,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


In [21]:
prot_pept_df.isna().sum().sort_values(ascending=False)

Q99829                    624
QALPQVR                   624
EPQVYTLPPSRDELTK          550
TPSGLYLGTC(UniMod_4)ER    523
SLEDQVEMLR                514
                         ... 
P41222                      0
P02774                      0
P02787                      0
P02790                      0
patient_id                  0
Length: 1198, dtype: int64

In [22]:
patient_list=prot_pept_df['patient_id'].unique()
patient_list

array([   55,   942,  1517,  1923,  2660,  3636,  3863,  4161,  4172,
        4923,  5027,  5036,  5178,  5645,  5742,  6054,  6211,  6420,
        7051,  7117,  7151,  7265,  7508,  7568,  7832,  7886,  8344,
        8699, 10053, 10138, 10174, 10541, 10715, 10718, 11459, 11686,
       11928, 12516, 12636, 12703, 12755, 12931, 13360, 13368, 13618,
       13804, 13852, 13968, 14035, 14124, 14242, 14270, 14344, 14450,
       14811, 15009, 15245, 15504, 15590, 16238, 16347, 16566, 16574,
       16778, 16931, 17154, 17201, 17414, 17727, 18183, 18204, 18553,
       18560, 19088, 20212, 20216, 20352, 20404, 20460, 20581, 20664,
       20707, 20791, 20792, 21126, 21537, 21729, 22126, 22623, 23175,
       23192, 23244, 23391, 23636, 24278, 24690, 24818, 24820, 24911,
       25562, 25739, 25750, 25827, 25911, 26005, 26104, 26210, 26809,
       27079, 27300, 27464, 27468, 27607, 27715, 27872, 27893, 27971,
       27987, 28327, 28342, 28818, 29313, 29417, 30119, 30155, 30416,
       30894, 30951,

Here there are two ways to impute the data:
1) combine "clinical" and "prot_pept_df" first then impute, or
2) first impute them separately, then combine. 

We'll do both and see how the outcomes differ.

I am choosing to use sklearn's KNN imputer because, with a bit of clever coding, I can use all the availble data for a given patient to impute missing values. 

In [23]:
# Method 1: Combine first, impute next. 
big_data = pd.merge(clinical, prot_pept_df, on=['patient_id','visit_month','visit_id'], how='left')
big_data.head(30)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,55,0,10.0,6.0,15.0,,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55_3,55,3,10.0,7.0,25.0,,,,,...,,,,,,,,,,
2,55_6,55,6,8.0,10.0,34.0,,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,55_9,55,9,8.0,9.0,30.0,0.0,,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
5,55_18,55,18,7.0,13.0,38.0,0.0,,,,...,,,,,,,,,,
6,55_24,55,24,16.0,9.0,49.0,0.0,,,,...,,,,,,,,,,
7,55_30,55,30,14.0,13.0,49.0,0.0,,,,...,,,,,,,,,,
8,55_36,55,36,17.0,18.0,51.0,0.0,13530.8,753832.0,43048.9,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
9,55_42,55,42,12.0,20.0,41.0,0.0,,,,...,,,,,,,,,,


In [24]:
#imputing
big_data_patient_list = big_data.patient_id.unique()
data_imputed_list = []
for patient_id in big_data_patient_list:
    masked_data = big_data[big_data['patient_id']==patient_id]
    num_rows = len(masked_data.index)
    knn = KNNImputer(missing_values=np.nan, keep_empty_features=True, n_neighbors=num_rows)
    X_knn = knn.fit_transform(masked_data)
    X_knn_df = pd.DataFrame(X_knn, columns = big_data.columns)
    data_imputed_list.append(X_knn_df)

big_data_imputed = pd.concat(data_imputed_list)

In [25]:
big_data_imputed.head(10)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,550.0,55.0,0.0,10.0,6.0,15.0,0.0,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,553.0,55.0,3.0,10.0,7.0,25.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
2,556.0,55.0,6.0,8.0,10.0,34.0,0.0,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,559.0,55.0,9.0,8.0,9.0,30.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
4,5512.0,55.0,12.0,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
5,5518.0,55.0,18.0,7.0,13.0,38.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
6,5524.0,55.0,24.0,16.0,9.0,49.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
7,5530.0,55.0,30.0,14.0,13.0,49.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
8,5536.0,55.0,36.0,17.0,18.0,51.0,0.0,13530.8,753832.0,43048.9,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
9,5542.0,55.0,42.0,12.0,20.0,41.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275


In [26]:
#resetting index to match original.
big_data_imputed.reset_index(inplace=True, drop=True)
big_data_imputed.head(10)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,550.0,55.0,0.0,10.0,6.0,15.0,0.0,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,553.0,55.0,3.0,10.0,7.0,25.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
2,556.0,55.0,6.0,8.0,10.0,34.0,0.0,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,559.0,55.0,9.0,8.0,9.0,30.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
4,5512.0,55.0,12.0,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
5,5518.0,55.0,18.0,7.0,13.0,38.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
6,5524.0,55.0,24.0,16.0,9.0,49.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
7,5530.0,55.0,30.0,14.0,13.0,49.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275
8,5536.0,55.0,36.0,17.0,18.0,51.0,0.0,13530.8,753832.0,43048.9,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
9,5542.0,55.0,42.0,12.0,20.0,41.0,0.0,13301.575,732952.5,39876.6,...,197324.75,16536.35,4015897.5,106875.475,621626.5,125050.75,173975.25,461517.5,48463.675,17854.275


In [27]:
#relabel patient id, visit id, and visit month.
ls = [ids, month]
for col in ls:
    big_data_imputed[col] = big_data[col]

In [28]:
def smape_score(actual, predicted):
    sum = 0
    for a, p in zip(actual, predicted):
        if a==0 and p==0:
            pass
        else:
            sum += (np.abs(p-a))/(np.abs(p)+np.abs(a))*2
    return sum/len(actual)*100

In [64]:

params = {'n_estimators': [100,250,500,750,1000],
                 'max_depth': np.arange(20,50,5), 
                 'learning_rate': np.arange(0.1,3,0.2)}

random_params = list(ParameterSampler(params, n_iter=5, random_state=2))

random_params


[{'n_estimators': 100, 'max_depth': 35, 'learning_rate': 2.1000000000000005},
 {'n_estimators': 500, 'max_depth': 40, 'learning_rate': 2.7000000000000006},
 {'n_estimators': 750, 'max_depth': 45, 'learning_rate': 2.3000000000000007},
 {'n_estimators': 500, 'max_depth': 40, 'learning_rate': 0.9000000000000001},
 {'n_estimators': 250, 'max_depth': 30, 'learning_rate': 1.7000000000000004}]

I'll first save these pseudo randomly generated parameters in a dataframe, for later referencing.

In [48]:
params_df = pd.DataFrame(random_params)
params_df

Unnamed: 0,n_estimators,max_depth,learning_rate
0,400,25,0.1
1,200,40,1.6
2,200,20,2.1
3,300,20,0.6
4,300,35,2.1


In [None]:
#ADABOOST REGRESSOR with decision tree regressor, random parameters, 5 iterations.

X = big_data_imputed.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_imputed[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    for params in random_params:
        #fitting and testing
        dtr = DecisionTreeRegressor(max_depth=params['max_depth'])
        ada = AdaBoostRegressor(dtr, n_estimators=params['n_estimators'], learning_rate=params['learning_rate'])
        ada.fit(X_train,y_train)    
        y_predict = ada.predict(X_test)

        #scoring
        mae = mean_absolute_error(y_test, y_predict)
        smape = smape_score(y_test, y_predict)
        print(f'Parameter: {params}; MAE: {mae}; SMAPE: {smape}')

        #saving scores in their lists
        mae_list_ada.append(mae)
        smape_list_ada.append(smape)

        max_depth = list()
        for tree in ada.estimators_:
            max_depth.append(tree.tree_.max_depth)
        print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
        print(f"max depth {max(max_depth)}")
        print(f"min depth {min(max_depth)}")
        print()
        actual_depth_ada.append(max(max_depth))
    print()

In [52]:
ada_df = pd.concat([params_df,params_df,params_df,params_df], ignore_index=True)
ada_df['actual depth reached'] = actual_depth_ada
ada_df['MAE'] = mae_list_ada
ada_df['SMAPE'] = smape_list_ada
ada_df['UPDRS'] = ""
ada_df.loc[:4, 'UPDRS']=1
ada_df.loc[5:9, 'UPDRS']=2
ada_df.loc[10:14, 'UPDRS']=3
ada_df.loc[15:, 'UPDRS']=4

ada_df

Unnamed: 0,n_estimators,max_depth,learning_rate,actual depth reached,MAE,SMAPE,UPDRS
0,400,25,0.1,25,2.446126,51.825934,1
1,200,40,1.6,32,2.442748,50.958086,1
2,200,20,2.1,20,2.553201,49.724908,1
3,300,20,0.6,20,2.483693,51.70795,1
4,300,35,2.1,31,2.454198,48.092411,1
5,400,25,0.1,25,2.091603,51.476874,2
6,200,40,1.6,30,2.148855,53.080453,2
7,200,20,2.1,20,2.538168,62.744173,2
8,300,20,0.6,20,2.118863,52.357032,2
9,300,35,2.1,26,2.450382,61.925014,2


In [31]:
learning_rates = [0.1,0.5,1,5,10,20,50,100]

In [32]:
# ADABOOST REGRESSOR with decision tree regressor, testing for learning rate. 
# n_estimators = 500, max_depth = 40

X = big_data_imputed.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_imputed[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    for learning_rate in learning_rates:
        #fitting and testing
        dtr = DecisionTreeRegressor(max_depth=40)
        ada = AdaBoostRegressor(dtr, n_estimators=500, learning_rate=learning_rate)
        ada.fit(X_train,y_train)    
        y_predict = ada.predict(X_test)

        #scoring
        mae = mean_absolute_error(y_test, y_predict)
        smape = smape_score(y_test, y_predict)
        print(f'Learning Rate: {learning_rate}; MAE: {mae}; SMAPE: {smape}')

        #saving scores in their lists
        mae_list_ada.append(mae)
        smape_list_ada.append(smape)

        max_depth = list()
        for tree in ada.estimators_:
            max_depth.append(tree.tree_.max_depth)
        print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
        print(f"max depth {max(max_depth)}")
        print(f"min depth {min(max_depth)}")
        print()
        actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 2353 samples in training, 262 samples in testing.
Learning Rate: 0.1; MAE: 2.4664122137404583; SMAPE: 51.507290651038005
avg max depth 26.9
max depth 39
min depth 21

Learning Rate: 0.5; MAE: 2.480916030534351; SMAPE: 50.3701190184029
avg max depth 26.4
max depth 38
min depth 21

Learning Rate: 1; MAE: 2.481679389312977; SMAPE: 51.866671656171285
avg max depth 25.3
max depth 37
min depth 19

Learning Rate: 5; MAE: 6.141221374045801; SMAPE: 77.26496065461873
avg max depth 4.3
max depth 40
min depth 0

Learning Rate: 10; MAE: 23.980916030534353; SMAPE: 132.23819916257708
avg max depth 11.8
max depth 40
min depth 0

Learning Rate: 20; MAE: 25.98854961832061; SMAPE: 135.4376226609052
avg max depth 0.1
max depth 25
min depth 0

Learning Rate: 50; MAE: 7.011450381679389; SMAPE: 187.78625954198475
avg max depth 0.1
max depth 28
min depth 0

Learning Rate: 100; MAE: 7.011450381679389; SMAPE: 187.78625954198475
avg max depth 0.1
max depth 33
min depth 0


updrs_2: 2353 samples in train

In [35]:
learning_rates

[0.1, 0.5, 1, 5, 10, 20, 50, 100]

In [40]:
learning_rate_df = pd.DataFrame()
learning_rate_df['Learning Rate'] = learning_rates
learning_rate_df

Unnamed: 0,Learning Rate
0,0.1
1,0.5
2,1.0
3,5.0
4,10.0
5,20.0
6,50.0
7,100.0


In [43]:

ada_df1 = pd.concat([learning_rate_df, learning_rate_df, learning_rate_df, learning_rate_df], ignore_index=True)
ada_df1['n_estimators'] = 500
ada_df1['Max Depth'] = 40
ada_df1['actual depth reached'] = actual_depth_ada
ada_df1['MAE'] = mae_list_ada
ada_df1['SMAPE'] = smape_list_ada
ada_df1['UPDRS'] = ""
ada_df1.loc[:7, 'UPDRS']=1
ada_df1.loc[8:15, 'UPDRS']=2
ada_df1.loc[16:23, 'UPDRS']=3
ada_df1.loc[24:, 'UPDRS']=4


ada_df1

Unnamed: 0,Learning Rate,n_estimators,Max Depth,actual depth reached,MAE,SMAPE,UPDRS
0,0.1,500,40,39,2.466412,51.507291,1
1,0.5,500,40,38,2.480916,50.370119,1
2,1.0,500,40,37,2.481679,51.866672,1
3,5.0,500,40,40,6.141221,77.264961,1
4,10.0,500,40,40,23.980916,132.238199,1
5,20.0,500,40,25,25.98855,135.437623,1
6,50.0,500,40,28,7.01145,187.78626,1
7,100.0,500,40,33,7.01145,187.78626,1
8,0.1,500,40,33,2.076336,51.120318,2
9,0.5,500,40,35,2.076336,51.433138,2


Somewhat surprisingly, for all UPDRS scores, a higher than default learning rate (1) leads to much worse MAE and SMAPE scores. This is actually very good news, because a high learning rate tends to teach a model to overfit. By the same logic, a low learning rate that leads to good scores means the model will more likely do better with independent testing data. I also learned that a max depth of 40 is more than sufficient, because in the highest scoring parameters, only once did any decision tree reached the depth of 40. This can be verified if needed, but I think there is sufficient evidence to show that there is no need to go beyond max_depth of 40. 

I will now test to see if there is significant improvement in the scores with a lot of trees, by setting max depth to 40 and learning rate to 0.1. Previously we have seen some evidence that there is little additional gain after about 500 trees, but there were multiple variables. This time the only parameter we are iterating through is n_estimators. 

In [65]:
n_trees = [250,500,750,1000]
n_trees_df = pd.DataFrame()
n_trees_df['n_estimators'] = n_trees
n_trees_df

Unnamed: 0,n_estimators
0,250
1,500
2,750
3,1000


In [66]:
# ADABOOST REGRESSOR with decision tree regressor, testing for n_estimators. 
# max_depth=40, learning_rate=0.1
X = big_data_imputed.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_imputed[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    for n_estimators in n_trees:
        #fitting and testing
        dtr = DecisionTreeRegressor(max_depth=40)
        ada = AdaBoostRegressor(dtr, n_estimators=n_estimators, learning_rate=0.1)
        ada.fit(X_train,y_train)    
        y_predict = ada.predict(X_test)

        #scoring
        mae = mean_absolute_error(y_test, y_predict)
        smape = smape_score(y_test, y_predict)
        print(f'n_estimators: {n_estimators}; MAE: {mae}; SMAPE: {smape}')

        #saving scores in their lists
        mae_list_ada.append(mae)
        smape_list_ada.append(smape)

        max_depth = list()
        for tree in ada.estimators_:
            max_depth.append(tree.tree_.max_depth)
        print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
        print(f"max depth {max(max_depth)}")
        print(f"min depth {min(max_depth)}")
        print()
        actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 2353 samples in training, 262 samples in testing.
n_estimators: 250; MAE: 2.450381679389313; SMAPE: 51.815309693967315
avg max depth 26.9
max depth 35
min depth 20

n_estimators: 500; MAE: 2.4778625954198477; SMAPE: 51.89157511723342
avg max depth 26.8
max depth 40
min depth 21

n_estimators: 750; MAE: 2.4893129770992366; SMAPE: 51.49317784624379
avg max depth 27.0
max depth 37
min depth 21

n_estimators: 1000; MAE: 2.485496183206107; SMAPE: 51.78184813230503
avg max depth 26.7
max depth 40
min depth 20


updrs_2: 2353 samples in training, 262 samples in testing.
n_estimators: 250; MAE: 2.1030534351145036; SMAPE: 52.117356488794066
avg max depth 24.9
max depth 33
min depth 19

n_estimators: 500; MAE: 2.068702290076336; SMAPE: 51.480754655316495
avg max depth 25.0
max depth 36
min depth 19

n_estimators: 750; MAE: 2.1030534351145036; SMAPE: 52.980257518220355
avg max depth 25.1
max depth 34
min depth 19

n_estimators: 1000; MAE: 2.095419847328244; SMAPE: 53.370481933104884
avg 

In [67]:
ada_df2 = pd.concat([n_trees_df, n_trees_df, n_trees_df, n_trees_df], ignore_index=True)
ada_df2['Learning rate'] = 0.1
ada_df2['Max Depth'] = 40
ada_df2['actual depth reached'] = actual_depth_ada
ada_df2['MAE'] = mae_list_ada
ada_df2['SMAPE'] = smape_list_ada
ada_df2['UPDRS'] = ""
ada_df2.loc[:3, 'UPDRS']=1
ada_df2.loc[4:7, 'UPDRS']=2
ada_df2.loc[8:11, 'UPDRS']=3
ada_df2.loc[12:, 'UPDRS']=4


ada_df2

Unnamed: 0,n_estimators,Learning rate,Max Depth,actual depth reached,MAE,SMAPE,UPDRS
0,250,0.1,40,35,2.450382,51.81531,1
1,500,0.1,40,40,2.477863,51.891575,1
2,750,0.1,40,37,2.489313,51.493178,1
3,1000,0.1,40,40,2.485496,51.781848,1
4,250,0.1,40,33,2.103053,52.117356,2
5,500,0.1,40,36,2.068702,51.480755,2
6,750,0.1,40,34,2.103053,52.980258,2
7,1000,0.1,40,34,2.09542,53.370482,2
8,250,0.1,40,34,5.660348,51.528307,3
9,500,0.1,40,34,5.660348,51.674424,3


Because we don't have actual testing data (yet - the estimated public release date for the data is Oct. 1, 2023), and so far my model has been training/testing on imputed data. I will instead go back to the combined dataset prior to imputation (big_data), and randomly select 100 samples that have complete UPDRS 1-4 scores and set them aside for testing. Then I will impute the rest of the dataset, and retrain my model on it and see how it performs against the new testing data.

In [68]:
targets

['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

In [69]:
big_data_dropna = big_data.dropna(subset=targets)

In [70]:
big_data_dropna

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
3,55_9,55,9,8.0,9.0,30.0,0.0,,,,...,,,,,,,,,,
4,55_12,55,12,10.0,10.0,41.0,0.0,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
5,55_18,55,18,7.0,13.0,38.0,0.0,,,,...,,,,,,,,,,
6,55_24,55,24,16.0,9.0,49.0,0.0,,,,...,,,,,,,,,,
7,55_30,55,30,14.0,13.0,49.0,0.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,10589.6,902434.0,44890.8,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,,,,...,,,,,,,,,,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,,,,...,,,,,,,,,,
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,,,,...,,,,,,,,,,


In [75]:
big_data_dropna.isna().sum().head(10)

visit_id          0
patient_id        0
visit_month       0
updrs_1           0
updrs_2           0
updrs_3           0
updrs_4           0
O00391         1181
O00533          999
O00584         1011
dtype: int64

In [83]:

train, test = train_test_split(big_data_dropna, test_size=0.1)


In [84]:
train.count()

visit_id                 1405
patient_id               1405
visit_month              1405
updrs_1                  1405
updrs_2                  1405
                         ... 
YVNKEIQNAVNGVK            505
YWGVASFLQK                491
YYC(UniMod_4)FQGNQFLR     501
YYTYLIMNK                 458
YYWGGQYTWDMAK             372
Length: 1202, dtype: int64

In [85]:
test.count()

visit_id                 157
patient_id               157
visit_month              157
updrs_1                  157
updrs_2                  157
                        ... 
YVNKEIQNAVNGVK            58
YWGVASFLQK                57
YYC(UniMod_4)FQGNQFLR     58
YYTYLIMNK                 54
YYWGGQYTWDMAK             46
Length: 1202, dtype: int64

In [86]:
# ADABOOST REGRESSOR
# max_depth=40, learning_rate=0.1, n_estimators=500
X = big_data_dropna.drop(columns=targets, axis=1)

smape_list_ada = []
mae_list_ada = []
actual_depth_ada = []


for target in targets:

    y = big_data_dropna[target]
    
    #splitting training and testing data. 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)

    print(f'{target}: {len(X_train)} samples in training, {len(X_test)} samples in testing.')
    
    #fitting and testing
    dtr = DecisionTreeRegressor(max_depth=40)
    ada = AdaBoostRegressor(dtr, n_estimators=500, learning_rate=0.1)
    ada.fit(X_train,y_train)    
    y_predict = ada.predict(X_test)

    #scoring
    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)
    print(f'MAE: {mae}; SMAPE: {smape}')

    #saving scores in their lists
    mae_list_ada.append(mae)
    smape_list_ada.append(smape)

    max_depth = list()
    for tree in ada.estimators_:
        max_depth.append(tree.tree_.max_depth)
    print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
    print(f"max depth {max(max_depth)}")
    print(f"min depth {min(max_depth)}")
    print()
    actual_depth_ada.append(max(max_depth))
    print()

updrs_1: 1405 samples in training, 157 samples in testing.


ValueError: Input contains NaN