In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from pathlib import Path

# regressors we will use
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost

#imputers
from sklearn.impute import SimpleImputer, KNNImputer

# model selection bits
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, GroupShuffleSplit, GroupKFold
from sklearn.model_selection import learning_curve, validation_curve

# evaluation
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error


import scipy

Let's load all the data. 

In [2]:
# Read the four competition CSV files into pandas DataFrames.
# NOTE(review): `path` is an absolute, machine-specific directory; point it at
# your own copy of the competition data before running this notebook elsewhere.
path = Path("/Users/13392/Documents/amp-parkinsons-disease-progression-prediction")
proteins, peptides, clinical, supplemental = (
    pd.read_csv(path / filename)
    for filename in (
        "train_proteins.csv",
        "train_peptides.csv",
        "train_clinical_data.csv",
        "supplemental_clinical_data.csv",
    )
)

As discussed previously, we are dropping the entire "medication status" column (`upd23b_clinical_state_on_medication`), because:
1) Over 50% of its values are NaN.
2) The test dataset will not have this data.

In [3]:
# Keep an untouched copy of the clinical table for later access, then remove
# the "medication status" column (over 50% of its values are NaN).
clinical_copy = clinical.copy()
clinical = clinical.drop(columns=['upd23b_clinical_state_on_medication'])

In [4]:

# Column-name groups reused by later cells: the four UPDRS score columns,
# the identifier columns, and the visit-month column.
targets, ids, month = (
    ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4'],
    ['patient_id', 'visit_id'],
    ['visit_month'],
)

Let's see how many NaN values remain.

In [5]:
# Print the number of missing values in each column of the clinical table,
# then display the table itself as the cell output.
print(f'NaN value count:\n{clinical.isna().sum()}')
clinical

NaN value count:
visit_id          0
patient_id        0
visit_month       0
updrs_1           1
updrs_2           2
updrs_3          25
updrs_4        1038
dtype: int64


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,10.0,6.0,15.0,
1,55_3,55,3,10.0,7.0,25.0,
2,55_6,55,6,8.0,10.0,34.0,
3,55_9,55,9,8.0,9.0,30.0,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0
...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0
2612,65043_60,65043,60,6.0,6.0,16.0,1.0
2613,65043_72,65043,72,3.0,9.0,14.0,1.0


Significant, but manageable with some kind of imputation. Let's count the number of visits each patient has on record. 

In [6]:
# Number of clinical visits on record for each patient, listed in the order in
# which patients first appear in `clinical`.
# A single groupby pass replaces the earlier loop, which rescanned the whole
# table with a boolean mask once per patient.
df_visits_by_patient = (
    clinical
    .groupby('patient_id', sort=False)
    .size()
    .reset_index(name='num_entries')
)

df_visits_by_patient


Unnamed: 0,patient_id,num_entries
0,55,13
1,942,15
2,1517,10
3,1923,7
4,2660,6
...,...,...
243,63875,9
244,63889,10
245,64669,15
246,64674,16


Next, let's check for NaN values in the "Proteins" and "Peptides" datasets.

In [7]:
# Print the number of missing values in each column of the long-format protein
# table, then display the table itself.
print(f'NaN value count:\n{proteins.isna().sum()}')
proteins

NaN value count:
visit_id       0
visit_month    0
patient_id     0
UniProt        0
NPX            0
dtype: int64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0
...,...,...,...,...,...
232736,58648_108,108,58648,Q9UBX5,27387.8
232737,58648_108,108,58648,Q9UHG2,369437.0
232738,58648_108,108,58648,Q9UKV8,105830.0
232739,58648_108,108,58648,Q9Y646,21257.6


In [8]:
# Print the number of missing values in each column of the long-format peptide
# table, then display the table itself.
print(f'NaN value count:\n{peptides.isna().sum()}')

peptides

NaN value count:
visit_id            0
visit_month         0
patient_id          0
UniProt             0
Peptide             0
PeptideAbundance    0
dtype: int64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.30
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.00
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.00
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.70
...,...,...,...,...,...,...
981829,58648_108,108,58648,Q9UHG2,ILAGSADSEGVAAPR,202820.00
981830,58648_108,108,58648,Q9UKV8,SGNIPAGTTVDTK,105830.00
981831,58648_108,108,58648,Q9Y646,LALLVDTVGPR,21257.60
981832,58648_108,108,58648,Q9Y6R7,AGC(UniMod_4)VAESTAVC(UniMod_4)R,5127.26


Great! No NaN values at all in them. Let's find out how many of each patient's visits have protein/peptide data. 

# NOTE(review): this snippet repeats the clinical visits-per-patient count and
# rebuilds df_visits_by_patient from `clinical`; it carries no execution marker,
# so confirm whether it is meant to be a live code cell.
cols = ['patient_id', 'num_entries']
patient_list = clinical.patient_id.unique()
n_list = []
p_list = []

for patient in patient_list:
    # number of clinical rows belonging to this patient
    n=len(clinical[clinical.patient_id==patient].index)

    n_list.append(n)
    p_list.append(patient)

df_visits_by_patient = pd.DataFrame(list(zip(p_list, n_list)), columns=cols)

df_visits_by_patient

In [14]:
# Number of distinct visits with protein measurements for each patient, listed
# in the order in which patients first appear in `proteins`.
# De-duplicating (patient_id, visit_id) pairs and counting them per patient
# replaces the earlier loop, which filtered the full table once per patient.
df_recorded_visits_protein = (
    proteins[['patient_id', 'visit_id']]
    .drop_duplicates()
    .groupby('patient_id', sort=False)
    .size()
    .reset_index(name='num_entries_protein')
)

df_recorded_visits_protein


Unnamed: 0,patient_id,num_entries_protein
0,55,4
1,1517,4
2,1923,3
3,2660,5
4,3636,3
...,...,...
243,52998,3
244,54979,3
245,58597,3
246,7508,3


In [15]:
# Number of distinct visits with peptide measurements for each patient, listed
# in the order in which patients first appear in `peptides`.
# De-duplicating (patient_id, visit_id) pairs and counting them per patient
# replaces the earlier loop, which filtered the ~1M-row table once per patient.
df_recorded_visits_peptide = (
    peptides[['patient_id', 'visit_id']]
    .drop_duplicates()
    .groupby('patient_id', sort=False)
    .size()
    .reset_index(name='num_entries_peptide')
)

df_recorded_visits_peptide

Unnamed: 0,patient_id,num_entries_peptide
0,55,4
1,1517,4
2,1923,3
3,2660,5
4,3636,3
...,...,...
243,52998,3
244,54979,3
245,58597,3
246,7508,3


In [29]:
# Line up the three per-patient counts side by side: clinical visits first,
# then the protein and peptide visit counts joined on patient_id.
df = df_visits_by_patient.merge(
    df_recorded_visits_protein.merge(
        df_recorded_visits_peptide, how='left', on='patient_id'
    ),
    how='left',
    on='patient_id',
)
df.head(10)

Unnamed: 0,patient_id,num_entries,num_entries_protein,num_entries_peptide
0,55,13,4,4
1,942,15,4,4
2,1517,10,4,4
3,1923,7,3,3
4,2660,6,5,5
5,3636,14,3,3
6,3863,9,5,5
7,4161,12,6,6
8,4172,8,7,7
9,4923,11,5,5


It's not looking great - we can clearly see that, for most of the patients, only about one third to one half of the visits contain protein and peptide records.

Something to think about for later on...

Now we will "pivot" the datasets so the unique coding for each protein/peptide becomes a feature for the models to learn on. 

In [30]:
# Reshape the long tables to wide form: one row per (patient, month, visit) and
# one column per UniProt ID (proteins) or per peptide sequence (peptides).
df_proteins = (
    proteins
    .pivot(
        index=['patient_id', 'visit_month', 'visit_id'],
        columns='UniProt',
        values='NPX',
    )
    .rename_axis(columns=None)
    .reset_index()
)

df_peptides = (
    peptides
    .pivot(
        index=['patient_id', 'visit_month', 'visit_id'],
        columns='Peptide',
        values='PeptideAbundance',
    )
    .rename_axis(columns=None)
    .reset_index()
)


In [31]:
# Display the pivoted protein table (one column per UniProt ID).
df_proteins

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,177775.0,...,365475.0,35528.00,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,165638.0,...,405676.0,30332.60,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,151073.0,...,303953.0,43026.20,114921.0,21860.1,61598.2,318553.0,65762.6,29193.4,28536.1,19290.9
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,101056.0,...,303597.0,48188.40,109794.0,23930.6,70223.5,377550.0,74976.1,31732.6,22186.5,21717.1
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,253373.0,27431.80,93796.7,17450.9,21299.1,306621.0,82335.5,24018.7,18939.5,15251.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,,190487.0,24907.9,18543.1,10124.90,2308.71,62095.4,...,260021.0,7139.93,104277.0,10500.0,21944.2,136725.0,62217.5,,10287.7,13848.2
1109,65043,0,65043_0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,266339.0,...,186414.0,25897.80,,21480.7,57364.0,416142.0,37584.6,,28346.5,35617.5
1110,65043,12,65043_12,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,270575.0,...,301343.0,22343.40,105626.0,20500.8,54011.2,380072.0,40588.9,,17035.7,37064.2
1111,65043,24,65043_24,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,221358.0,...,300439.0,52143.60,139291.0,19449.2,66569.9,300948.0,36150.4,,21286.3,39587.9


In [32]:
# Display the pivoted peptide table (one column per peptide sequence).
df_peptides

Unnamed: 0,patient_id,visit_month,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,8984260.0,53855.6,8579740.0,,19735.4,114400.0,46371.1,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,8279770.0,45251.9,8655890.0,49927.5,23820.4,90539.4,38652.4,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,8382390.0,53000.9,8995640.0,45519.2,17813.5,147312.0,45840.9,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,10671500.0,58108.4,9985420.0,52374.0,19373.3,64356.1,49793.2,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,6177730.0,42682.6,3596660.0,25698.8,17130.6,86471.5,41007.9,...,226314.0,6399.80,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,7083630.0,35656.1,6273100.0,,,15479.2,,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,65043,0,65043_0,7818630.0,95033.0,5119260.0,57483.7,11610.0,270739.0,42527.3,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,65043,12,65043_12,8070390.0,76532.7,8233520.0,54260.6,11631.9,230169.0,42255.5,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,65043,24,65043_24,7608150.0,75401.6,9168030.0,,13313.9,220202.0,46914.1,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


Because not every one of the 1000+ proteins and peptides is measured at each recorded visit, some NaN values should now be expected.

In [33]:
# Missing-value count per column of the pivoted protein table, largest first.
df_proteins.isna().sum().sort_values(ascending=False)

Q99829        624
Q99832        507
Q562R1        497
P01780        459
Q6UX71        452
             ... 
P02766          0
P02765          0
P02751          0
P02749          0
patient_id      0
Length: 230, dtype: int64

In [34]:
# Missing-value count per column of the pivoted peptide table, largest first.
df_peptides.isna().sum().sort_values(ascending=False)

QALPQVR                   624
EPQVYTLPPSRDELTK          550
TPSGLYLGTC(UniMod_4)ER    523
SLEDQVEMLR                514
HYEGSTVPEK                508
                         ... 
visit_id                    0
IPTTFENGR                   0
AIGYLNTGYQR                 0
NILTSNNIDVK                 0
patient_id                  0
Length: 971, dtype: int64

We are going to combine the protein and peptide data, and check once again the status of NaN values. They should remain unchanged because we haven't done anything with them.

In [35]:
# Attach the peptide columns to the protein columns for each recorded visit
# (left join keyed on patient, month and visit).
prot_pept_df = df_proteins.merge(
    df_peptides,
    how='left',
    on=['patient_id', 'visit_month', 'visit_id'],
)
prot_pept_df

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,177775.0,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,165638.0,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,151073.0,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,101056.0,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,226314.0,6399.80,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,,190487.0,24907.9,18543.1,10124.90,2308.71,62095.4,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,65043,0,65043_0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,266339.0,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,65043,12,65043_12,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,270575.0,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,65043,24,65043_24,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,221358.0,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


In [36]:
# Missing-value count per column of the combined protein/peptide table, largest first.
prot_pept_df.isna().sum().sort_values(ascending=False)

Q99829                    624
QALPQVR                   624
EPQVYTLPPSRDELTK          550
TPSGLYLGTC(UniMod_4)ER    523
SLEDQVEMLR                514
                         ... 
P41222                      0
P02774                      0
P02787                      0
P02790                      0
patient_id                  0
Length: 1198, dtype: int64

In [37]:
# Distinct patient IDs present in the combined protein/peptide table,
# in order of first appearance.
patient_list = pd.unique(prot_pept_df['patient_id'])
patient_list

array([   55,   942,  1517,  1923,  2660,  3636,  3863,  4161,  4172,
        4923,  5027,  5036,  5178,  5645,  5742,  6054,  6211,  6420,
        7051,  7117,  7151,  7265,  7508,  7568,  7832,  7886,  8344,
        8699, 10053, 10138, 10174, 10541, 10715, 10718, 11459, 11686,
       11928, 12516, 12636, 12703, 12755, 12931, 13360, 13368, 13618,
       13804, 13852, 13968, 14035, 14124, 14242, 14270, 14344, 14450,
       14811, 15009, 15245, 15504, 15590, 16238, 16347, 16566, 16574,
       16778, 16931, 17154, 17201, 17414, 17727, 18183, 18204, 18553,
       18560, 19088, 20212, 20216, 20352, 20404, 20460, 20581, 20664,
       20707, 20791, 20792, 21126, 21537, 21729, 22126, 22623, 23175,
       23192, 23244, 23391, 23636, 24278, 24690, 24818, 24820, 24911,
       25562, 25739, 25750, 25827, 25911, 26005, 26104, 26210, 26809,
       27079, 27300, 27464, 27468, 27607, 27715, 27872, 27893, 27971,
       27987, 28327, 28342, 28818, 29313, 29417, 30119, 30155, 30416,
       30894, 30951,

In [None]:
# pd.merge needs both a left and a right frame; the previous one-argument call
# raised TypeError before producing anything.
# NOTE(review): the intended right-hand table was not specified. This assumes
# the goal was to attach the protein/peptide features to every clinical visit
# (left join on the shared visit keys) - confirm or remove this cell, since
# `big_data` is not used by the cells that follow.
big_data = pd.merge(
    clinical,
    prot_pept_df,
    on=['patient_id', 'visit_month', 'visit_id'],
    how='left',
)

Here is a big step: imputation. I am choosing to use sklearn's KNN imputer because, with a bit of clever coding, I can use all the available data for a given patient to impute missing values.

In [None]:
# Impute missing protein/peptide abundances one patient at a time, so that only
# that patient's own visits act as donors.
#
# visit_id is a string key (e.g. '55_0') that KNNImputer cannot convert to
# float, so it is kept out of the imputer input and re-attached unchanged.
numeric_cols = [col for col in prot_pept_df.columns if col != 'visit_id']

data_imputed_list = []
for patient_id, masked_data in prot_pept_df.groupby('patient_id', sort=False):
    num_rows = len(masked_data.index)
    # n_neighbors equals the number of this patient's visits, so every visit of
    # the patient is a candidate neighbour. keep_empty_features=True keeps
    # columns that are entirely NaN for this patient (so the column count stays
    # fixed); per the scikit-learn docs such columns are filled with 0.
    knn = KNNImputer(missing_values=np.nan, keep_empty_features=True, n_neighbors=num_rows)
    X_knn_df = pd.DataFrame(
        knn.fit_transform(masked_data[numeric_cols]),
        columns=numeric_cols,
        index=masked_data.index,  # keep the original row labels
    )
    X_knn_df['visit_id'] = masked_data['visit_id']
    # restore the original column order (visit_id back in its usual position)
    data_imputed_list.append(X_knn_df[prot_pept_df.columns])

# sort_index puts the rows back in the same order as prot_pept_df
prot_pept_imputed = pd.concat(data_imputed_list).sort_index()

In [None]:
# Missing-value count per column after imputation.
prot_pept_imputed.isna().sum()

In [None]:
# Renumber the rows from 0; the previous index is kept as a regular 'index' column.
prot_pept_imputed = prot_pept_imputed.reset_index()
prot_pept_imputed

In [None]:
# Discard the leftover 'index' column produced by the reset_index() call.
prot_pept_imputed = prot_pept_imputed.drop(columns=['index'])

In [None]:
# Cast the key columns back to integers and (re)attach visit_id from the
# un-imputed table; the assignment is aligned on the row index.
prot_pept_imputed = (
    prot_pept_imputed
    .astype({'patient_id': 'int', 'visit_month': 'int'})
    .assign(visit_id=prot_pept_df['visit_id'])
)
print(prot_pept_imputed.dtypes)

In [None]:
# Display the imputed protein/peptide table.
prot_pept_imputed

In [None]:
# Missing-value count per column and overall shape of the clinical table
# before it is imputed.
print(clinical.isna().sum())
print(f'dataset shape: {clinical.shape}')

In [None]:
# Impute missing clinical values one patient at a time, so that only that
# patient's own visits act as donors.
#
# Two fixes compared with the previous loop:
#   * visit_id is a string key (e.g. '55_0') that KNNImputer cannot convert to
#     float, so it is kept out of the imputer input and re-attached unchanged;
#   * patients are taken from `clinical` itself rather than from the patient
#     list of the protein/peptide table, and the original row labels are kept,
#     so the result has the same rows, in the same order, as `clinical`.
numeric_cols = [col for col in clinical.columns if col != 'visit_id']

data_imputed_list = []
for patient_id, masked_data in clinical.groupby('patient_id', sort=False):
    num_rows = len(masked_data.index)
    # n_neighbors equals the number of this patient's visits, so every visit of
    # the patient is a candidate neighbour. keep_empty_features=True keeps
    # columns that are entirely NaN for this patient; per the scikit-learn docs
    # such columns are filled with 0.
    # NOTE(review): this also imputes the updrs_* target columns - confirm
    # that 0 / neighbour-mean targets are acceptable for training.
    knn = KNNImputer(missing_values=np.nan, keep_empty_features=True, n_neighbors=num_rows)
    X_knn_df = pd.DataFrame(
        knn.fit_transform(masked_data[numeric_cols]),
        columns=numeric_cols,
        index=masked_data.index,  # keep the original row labels
    )
    X_knn_df['visit_id'] = masked_data['visit_id']
    # restore the original column order (visit_id first)
    data_imputed_list.append(X_knn_df[clinical.columns])

# sort_index puts the rows back in the same order as `clinical`
clinical_imputed = pd.concat(data_imputed_list).sort_index()

In [None]:
# Missing-value count per column and overall shape of the imputed clinical table.
print(clinical_imputed.isna().sum())
print(f'imputed clinical dataset shape: {clinical_imputed.shape}')

In [None]:
# Renumber the rows from 0 without keeping the old index as a column.
clinical_imputed = clinical_imputed.reset_index(drop=True)
clinical_imputed

In [None]:
# Cast the key columns back to integers and (re)attach visit_id from the
# original clinical table; the assignment is aligned on the row index.
clinical_imputed = (
    clinical_imputed
    .astype({'patient_id': 'int', 'visit_month': 'int'})
    .assign(visit_id=clinical['visit_id'])
)
print(clinical_imputed.dtypes)

In [None]:

# Attach the imputed clinical columns to every imputed protein/peptide visit
# (left join, so visits without a clinical row keep NaN in the clinical columns).
prot_pept_clinical = prot_pept_imputed.merge(
    clinical_imputed,
    how='left',
    on=['visit_id', 'visit_month', 'patient_id'],
)


In [None]:
# Missing-value count per column and overall shape of the merged table.
print(prot_pept_clinical.isna().sum())
print(f'merged protein peptide and clinical dataset shape: {prot_pept_clinical.shape}')

In [None]:
# Narrow the merged table down to the cells that still hold NaN:
# first keep only columns containing at least one NaN ...
columns_with_nan = prot_pept_clinical.isna().any()
df = prot_pept_clinical.loc[:, columns_with_nan]

# ... then keep only rows that contain at least one NaN in those columns.
rows_with_nan = df.isna().any(axis=1)
df1 = df.loc[rows_with_nan]

print(df1)
print()
print(f'Remaining NaN values is a dataframe of: {df1.shape}')

In [None]:
# Discard every visit that lacks a value for any of the target columns, then
# re-check the per-column NaN counts. The surviving rows keep their original
# index labels, so the index is no longer a gap-free 0..n-1 range.
prot_pept_clinical = prot_pept_clinical.dropna(axis=0, subset=targets)
prot_pept_clinical.isna().sum()

In [None]:
def smape_score(actual, predicted):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Each pair contributes ``2 * |p - a| / (|a| + |p|)``; a pair where both the
    actual and the predicted value are 0 contributes 0. The result is the mean
    contribution over all pairs, multiplied by 100 (range 0 to 200).

    Parameters
    ----------
    actual : array-like of numbers
        Ground-truth values (list, NumPy array, pandas Series, ...).
    predicted : array-like of numbers
        Predicted values, same length as ``actual``. Values are paired by
        position; any pandas index is ignored.

    Returns
    -------
    float
        SMAPE in percent.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same shape.
    """
    actual_values = np.asarray(actual, dtype=float)
    predicted_values = np.asarray(predicted, dtype=float)

    if actual_values.shape != predicted_values.shape:
        raise ValueError(
            f'actual and predicted must have the same shape, got '
            f'{actual_values.shape} and {predicted_values.shape}'
        )
    if actual_values.size == 0:
        raise ValueError('smape_score requires at least one value')

    abs_error = np.abs(predicted_values - actual_values)
    scale = np.abs(predicted_values) + np.abs(actual_values)

    # scale == 0 only when both values are 0; those pairs count as a perfect
    # prediction (contribution 0) instead of dividing 0 by 0.
    contributions = np.zeros_like(scale)
    np.divide(2.0 * abs_error, scale, out=contributions, where=scale != 0)

    return float(contributions.mean() * 100)

In [None]:
# Empty DataFrame (no columns, no rows) created as a placeholder named `results`.
# NOTE(review): none of the model cells below write to it - confirm it is still needed.
results = pd.DataFrame(columns=[])

In [None]:
# Number of rows left for each patient in the merged modelling table, listed in
# the order in which patients first appear.
# A single groupby pass replaces the earlier loop, which rescanned the whole
# table with a boolean mask once per patient.
patient_list = prot_pept_clinical.patient_id.unique()

df = (
    prot_pept_clinical
    .groupby('patient_id', sort=False)
    .size()
    .reset_index(name='num_entries')
)

df

In [None]:
# Group-aware splitter: two random train/test partitions, each holding out 10%
# of the groups, with a fixed seed so the partitions are repeatable.
# (A GroupKFold with one fold per patient was an earlier idea, left disabled:
#  group_k_fold = GroupKFold(n_splits=num_patients))
gss = GroupShuffleSplit(
    test_size=0.1,
    n_splits=2,
    random_state=100,
)

In [None]:
# Build patient-grouped train/test partitions for every target.
#
# gss.split() yields *positional* row numbers. prot_pept_clinical has had rows
# removed by dropna, so its index labels are no longer 0..n-1; rows are
# therefore selected with .iloc (by position) rather than .loc (by label),
# which would raise KeyError or pick the wrong rows.
#
# After the loops finish, X_train / X_test / y_train / y_test hold the last
# partition of the last target only.
for target in targets:
    X = prot_pept_clinical.drop(columns=targets)
    y = prot_pept_clinical[target]

    for i, (train_index, test_index) in enumerate(gss.split(X, y, groups=X['patient_id'])):

        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]


In [None]:
# Random Forest Regressor: one model per target, scored on a patient-grouped
# hold-out set.
smape_list = []
mae_list = []

for target in targets:

    # visit_id is a string key ('<patient>_<month>') and cannot be used as a
    # numeric feature, so it is dropped together with the target columns.
    X = prot_pept_clinical.drop(columns=targets + ['visit_id'])
    y = prot_pept_clinical[target]

    # GroupShuffleSplit.split() accepts only (X, y, groups); the test fraction
    # is configured on `gss` itself. `groups` must be the per-row group labels,
    # not a column name. The first of the generated partitions is used.
    train_index, test_index = next(gss.split(X, y, groups=X['patient_id']))

    # split() returns positional indices, hence .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    # fixed seed so repeated runs give the same scores
    RFR = RandomForestRegressor(n_estimators=100, max_features=100, random_state=100)

    RFR.fit(X_train, y_train)

    y_predict = RFR.predict(X_test)

    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)

    smape_list.append(smape)
    mae_list.append(mae)

    print(f'MAE score for {target}: {mae}')
    print(f'SMAPE score for {target}: {smape}')
    print()

# averages over the four targets
mae_average = np.mean(mae_list)
smape_average = np.mean(smape_list)

print(f'MAE average score: {mae_average}')
print(f'SMAPE average score: {smape_average}')


In [None]:
# HistGradientBoostingRegressor: one model per target, scored on a 10% hold-out set.
smape_list = []
mae_list = []

for target in targets:

    # visit_id is a string key ('<patient>_<month>') and cannot be used as a
    # numeric feature, so it is dropped together with the target columns.
    X = prot_pept_clinical.drop(columns=targets + ['visit_id'])
    y = prot_pept_clinical[target]

    # NOTE(review): train_test_split samples rows at random, so visits of the
    # same patient can end up on both sides of the split. Consider the
    # patient-grouped splitter (`gss`) for a leakage-free estimate.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=100)

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    hgb = HistGradientBoostingRegressor()

    hgb.fit(X_train, y_train)

    y_predict = hgb.predict(X_test)

    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)

    smape_list.append(smape)
    mae_list.append(mae)

    print(f'MAE score for {target}: {mae}')
    print(f'SMAPE score for {target}: {smape}')
    print()

# averages over the four targets
mae_average = np.mean(mae_list)
smape_average = np.mean(smape_list)

print(f'MAE average score: {mae_average}')
print(f'SMAPE average score: {smape_average}')


In [None]:
# AdaBoost regressor with decision-tree base learners: one model per target,
# scored on a 10% hold-out set.
smape_list = []
mae_list = []

for target in targets:

    # visit_id is a string key ('<patient>_<month>') and cannot be used as a
    # numeric feature, so it is dropped together with the target columns.
    X = prot_pept_clinical.drop(columns=targets + ['visit_id'])
    y = prot_pept_clinical[target]

    # NOTE(review): train_test_split samples rows at random, so visits of the
    # same patient can end up on both sides of the split. Consider the
    # patient-grouped splitter (`gss`) for a leakage-free estimate.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=100)

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    # fixed seed so repeated runs give the same scores
    ada = AdaBoostRegressor(DecisionTreeRegressor(max_features=100), random_state=100)

    ada.fit(X_train, y_train)

    y_predict = ada.predict(X_test)

    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)

    smape_list.append(smape)
    mae_list.append(mae)

    print(f'MAE score for {target}: {mae}')
    print(f'SMAPE score for {target}: {smape}')
    print()

# Averages over the four targets, reported once after the loop (as in the other
# model cells) instead of as a running average after every target.
mae_average = np.mean(mae_list)
smape_average = np.mean(smape_list)

print(f'MAE average score: {mae_average}')
print(f'SMAPE average score: {smape_average}')
print()

In [None]:
# XGBoost regressor: one model per target, scored on a 10% hold-out set.
# The feature matrix is the merged table without the string visit_id key and
# without the target columns.
X1 = prot_pept_clinical.drop(columns='visit_id')
X = X1.drop(columns=targets)

smape_list = []
mae_list = []

for target in targets:

    y = prot_pept_clinical[target]

    # NOTE(review): train_test_split samples rows at random, so visits of the
    # same patient can end up on both sides of the split. Consider the
    # patient-grouped splitter (`gss`) for a leakage-free estimate.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=100)

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    # 'reg:squarederror' is the current name of the squared-error objective;
    # the old alias 'reg:linear' is deprecated and rejected by recent XGBoost.
    xgbr = xgboost.XGBRegressor(objective='reg:squarederror', n_estimators=100)

    xgbr.fit(X_train, y_train)

    y_predict = xgbr.predict(X_test)

    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)

    smape_list.append(smape)
    mae_list.append(mae)

    print(f'MAE score for {target}: {mae}')
    print(f'SMAPE score for {target}: {smape}')
    print()

# averages over the four targets
mae_average = np.mean(mae_list)
smape_average = np.mean(smape_list)

print(f'MAE average score: {mae_average}')
print(f'SMAPE average score: {smape_average}')

In [None]:
# Store the class name of the fitted XGBoost model.
# NOTE(review): assigning to `str` shadows the built-in str type for the rest
# of the session (e.g. str(x) stops working) - consider a name such as
# `model_name`, after checking later cells that may read `str`.
str = xgbr.__class__.__name__