In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from pathlib import Path

# classifiers we will use
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost

#imputers
from sklearn.impute import SimpleImputer, KNNImputer

# model selection bits
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import learning_curve, validation_curve

# evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error


import scipy

In [2]:
# Load the competition datasets into Pandas DataFrame
proteins = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_proteins.csv")
peptides = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_peptides.csv")
clinical = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
supplemental = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv")

In [3]:
# drop the "medication status" column (due to over 50% NaN values), keep a copy of the original
# for later access. 
clinical_copy = clinical.copy()

clinical.drop('upd23b_clinical_state_on_medication', axis=1, inplace=True)

In [4]:

targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
ids = ['patient_id', 'visit_id']
month = ['visit_month']

In [5]:
print(f'NaN value count:\n{clinical.isna().sum()}')
clinical

NaN value count:
visit_id          0
patient_id        0
visit_month       0
updrs_1           1
updrs_2           2
updrs_3          25
updrs_4        1038
dtype: int64


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,10.0,6.0,15.0,
1,55_3,55,3,10.0,7.0,25.0,
2,55_6,55,6,8.0,10.0,34.0,
3,55_9,55,9,8.0,9.0,30.0,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0
...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0
2611,65043_54,65043,54,4.0,8.0,11.0,1.0
2612,65043_60,65043,60,6.0,6.0,16.0,1.0
2613,65043_72,65043,72,3.0,9.0,14.0,1.0


In [6]:
print(f'NaN value count:\n{proteins.isna().sum()}')
proteins

NaN value count:
visit_id       0
visit_month    0
patient_id     0
UniProt        0
NPX            0
dtype: int64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0
...,...,...,...,...,...
232736,58648_108,108,58648,Q9UBX5,27387.8
232737,58648_108,108,58648,Q9UHG2,369437.0
232738,58648_108,108,58648,Q9UKV8,105830.0
232739,58648_108,108,58648,Q9Y646,21257.6


In [7]:
print(f'NaN value count:\n{peptides.isna().sum()}')

peptides

NaN value count:
visit_id            0
visit_month         0
patient_id          0
UniProt             0
Peptide             0
PeptideAbundance    0
dtype: int64


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.30
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.00
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.00
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.70
...,...,...,...,...,...,...
981829,58648_108,108,58648,Q9UHG2,ILAGSADSEGVAAPR,202820.00
981830,58648_108,108,58648,Q9UKV8,SGNIPAGTTVDTK,105830.00
981831,58648_108,108,58648,Q9Y646,LALLVDTVGPR,21257.60
981832,58648_108,108,58648,Q9Y6R7,AGC(UniMod_4)VAESTAVC(UniMod_4)R,5127.26


In [8]:

proteins_grouped = proteins.groupby(['patient_id','visit_month', 'visit_id', 'UniProt'])['NPX'].mean().reset_index()
peptides_grouped = peptides.groupby(['patient_id','visit_month', 'visit_id', 'Peptide'])['PeptideAbundance'].mean().reset_index()

In [9]:
proteins_grouped.head(20)

Unnamed: 0,patient_id,visit_month,visit_id,UniProt,NPX
0,55,0,55_0,O00391,11254.3
1,55,0,55_0,O00533,732430.0
2,55,0,55_0,O00584,39585.8
3,55,0,55_0,O14498,41526.9
4,55,0,55_0,O14773,31238.0
5,55,0,55_0,O14791,4202.71
6,55,0,55_0,O15240,177775.0
7,55,0,55_0,O15394,62898.2
8,55,0,55_0,O43505,333376.0
9,55,0,55_0,O60888,166850.0


In [10]:
df_proteins = proteins_grouped.pivot(index=['patient_id', 'visit_month', 'visit_id'], columns='UniProt', values='NPX').rename_axis(columns=None).reset_index()
df_peptides = peptides_grouped.pivot(index=['patient_id', 'visit_month', 'visit_id'], columns='Peptide', values='PeptideAbundance').rename_axis(columns=None).reset_index()


In [11]:
df_proteins.head(20)

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.0,4202.71,177775.0,...,365475.0,35528.0,97005.6,23122.5,60912.6,408698.0,,29758.8,23833.7,18953.5
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.9,4416.42,165638.0,...,405676.0,30332.6,109174.0,23499.8,51655.8,369870.0,,22935.2,17722.5,16642.7
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.6,4343.6,151073.0,...,303953.0,43026.2,114921.0,21860.1,61598.2,318553.0,65762.6,29193.4,28536.1,19290.9
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.6,5367.06,101056.0,...,303597.0,48188.4,109794.0,23930.6,70223.5,377550.0,74976.1,31732.6,22186.5,21717.1
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,253373.0,27431.8,93796.7,17450.9,21299.1,306621.0,82335.5,24018.7,18939.5,15251.2
5,942,12,942_12,6757.32,360858.0,18367.6,14760.7,18603.4,1722.77,86847.4,...,45742.3,33518.6,94049.7,13415.7,21324.7,234094.0,82410.4,19183.7,17804.1,12277.0
6,942,24,942_24,,352722.0,22834.9,23393.1,16693.5,1487.91,114772.0,...,180475.0,29770.6,95949.9,11344.4,23637.6,256654.0,76931.9,19168.2,19215.9,14625.6
7,942,48,942_48,11627.8,251820.0,22046.5,26360.5,22440.2,2117.43,82241.9,...,197987.0,29283.8,121696.0,19169.8,16724.9,232301.0,96905.9,21120.9,14089.8,16418.5
8,1517,0,1517_0,11648.9,419015.0,35053.8,,17466.3,2784.4,118752.0,...,306280.0,24931.3,,12610.9,40018.9,164826.0,55379.8,31563.0,19771.6,14699.5
9,1517,24,1517_24,11671.9,438138.0,33740.8,21540.9,23847.4,3775.48,73355.2,...,196768.0,25513.1,,17933.3,25719.3,216029.0,57261.8,18728.5,14642.8,24159.4


In [12]:
df_peptides.head(20)

Unnamed: 0,patient_id,visit_month,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,8984260.0,53855.6,8579740.0,,19735.4,114400.0,46371.1,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,8279770.0,45251.9,8655890.0,49927.5,23820.4,90539.4,38652.4,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,8382390.0,53000.9,8995640.0,45519.2,17813.5,147312.0,45840.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,10671500.0,58108.4,9985420.0,52374.0,19373.3,64356.1,49793.2,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,6177730.0,42682.6,3596660.0,25698.8,17130.6,86471.5,41007.9,...,226314.0,6399.8,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
5,942,12,942_12,11289900.0,46111.7,11297300.0,,13894.1,53755.0,40289.3,...,173259.0,4767.63,374307.0,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,
6,942,24,942_24,10161900.0,32145.0,12388000.0,25869.2,17341.8,48625.5,45223.9,...,185428.0,5554.53,,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.6
7,942,48,942_48,8248490.0,30563.4,11882600.0,,19114.9,60221.4,46685.9,...,137611.0,6310.09,,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.3
8,1517,0,1517_0,9465580.0,36200.7,8639050.0,32892.8,16584.9,34969.8,39739.1,...,210259.0,6789.37,3956620.0,101069.0,767457.0,65663.2,113977.0,468234.0,44771.3,20695.0
9,1517,24,1517_24,6282660.0,51011.9,7494410.0,,845.74,77770.8,,...,218111.0,10053.5,3952000.0,152704.0,1079680.0,102169.0,129905.0,373894.0,58716.7,26098.3


In [13]:
df_proteins.isna().sum().sort_values(ascending=False)

Q99829        624
Q99832        507
Q562R1        497
P01780        459
Q6UX71        452
             ... 
P02766          0
P02765          0
P02751          0
P02749          0
patient_id      0
Length: 230, dtype: int64

In [14]:
df_peptides.isna().sum().sort_values(ascending=False)

QALPQVR                   624
EPQVYTLPPSRDELTK          550
TPSGLYLGTC(UniMod_4)ER    523
SLEDQVEMLR                514
HYEGSTVPEK                508
                         ... 
visit_id                    0
IPTTFENGR                   0
AIGYLNTGYQR                 0
NILTSNNIDVK                 0
patient_id                  0
Length: 971, dtype: int64

In [15]:
prot_pept_df = pd.merge(df_proteins, df_peptides, on=['patient_id','visit_month','visit_id'], how='left')
prot_pept_df

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,177775.0,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,165638.0,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,151073.0,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,101056.0,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,226314.0,6399.80,,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,,190487.0,24907.9,18543.1,10124.90,2308.71,62095.4,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,65043,0,65043_0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,266339.0,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,65043,12,65043_12,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,270575.0,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,65043,24,65043_24,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,221358.0,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


In [16]:
prot_pept_df.isna().sum().sort_values(ascending=False)

Q99829                    624
QALPQVR                   624
EPQVYTLPPSRDELTK          550
TPSGLYLGTC(UniMod_4)ER    523
SLEDQVEMLR                514
                         ... 
P41222                      0
P02774                      0
P02787                      0
P02790                      0
patient_id                  0
Length: 1198, dtype: int64

In [17]:
prot_pept_df.isna().sum(axis=1).sort_values(ascending=False)

453     1064
335      532
1005     513
333      498
978      497
        ... 
631       29
445       28
280       28
793       28
791       24
Length: 1113, dtype: int64

In [18]:
patient_list=prot_pept_df['patient_id'].unique()
patient_list

array([   55,   942,  1517,  1923,  2660,  3636,  3863,  4161,  4172,
        4923,  5027,  5036,  5178,  5645,  5742,  6054,  6211,  6420,
        7051,  7117,  7151,  7265,  7508,  7568,  7832,  7886,  8344,
        8699, 10053, 10138, 10174, 10541, 10715, 10718, 11459, 11686,
       11928, 12516, 12636, 12703, 12755, 12931, 13360, 13368, 13618,
       13804, 13852, 13968, 14035, 14124, 14242, 14270, 14344, 14450,
       14811, 15009, 15245, 15504, 15590, 16238, 16347, 16566, 16574,
       16778, 16931, 17154, 17201, 17414, 17727, 18183, 18204, 18553,
       18560, 19088, 20212, 20216, 20352, 20404, 20460, 20581, 20664,
       20707, 20791, 20792, 21126, 21537, 21729, 22126, 22623, 23175,
       23192, 23244, 23391, 23636, 24278, 24690, 24818, 24820, 24911,
       25562, 25739, 25750, 25827, 25911, 26005, 26104, 26210, 26809,
       27079, 27300, 27464, 27468, 27607, 27715, 27872, 27893, 27971,
       27987, 28327, 28342, 28818, 29313, 29417, 30119, 30155, 30416,
       30894, 30951,

In [19]:
data_imputed_list = []
for patient_id in patient_list:
    masked_data = prot_pept_df[prot_pept_df['patient_id']==patient_id]
    num_rows = len(masked_data.index)
    knn = KNNImputer(missing_values=np.nan, keep_empty_features=True, n_neighbors=num_rows)
    X_knn = knn.fit_transform(masked_data)
    X_knn_df = pd.DataFrame(X_knn, columns = prot_pept_df.columns)
    data_imputed_list.append(X_knn_df)
prot_pept_imputed = pd.concat(data_imputed_list)

In [20]:
prot_pept_imputed.isna().sum()

patient_id               0
visit_month              0
visit_id                 0
O00391                   0
O00533                   0
                        ..
YVNKEIQNAVNGVK           0
YWGVASFLQK               0
YYC(UniMod_4)FQGNQFLR    0
YYTYLIMNK                0
YYWGGQYTWDMAK            0
Length: 1198, dtype: int64

In [21]:
prot_pept_imputed.reset_index(inplace=True)
prot_pept_imputed

Unnamed: 0,index,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,0,55.0,0.0,550.0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,1,55.0,6.0,556.0,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,2,55.0,12.0,5512.0,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,3,55.0,36.0,5536.0,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,0,942.0,6.0,9426.0,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,...,226314.0,6399.80,374307.0,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,5,64674.0,84.0,6467484.0,0.0,190487.0,24907.9,18543.1,10124.90,2308.71,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,0,65043.0,0.0,650430.0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,1,65043.0,12.0,6504312.0,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,2,65043.0,24.0,6504324.0,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


In [22]:
prot_pept_imputed.drop(columns=['index'], inplace=True)

In [23]:
prot_pept_imputed = prot_pept_imputed.astype({'patient_id': 'int', 'visit_month': 'int'})
prot_pept_imputed['visit_id'] = prot_pept_df['visit_id']
print(prot_pept_imputed.dtypes)

patient_id                 int32
visit_month                int32
visit_id                  object
O00391                   float64
O00533                   float64
                          ...   
YVNKEIQNAVNGVK           float64
YWGVASFLQK               float64
YYC(UniMod_4)FQGNQFLR    float64
YYTYLIMNK                float64
YYWGGQYTWDMAK            float64
Length: 1198, dtype: object


In [24]:
prot_pept_imputed

Unnamed: 0,patient_id,visit_month,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,0,55_0,11254.3,732430.0,39585.8,41526.9,31238.00,4202.71,177775.0,...,201158.0,16492.30,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,6,55_6,13163.6,630465.0,35220.8,41295.0,26219.90,4416.42,165638.0,...,171079.0,13198.80,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
2,55,12,55_12,15257.6,815083.0,41650.9,39763.3,30703.60,4343.60,151073.0,...,231772.0,17873.80,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
3,55,36,55_36,13530.8,753832.0,43048.9,43503.6,33577.60,5367.06,101056.0,...,185290.0,18580.50,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
4,942,6,942_6,11218.7,399518.0,20581.0,31290.9,6173.58,2564.37,160526.0,...,226314.0,6399.80,374307.0,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,64674,84,64674_84,0.0,190487.0,24907.9,18543.1,10124.90,2308.71,62095.4,...,203523.0,3835.58,4901220.0,40325.9,335625.0,49250.4,64076.3,667993.0,38472.5,21949.1
1109,65043,0,65043_0,13472.4,927954.0,42661.5,43663.2,20071.30,3278.88,266339.0,...,257361.0,18316.60,2514660.0,51444.6,530245.0,156148.0,157548.0,336625.0,48423.2,10915.8
1110,65043,12,65043_12,14134.9,984651.0,28990.8,42440.9,25357.40,3267.66,270575.0,...,230437.0,16703.20,2481560.0,44405.0,543391.0,159828.0,161207.0,330337.0,45368.1,19023.2
1111,65043,24,65043_24,14659.5,1062020.0,46440.4,38293.0,21971.80,3990.34,221358.0,...,251228.0,18326.20,2939460.0,50588.2,597869.0,148032.0,192857.0,388125.0,65101.0,20790.1


In [25]:
clinical.isna().sum()

visit_id          0
patient_id        0
visit_month       0
updrs_1           1
updrs_2           2
updrs_3          25
updrs_4        1038
dtype: int64

In [26]:
data_imputed_list = []
for patient_id in patient_list:
    masked_data = clinical[clinical['patient_id']==patient_id]
    num_rows = len(masked_data.index)
    knn = KNNImputer(missing_values=np.nan, keep_empty_features=True, n_neighbors=num_rows)
    X_knn = knn.fit_transform(masked_data)
    X_knn_df = pd.DataFrame(X_knn, columns = clinical.columns)
    data_imputed_list.append(X_knn_df)
clinical_imputed = pd.concat(data_imputed_list)

In [27]:
clinical_imputed.isna().sum()

visit_id       0
patient_id     0
visit_month    0
updrs_1        0
updrs_2        0
updrs_3        0
updrs_4        0
dtype: int64

In [28]:
clinical_imputed.reset_index(inplace=True)
clinical_imputed.drop(columns='index', inplace=True)
clinical_imputed

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,550.0,55.0,0.0,10.0,6.0,15.0,0.0
1,553.0,55.0,3.0,10.0,7.0,25.0,0.0
2,556.0,55.0,6.0,8.0,10.0,34.0,0.0
3,559.0,55.0,9.0,8.0,9.0,30.0,0.0
4,5512.0,55.0,12.0,10.0,10.0,41.0,0.0
...,...,...,...,...,...,...,...
2610,6504348.0,65043.0,48.0,7.0,6.0,13.0,0.0
2611,6504354.0,65043.0,54.0,4.0,8.0,11.0,1.0
2612,6504360.0,65043.0,60.0,6.0,6.0,16.0,1.0
2613,6504372.0,65043.0,72.0,3.0,9.0,14.0,1.0


In [29]:
clinical_imputed = clinical_imputed.astype({'patient_id': 'int', 'visit_month': 'int'})
clinical_imputed['visit_id'] = clinical['visit_id']
print(clinical_imputed.dtypes)

visit_id        object
patient_id       int32
visit_month      int32
updrs_1        float64
updrs_2        float64
updrs_3        float64
updrs_4        float64
dtype: object


In [30]:

prot_pept_clinical = pd.merge(prot_pept_imputed, clinical_imputed, on=['visit_id', 'visit_month', 'patient_id'], how='left')


In [31]:
prot_pept_clinical.isna().sum()

patient_id        0
visit_month       0
visit_id          0
O00391            0
O00533            0
                 ..
YYWGGQYTWDMAK     0
updrs_1          45
updrs_2          45
updrs_3          45
updrs_4          45
Length: 1202, dtype: int64

In [32]:
df = prot_pept_clinical[prot_pept_clinical.columns[prot_pept_clinical.isna().any()]]

df1 = df[df.isna().any(axis=1)]

df1.head(20)

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4
16,,,,
47,,,,
50,,,,
57,,,,
92,,,,
95,,,,
156,,,,
172,,,,
182,,,,
216,,,,


In [33]:
prot_pept_clinical.dropna(subset = targets, inplace=True, axis=0)
prot_pept_clinical.isna().sum()

patient_id       0
visit_month      0
visit_id         0
O00391           0
O00533           0
                ..
YYWGGQYTWDMAK    0
updrs_1          0
updrs_2          0
updrs_3          0
updrs_4          0
Length: 1202, dtype: int64

In [34]:
def smape_score(actual, predicted):
    sum = 0
    for a, p in zip(actual, predicted):
        if a==0 and p==0:
            pass
        else:
            sum += (np.abs(p-a))/(np.abs(p)+np.abs(a))*2
    return sum/len(actual)*100

In [None]:
results = pd.DataFrame(columns=[])

In [35]:
#Random FOrest Regressor
mse_list = []
smape_list = []
mae_list = []

for target in targets:

    X = prot_pept_clinical.drop(columns=targets, axis=1)
    y = prot_pept_clinical[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9) 

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    RFR = RandomForestRegressor(n_estimators=100)

    RFR.fit(X_train, y_train)

    y_predict = RFR.predict(X_test)
    #cv = StratifiedKFold(n_splits = 5)

    mse = mean_squared_error(y_test, y_predict)
    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)

    mse_list.append(mse)
    smape_list.append(smape)
    mae_list.append(mae)

    print(f'MSE score for {target}: {mse}')
    print(f'MAE score for {target}: {mae}')
    print(f'SMAPE score for {target}: {smape}')
    print()

mse_average = np.mean(mse_list)
mae_average = np.mean(mae_list)
smape_average = np.mean(smape_list)

print(f'MSE average score: {mse_average}')
print(f'MAE average score: {mae_average}')
print(f'SMAPE average score: {smape_average}')


961 samples in training, 107 samples in testing.
MSE score for updrs_1: 15.685151401869156
MAE score for updrs_1: 3.0781308411214954
SMAPE score for updrs_1: 62.30070910786756

961 samples in training, 107 samples in testing.
MSE score for updrs_2: 26.966490654205607
MAE score for updrs_2: 4.017289719626168
SMAPE score for updrs_2: 88.58341319868686

961 samples in training, 107 samples in testing.
MSE score for updrs_3: 146.59053631152645
MAE score for updrs_3: 9.57999554962172
SMAPE score for updrs_3: 71.52959339031723

961 samples in training, 107 samples in testing.
MSE score for updrs_4: 2.918323389289884
MAE score for updrs_4: 1.2240878220948315
SMAPE score for updrs_4: 146.707607784753

MSE average score: 48.04012543922278
MAE average score: 4.474875983116053
SMAPE average score: 92.28033087040616


In [36]:
#HistGradientBoostingRegressor

mse_list = []
smape_list = []
mae_list = []

for target in targets:

    X = prot_pept_clinical.drop(columns=targets, axis=1)
    y = prot_pept_clinical[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9) 

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    hgb = HistGradientBoostingRegressor()

    hgb.fit(X_train, y_train)

    y_predict = hgb.predict(X_test)

    mse = mean_squared_error(y_test, y_predict)
    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)

    mse_list.append(mse)
    smape_list.append(smape)
    mae_list.append(mae)

    print(f'MSE score for {target}: {mse}')
    print(f'MAE score for {target}: {mae}')
    print(f'SMAPE score for {target}: {smape}')
    print()

mse_average = np.mean(mse_list)
mae_average = np.mean(mae_list)
smape_average = np.mean(smape_list)

print(f'MSE average score: {mse_average}')
print(f'MAE average score: {mae_average}')
print(f'SMAPE average score: {smape_average}')


961 samples in training, 107 samples in testing.
MSE score for updrs_1: 13.98665100635211
MAE score for updrs_1: 2.977530974926082
SMAPE score for updrs_1: 62.0992625691974

961 samples in training, 107 samples in testing.
MSE score for updrs_2: 19.010634437723624
MAE score for updrs_2: 3.268345289301794
SMAPE score for updrs_2: 91.08058580960078

961 samples in training, 107 samples in testing.
MSE score for updrs_3: 96.89754669056433
MAE score for updrs_3: 7.9854954000484195
SMAPE score for updrs_3: 75.31075053002773

961 samples in training, 107 samples in testing.
MSE score for updrs_4: 3.7984045434250997
MAE score for updrs_4: 1.2559067421394556
SMAPE score for updrs_4: 154.18997835377857

MSE average score: 33.423309169516294
MAE average score: 3.8718196016039377
SMAPE average score: 95.67014431565113


In [37]:
#ADABOOST REGRESSOR
mse_list = []
smape_list = []
mae_list = []

for target in targets:

    X = prot_pept_clinical.drop(columns=targets, axis=1)
    y = prot_pept_clinical[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9) 

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    ada = AdaBoostRegressor()

    ada.fit(X_train, y_train)

    y_predict = ada.predict(X_test)
    #cv = StratifiedKFold(n_splits = 5)

    mse = mean_squared_error(y_test, y_predict)
    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)

    mse_list.append(mse)
    smape_list.append(smape)
    mae_list.append(mae)

    print(f'MSE score for {target}: {mse}')
    print(f'MAE score for {target}: {mae}')
    print(f'SMAPE score for {target}: {smape}')
    print()

mse_average = np.mean(mse_list)
mae_average = np.mean(mae_list)
smape_average = np.mean(smape_list)

print(f'MSE average score: {mse_average}')
print(f'MAE average score: {mae_average}')
print(f'SMAPE average score: {smape_average}')

961 samples in training, 107 samples in testing.
MSE score for updrs_1: 19.11511216509191
MAE score for updrs_1: 3.666237943426653
SMAPE score for updrs_1: 70.70347840171782

961 samples in training, 107 samples in testing.
MSE score for updrs_2: 28.974217145089085
MAE score for updrs_2: 4.405099093553781
SMAPE score for updrs_2: 93.04261711577738

961 samples in training, 107 samples in testing.
MSE score for updrs_3: 170.72641508960004
MAE score for updrs_3: 11.283392378417041
SMAPE score for updrs_3: 85.7281147354729

961 samples in training, 107 samples in testing.
MSE score for updrs_4: 5.978157320523685
MAE score for updrs_4: 2.191255788620998
SMAPE score for updrs_4: 157.06636173703941

MSE average score: 56.19847543007618
MAE average score: 5.386496301004618
SMAPE average score: 101.63514299750187


In [38]:
X1 = prot_pept_clinical.drop(columns='visit_id', axis=1)
X = X1.drop(columns=targets, axis=1)

mse_list = []
smape_list = []
mae_list = []

for target in targets:

    y = prot_pept_clinical[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9) 

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    xgbr = xgboost.XGBRegressor(objective='reg:linear', n_estimators=100)

    xgbr.fit(X_train, y_train)

    y_predict = xgbr.predict(X_test)

    mse = mean_squared_error(y_test, y_predict)
    mae = mean_absolute_error(y_test, y_predict)
    smape = smape_score(y_test, y_predict)

    mse_list.append(mse)
    smape_list.append(smape)
    mae_list.append(mae)

    print(f'MSE score for {target}: {mse}')
    print(f'MAE score for {target}: {mae}')
    print(f'SMAPE score for {target}: {smape}')
    print()

mse_average = np.mean(mse_list)
mae_average = np.mean(mae_list)
smape_average = np.mean(smape_list)

print(f'MSE average score: {mse_average}')
print(f'MAE average score: {mae_average}')
print(f'SMAPE average score: {smape_average}')

961 samples in training, 107 samples in testing.
MSE score for updrs_1: 15.252723092480263
MAE score for updrs_1: 2.9384770839013785
SMAPE score for updrs_1: 53.93539577292677

961 samples in training, 107 samples in testing.
MSE score for updrs_2: 18.60028429096016
MAE score for updrs_2: 3.177724322008195
SMAPE score for updrs_2: 84.68132859479148

961 samples in training, 107 samples in testing.
MSE score for updrs_3: 144.86887542916963
MAE score for updrs_3: 9.443144733939215
SMAPE score for updrs_3: 75.38509753353999

961 samples in training, 107 samples in testing.
MSE score for updrs_4: 2.883802740742023
MAE score for updrs_4: 1.257278043173111
SMAPE score for updrs_4: 153.82484521492

MSE average score: 45.40142138833802
MAE average score: 4.204156045755475
SMAPE average score: 91.95666677904455


In [53]:
str = xgbr.__class__.__name__

str