In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# classifiers we will use
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# model selection bits
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import learning_curve, validation_curve

# evaluation
from sklearn.metrics import f1_score, accuracy_score


import scipy

In [2]:
# Load the competition datasets into Pandas DataFrame
proteins = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_proteins.csv")
peptides = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_peptides.csv")
clinical = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
supplemental = pd.read_csv("~/Documents/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv")

In [3]:
# drop the "medication status" column (due to over 50% NaN values), keep a copy of the original
# for later access. 
clinical_copy = clinical.copy()

clinical.drop('upd23b_clinical_state_on_medication', axis=1, inplace=True)

In [4]:
# and use interpolate() on the rest of the dataset.

clinical['updrs_1'].interpolate(method='linear', limit_direction='both', inplace=True)
clinical['updrs_2'].interpolate(method='linear', limit_direction='both', inplace=True)
clinical['updrs_3'].interpolate(method='linear', limit_direction='both', inplace=True)
clinical['updrs_4'].interpolate(method='linear', limit_direction='both', inplace=True)

In [5]:
clinical.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,10.0,6.0,15.0,0.0
1,55_3,55,3,10.0,7.0,25.0,0.0
2,55_6,55,6,8.0,10.0,34.0,0.0
3,55_9,55,9,8.0,9.0,30.0,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0


In [21]:
proteins

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0
...,...,...,...,...,...
232736,58648_108,108,58648,Q9UBX5,27387.8
232737,58648_108,108,58648,Q9UHG2,369437.0
232738,58648_108,108,58648,Q9UKV8,105830.0
232739,58648_108,108,58648,Q9Y646,21257.6


In [16]:
peptides

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.30
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.00
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.00
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.70
...,...,...,...,...,...,...
981829,58648_108,108,58648,Q9UHG2,ILAGSADSEGVAAPR,202820.00
981830,58648_108,108,58648,Q9UKV8,SGNIPAGTTVDTK,105830.00
981831,58648_108,108,58648,Q9Y646,LALLVDTVGPR,21257.60
981832,58648_108,108,58648,Q9Y6R7,AGC(UniMod_4)VAESTAVC(UniMod_4)R,5127.26


In [23]:
#protein_peptides = pd.merge(proteins, peptides, on=['visit_id', 'patient_id', 'visit_month','UniProt'], how='left')

proteins_grouped = proteins.groupby(['visit_id', 'UniProt'])['NPX'].mean().reset_index()
peptides_grouped = peptides.groupby(['visit_id', 'Peptide'])['PeptideAbundance'].mean().reset_index()

In [24]:
df_proteins = proteins_grouped.pivot(index='visit_id', columns='UniProt', values='NPX').rename_axis(columns=None).reset_index()
df_peptides = peptides_grouped.pivot(index='visit_id', columns='Peptide', values='PeptideAbundance').rename_axis(columns=None).reset_index()


In [25]:
df_proteins

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,...,,9469.45,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.40
1,10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,...,,14408.40,,,28537.0,171733.0,65668.1,,9295.65,25697.80
2,10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,...,317477.0,38667.20,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.70
3,10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,...,557904.0,44556.90,155619.0,14647.90,36927.7,229232.0,106564.0,26077.7,21441.80,7642.42
4,10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,...,,47836.70,177619.0,17061.10,25510.4,176722.0,59471.4,12639.2,15091.40,6168.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,...,,25690.60,,6859.82,19106.7,121161.0,113872.0,14413.9,28225.50,8062.07
1109,942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,...,45742.3,33518.60,94049.7,13415.70,21324.7,234094.0,82410.4,19183.7,17804.10,12277.00
1110,942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,...,180475.0,29770.60,95949.9,11344.40,23637.6,256654.0,76931.9,19168.2,19215.90,14625.60
1111,942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,...,197987.0,29283.80,121696.0,19169.80,16724.9,232301.0,96905.9,21120.9,14089.80,16418.50


In [26]:
df_peptides

Unnamed: 0,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.30
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.00,19787.3,36029.4,708729.0,5067790.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70
3,10138_12,7404780.0,46107.2,10610900.0,,20910.20,66662.3,55253.9,79575.5,6201210.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.20,63672.7,36819.8,34160.9,2117430.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,6312970.0,44462.7,12455000.0,11051.3,1163.18,43279.8,67743.5,325328.0,4666550.0,...,289888.0,8615.27,8770410.0,33599.1,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70
1109,942_12,11289900.0,46111.7,11297300.0,,13894.10,53755.0,40289.3,565112.0,,...,173259.0,4767.63,374307.0,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,
1110,942_24,10161900.0,32145.0,12388000.0,25869.2,17341.80,48625.5,45223.9,84448.0,4684800.0,...,185428.0,5554.53,,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60
1111,942_48,8248490.0,30563.4,11882600.0,,19114.90,60221.4,46685.9,81282.9,5542110.0,...,137611.0,6310.09,,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.30


In [27]:
df_proteins.isna().sum()

visit_id      0
O00391      349
O00533        1
O00584       13
O14498       81
           ... 
Q9UHG2        0
Q9UKV8      206
Q9UNU6      352
Q9Y646       61
Q9Y6R7       69
Length: 228, dtype: int64

In [28]:
df_peptides.isna().sum()

visit_id                                              0
AADDTWEPFASGK                                        44
AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K     64
AAFTEC(UniMod_4)C(UniMod_4)QAADK                      6
AANEVSSADVK                                         204
                                                   ... 
YVNKEIQNAVNGVK                                        2
YWGVASFLQK                                           24
YYC(UniMod_4)FQGNQFLR                                 8
YYTYLIMNK                                            83
YYWGGQYTWDMAK                                       248
Length: 969, dtype: int64

In [29]:
data = pd.merge(df_proteins, df_peptides, on=['visit_id'], how='left')

In [30]:
data

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.30
1,10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80
2,10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70
3,10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,...,289888.0,8615.27,8770410.0,33599.1,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70
1109,942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,...,173259.0,4767.63,374307.0,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,
1110,942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,...,185428.0,5554.53,,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60
1111,942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,...,137611.0,6310.09,,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.30


In [38]:
data.isna().sum(axis=1).sort_values(ascending=False).head(20)

334     1064
212      532
948      513
210      498
920      497
207      489
568      478
943      476
565      469
946      455
320      455
30       455
35       453
567      435
503      428
1046     421
941      415
1048     411
803      410
652      406
dtype: int64

In [40]:
data.describe()

Unnamed: 0,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
count,764.0,1112.0,1100.0,1032.0,1047.0,942.0,1113.0,1050.0,1110.0,1079.0,...,1100.0,994.0,1022.0,977.0,1091.0,1111.0,1089.0,1105.0,1030.0,865.0
mean,11641.264435,511164.9,26505.529157,27305.934884,17688.295406,3004.990691,126151.780054,50773.474638,195599.363694,145382.047368,...,215246.833636,9015.134433,3937256.0,67866.452927,611077.9,92581.223041,125937.993436,471554.596652,47068.709311,21072.04823
std,2817.00353,235735.7,10705.15254,8446.187506,7166.325369,1142.159575,72748.393517,21382.028764,79739.704279,58028.089713,...,50727.227704,3127.061637,1677710.0,61176.002825,504742.2,30138.957433,38696.448657,131150.715245,13689.667117,10360.5938
min,873.778,59718.2,591.103,8945.34,2811.12,336.517,10717.4,5806.84,29740.9,8358.08,...,12164.3,258.249,162464.0,884.26,7494.66,869.898,991.452,11371.2,6362.49,868.903
25%,9736.8575,349059.0,19941.075,21123.65,12920.05,2189.0875,70560.6,37008.975,142054.5,103983.0,...,186658.25,6899.805,2895622.0,30997.9,292950.5,70508.15,100691.0,384902.0,37752.375,14249.9
50%,11546.4,483442.5,26529.7,26624.0,17399.6,2865.46,116900.0,50375.8,185616.0,136452.0,...,217430.5,8604.345,3671010.0,52576.9,470245.0,88918.6,123588.0,463382.0,45503.15,20390.9
75%,13383.025,648557.2,33222.8,32459.275,22077.05,3593.1475,164947.0,63446.7,239731.5,177451.0,...,246423.5,10612.775,4710635.0,85369.6,759238.0,110140.0,149597.0,549455.0,54748.35,27031.9
max,21361.8,1806980.0,66252.4,65347.9,49695.6,9352.64,538862.0,137369.0,766591.0,427084.0,...,409939.0,27670.5,13855500.0,712856.0,3984710.0,251526.0,264224.0,948416.0,107220.0,70020.8


In [50]:
features = [feature for feature in data.columns[1:]]
features.append('visit_month')
features

['O00391',
 'O00533',
 'O00584',
 'O14498',
 'O14773',
 'O14791',
 'O15240',
 'O15394',
 'O43505',
 'O60888',
 'O75144',
 'O75326',
 'O94919',
 'P00441',
 'P00450',
 'P00734',
 'P00736',
 'P00738',
 'P00746',
 'P00747',
 'P00748',
 'P00751',
 'P01008',
 'P01009',
 'P01011',
 'P01019',
 'P01023',
 'P01024',
 'P01031',
 'P01033',
 'P01034',
 'P01042',
 'P01344',
 'P01591',
 'P01594',
 'P01608',
 'P01621',
 'P01717',
 'P01780',
 'P01833',
 'P01834',
 'P01857',
 'P01859',
 'P01860',
 'P01861',
 'P01876',
 'P01877',
 'P02452',
 'P02647',
 'P02649',
 'P02652',
 'P02655',
 'P02656',
 'P02671',
 'P02675',
 'P02679',
 'P02747',
 'P02748',
 'P02749',
 'P02750',
 'P02751',
 'P02753',
 'P02760',
 'P02763',
 'P02765',
 'P02766',
 'P02768',
 'P02774',
 'P02787',
 'P02790',
 'P02792',
 'P04004',
 'P04075',
 'P04156',
 'P04180',
 'P04196',
 'P04207',
 'P04211',
 'P04216',
 'P04217',
 'P04275',
 'P04406',
 'P04433',
 'P05060',
 'P05067',
 'P05090',
 'P05155',
 'P05156',
 'P05408',
 'P05452',
 'P05546',

In [56]:
targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
clinical[targets]

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4
0,10.0,6.0,15.0,0.0
1,10.0,7.0,25.0,0.0
2,8.0,10.0,34.0,0.0
3,8.0,9.0,30.0,0.0
4,10.0,10.0,41.0,0.0
...,...,...,...,...
2610,7.0,6.0,13.0,0.0
2611,4.0,8.0,11.0,1.0
2612,6.0,6.0,16.0,1.0
2613,3.0,9.0,14.0,1.0


In [82]:

data_ready = pd.merge(data,clinical, on=['visit_id'], how='left')
data_ready.drop(columns=['visit_month', 'patient_id'], inplace=True)

data_ready.dropna(subset=targets, axis=0, inplace=True)
data_ready.dropna(axis=1, inplace=True)

In [86]:
for target in targets:

    #features.append(target)
    X = data_ready.drop(columns=targets, axis=1)
    y = data_ready[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8) 

    print(f'{len(X_train)} samples in training, {len(X_test)} samples in testing.')

    RFR = RandomForestRegressor()

    RFR.fit(X_train, y_train)

    y_predict = RFR.predict(X_test)
    #cv = StratifiedKFold(n_splits = 5)

    f1_score(y_test, y_predict)

854 samples in training, 214 samples in testing.


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets