In [1]:
import pandas as pd
import numpy as np
# group by peptides
# NOTE(review): df_all is not defined yet at this point in the notebook; it is built by
# the proteins/peptides merge under "Data Preprocessing", where this same statement
# appears again. On a fresh kernel this cell raises NameError — TODO confirm it can be dropped.
df_by_peptide = df_all.groupby(['visit_id','Peptide'])['PeptideAbundance'].mean().reset_index()


# EDA

### Summary
- there are 248 unique patients
- each visit has more than one record per patient (one row per measured protein)
  - there are 1113 unique visit ids, so each patient has about 4-5 visits on average (1113 / 248 ≈ 4.49)
- for a sampled patient (20216), around 200 proteins are found in each of their visit samples

In [6]:
# Load the three training tables from the competition data directory
proteins = pd.read_csv(
    "amp-parkinsons-disease-progression-prediction/train_proteins.csv"
)
peptides = pd.read_csv(
    "amp-parkinsons-disease-progression-prediction/train_peptides.csv"
)
clinical = pd.read_csv(
    "amp-parkinsons-disease-progression-prediction/train_clinical_data.csv"
)

# (rows, columns) of each table, in load order
tuple(table.shape for table in (proteins, peptides, clinical))

((232741, 5), (981834, 6), (2615, 8))

In [7]:
# peek at 10 randomly chosen protein rows
proteins.sample(n=10)

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
183515,20216_48,48,20216,P01780,7220.93
90644,40876_12,12,40876,Q02818,15242.3
187099,26104_48,48,26104,P02763,6456960.0
76654,16574_12,12,16574,P11142,82523.5
79193,20791_12,12,20791,P00736,75556.6
123883,27971_24,24,27971,P61916,203214.0
33257,56073_0,0,56073,P36980,10048.3
43549,7151_6,6,7151,O94919,31742.0
92640,44001_12,12,44001,P05156,98632.0
200416,58674_48,48,58674,P16035,65892.0


In [24]:
# peek at 10 randomly chosen peptide rows
peptides.sample(n=10)

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
104931,40874_0,0,40874,P08603,DGWSAQPTC(UniMod_4)IK,51272.4
851921,63889_48,48,63889,Q14515,DQGNQEQDPNISNGEEEEEKEPGEVGTHNDNQER,13807.1
493092,18183_24,24,18183,P02649,KVEQAVETEPEPELR,2360880.0
133868,53103_0,0,53103,P02747,FQSVFTVTR,120655.0
162915,62329_0,0,62329,P36980,LVYPSC(UniMod_4)EEK,8186.86
972642,59550_84,84,59550,P36222,LVMGIPTFGR,286193.0
408072,52266_12,12,52266,P02671,QFTSSTSYNR,6530.43
68813,25911_0,0,25911,P08603,SC(UniMod_4)DNPYIPNGDYSPLR,19068.0
577988,56675_24,24,56675,P01009,LYHSEAFTVNFGDTEEAKK,36322.7
48995,20216_0,0,20216,P10645,GLSAEPGWQAK,66677.0


In [22]:
# number of distinct patients in the protein table
proteins['patient_id'].nunique()

248

In [20]:
# number of unique visits (kept in a variable: a bare expression that is not the
# last line of a cell is evaluated and silently discarded)
n_visits = proteins.visit_id.nunique()
# average number of visits per patient
n_visits / proteins.patient_id.nunique()

4.487903225806452

In [21]:
# number of distinct visit months in the protein table
proteins['visit_month'].nunique()

15

In [23]:
# protein rows per visit month for one example patient
proteins.loc[proteins['patient_id'] == 20216, 'visit_month'].value_counts()

6     214
12    209
24    208
48    208
0     205
36    203
Name: visit_month, dtype: int64

In [28]:
# find number of missing values, max and min, data types, #unique values for each column

def summary(df):
    """Print the shape of ``df`` and return a per-column overview table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order) with:
        ``data type``  - the column dtype,
        ``#missing``   - number of null entries,
        ``%missing``   - null entries as a percentage (0-100) of the row count,
        ``#unique``    - number of distinct non-null values,
        ``min``/``max`` - as reported by ``df.describe(include='all')``;
        NaN where describe reports no min/max for the column.
    """
    print(f'data shape: {df.shape}')
    null_mask = df.isnull()
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    # count stays a raw count; the x100 scaling belongs to the percentage
    summ['#missing'] = null_mask.sum().values
    summ['%missing'] = null_mask.mean().values * 100
    summ['#unique'] = df.nunique().values
    # describe() only emits 'min'/'max' when it has something to compute them for;
    # reindex guarantees both columns exist (as NaN) instead of raising KeyError
    desc = df.describe(include='all').transpose().reindex(columns=['min', 'max'])
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values

    return summ

In [30]:
# NOTE(review): df_all is created by the proteins/peptides merge under
# "Data Preprocessing" further down; that cell must run before this one.
summary(df_all)

data shape: (981834, 7)


Unnamed: 0,data type,#missing,%missing,#unique,min,max
visit_id,object,0,0.0,1113,,
visit_month,int64,0,0.0,15,0.0,108.0
patient_id,int64,0,0.0,248,55.0,65043.0
UniProt,object,0,0.0,227,,
NPX,float64,0,0.0,218795,84.6082,613851000.0
Peptide,object,0,0.0,968,,
PeptideAbundance,float64,0,0.0,738931,10.9985,178752000.0


# Data Preprocessing

In [25]:
# attach every peptide measurement to its parent protein row,
# matching on the visit and the UniProt id
df_all = pd.merge(
    proteins,
    peptides.loc[:, ['visit_id', 'UniProt', 'Peptide','PeptideAbundance']],
    how='left',
    on=['visit_id','UniProt'],
)
df_all

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.30
1,55_0,0,55,O00533,732430.0,GNPEPTFSWTK,102060.00
2,55_0,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.00
3,55_0,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90
4,55_0,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.70
...,...,...,...,...,...,...,...
981829,58648_108,108,58648,Q9UHG2,369437.0,ILAGSADSEGVAAPR,202820.00
981830,58648_108,108,58648,Q9UKV8,105830.0,SGNIPAGTTVDTK,105830.00
981831,58648_108,108,58648,Q9Y646,21257.6,LALLVDTVGPR,21257.60
981832,58648_108,108,58648,Q9Y6R7,17953.1,AGC(UniMod_4)VAESTAVC(UniMod_4)R,5127.26


In [31]:
# mean NPX per (visit, protein)
df_by_uniprot = df_all.groupby(['visit_id','UniProt'], as_index=False)['NPX'].mean()

In [32]:
# mean abundance per (visit, peptide)
df_by_peptide = df_all.groupby(['visit_id','Peptide'], as_index=False)['PeptideAbundance'].mean()


In [35]:
# long -> wide: one row per visit, one column per UniProt id
df_uniprot = (
    df_by_uniprot
    .pivot(index='visit_id', columns='UniProt', values='NPX')
    .rename_axis(columns=None)  # drop the leftover "UniProt" label on the column axis
    .reset_index()
)
df_uniprot


Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,...,,9469.45,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.40
1,10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,...,,14408.40,,,28537.0,171733.0,65668.1,,9295.65,25697.80
2,10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,...,317477.0,38667.20,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.70
3,10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,...,557904.0,44556.90,155619.0,14647.90,36927.7,229232.0,106564.0,26077.7,21441.80,7642.42
4,10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,...,,47836.70,177619.0,17061.10,25510.4,176722.0,59471.4,12639.2,15091.40,6168.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,...,,25690.60,,6859.82,19106.7,121161.0,113872.0,14413.9,28225.50,8062.07
1109,942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,...,45742.3,33518.60,94049.7,13415.70,21324.7,234094.0,82410.4,19183.7,17804.10,12277.00
1110,942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,...,180475.0,29770.60,95949.9,11344.40,23637.6,256654.0,76931.9,19168.2,19215.90,14625.60
1111,942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,...,197987.0,29283.80,121696.0,19169.80,16724.9,232301.0,96905.9,21120.9,14089.80,16418.50


In [34]:
# long -> wide: one row per visit, one column per peptide sequence
df_peptide = (
    df_by_peptide
    .pivot(index='visit_id', columns='Peptide', values='PeptideAbundance')
    .rename_axis(columns=None)  # drop the leftover "Peptide" label on the column axis
    .reset_index()
)
df_peptide.sample(n=5)


Unnamed: 0,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
310,23636_36,8340370.0,37626.0,9206360.0,21920.4,16533.9,18312.0,45248.8,507172.0,8434700.0,...,201752.0,6765.62,5572700.0,72490.6,816316.0,40052.9,153741.0,552240.0,47338.6,13092.5
1103,8344_24,7254380.0,39743.8,8185820.0,19223.4,17588.8,11032.0,36362.1,503480.0,6437940.0,...,212614.0,3801.31,3827320.0,14388.7,211949.0,50382.4,114154.0,396690.0,40111.2,24491.7
779,5178_24,6137290.0,79552.2,14751500.0,30336.3,11632.9,151495.0,54960.1,809795.0,5037490.0,...,289760.0,17919.2,3609270.0,,241446.0,100705.0,157346.0,417611.0,78377.5,36280.9
431,28327_6,6887080.0,37505.6,10267600.0,28026.8,17704.5,74757.0,43162.8,54512.9,5602390.0,...,198308.0,8517.34,4423710.0,67557.5,418102.0,80492.3,162787.0,421868.0,40905.1,
397,27468_36,6259060.0,36605.0,8722450.0,20383.7,8854.39,65851.9,,57170.4,4782070.0,...,277114.0,10560.5,,84813.2,460885.0,142576.0,112294.0,523755.0,35042.7,20323.5


In [37]:
# add patient id and visit month
# visit_id has the form "<patient_id>_<visit_month>". str.split returns strings, so
# cast to int64 to match the dtype these columns have in the source tables; otherwise
# sorting by visit_month is lexicographic and joins against integer keys fail.
df_uniprot[['patient_id','visit_month']] = df_uniprot.visit_id.str.split("_", expand=True).astype('int64')
df_peptide[['patient_id','visit_month']] = df_peptide.visit_id.str.split("_", expand=True).astype('int64')

In [38]:
# inspect the pivoted protein table now that patient_id and visit_month have been appended
df_uniprot

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7,patient_id,visit_month
0,10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,...,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.40,10053,0
1,10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,...,,,28537.0,171733.0,65668.1,,9295.65,25697.80,10053,12
2,10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,...,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.70,10053,18
3,10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,...,155619.0,14647.90,36927.7,229232.0,106564.0,26077.7,21441.80,7642.42,10138,12
4,10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,...,177619.0,17061.10,25510.4,176722.0,59471.4,12639.2,15091.40,6168.55,10138,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,...,,6859.82,19106.7,121161.0,113872.0,14413.9,28225.50,8062.07,8699,24
1109,942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,...,94049.7,13415.70,21324.7,234094.0,82410.4,19183.7,17804.10,12277.00,942,12
1110,942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,...,95949.9,11344.40,23637.6,256654.0,76931.9,19168.2,19215.90,14625.60,942,24
1111,942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,...,121696.0,19169.80,16724.9,232301.0,96905.9,21120.9,14089.80,16418.50,942,48


#### Clinical Data

In [42]:
# inspect the raw clinical table loaded from train_clinical_data.csv
clinical

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,6,,
1,55_3,55,3,10.0,7.0,19,,
2,55_6,55,6,8.0,10.0,58,,
3,55_9,55,9,8.0,9.0,17,0.0,On
4,55_12,55,12,10.0,10.0,2,0.0,On
...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,51,0.0,Off
2611,65043_54,65043,54,4.0,8.0,55,1.0,Off
2612,65043_60,65043,60,6.0,6.0,0,1.0,Off
2613,65043_72,65043,72,3.0,9.0,58,1.0,Off


In [45]:
# reshape wide -> long: every non-id column becomes one (updrs, rating) row per visit
df_clinical = pd.melt(
    clinical,
    id_vars=['visit_id', 'patient_id', 'visit_month', 'upd23b_clinical_state_on_medication'],
    var_name='updrs',
    value_name='rating',
)

In [46]:
# Save the melted clinical frame (df_clinical) to a pickle file in the working directory
df_clinical.to_pickle("./df_clinical.pkl")  

# To reload it later without re-running the melt (only unpickle files you created yourself):
# df_clinical = pd.read_pickle("./df_clinical.pkl")  
