In [1]:
import pandas as pd
import numpy as np
from sklearn import decomposition, preprocessing, cluster, tree
from sklearn.preprocessing import LabelEncoder

In [2]:
#handy reference to file paths

files = {
    "peptides":"PD-datasets/train_peptides.csv",
    "proteins":"PD-datasets/train_proteins.csv",
    "clinical":"PD-datasets/train_clinical_data.csv"
}

In [3]:
#create dataframes from csv files

peptides = pd.read_csv("PD-datasets/train_peptides.csv")
proteins = pd.read_csv("PD-datasets/train_proteins.csv")
clinical = pd.read_csv("PD-datasets/train_clinical_data.csv")

In [4]:
#reference to df columns
peptides.columns

Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'Peptide',
       'PeptideAbundance'],
      dtype='object')

In [5]:
#inspect dtypes
peptides.dtypes

visit_id             object
visit_month           int64
patient_id            int64
UniProt              object
Peptide              object
PeptideAbundance    float64
dtype: object

In [6]:
#use encoding to transform objects into numeric values

le = LabelEncoder()
encoded_peptides = peptides[peptides.columns[3:5]].apply(le.fit_transform)
encoded_peptides.head()

Unnamed: 0,UniProt,Peptide
0,0,592
1,1,301
2,1,364
3,1,433
4,1,744


In [7]:
#replace objects in original dataframe
peptides[["UniProt","Peptide"]] = encoded_peptides[["UniProt","Peptide"]]
peptides.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,0,592,11254.3
1,55_0,0,55,1,301,102060.0
2,55_0,0,55,1,364,174185.0
3,55_0,0,55,1,433,27278.9
4,55_0,0,55,1,744,30838.7


In [8]:
#repeat the process with the other dataframes
proteins.columns

Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'NPX'], dtype='object')

In [9]:
proteins.dtypes

visit_id        object
visit_month      int64
patient_id       int64
UniProt         object
NPX            float64
dtype: object

In [10]:
encoded_proteins = proteins[proteins.columns[3:4]].apply(le.fit_transform)
encoded_proteins.value_counts()

UniProt
85         1113
68         1113
24         1113
26         1113
27         1113
           ... 
194         661
38          654
193         616
214         606
213         489
Length: 227, dtype: int64

In [11]:
#check that we are encoding correctly
proteins["UniProt"].value_counts()

P01024    1113
P05090    1113
P01011    1113
P01023    1113
Q92520    1113
          ... 
Q6UX71     661
P01780     654
Q562R1     616
Q99832     606
Q99829     489
Name: UniProt, Length: 227, dtype: int64

In [12]:
proteins[["UniProt"]] = encoded_proteins[["UniProt"]]
proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,0,11254.3
1,55_0,0,55,1,732430.0
2,55_0,0,55,2,39585.8
3,55_0,0,55,3,41526.9
4,55_0,0,55,4,31238.0


In [13]:
clinical.columns

Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')

In [14]:
clinical.dtypes

visit_id                                object
patient_id                               int64
visit_month                              int64
updrs_1                                float64
updrs_2                                float64
updrs_3                                float64
updrs_4                                float64
upd23b_clinical_state_on_medication     object
dtype: object

In [15]:
encoded_clinical = clinical[clinical.columns[7:8]].apply(le.fit_transform)
clinical["upd23b_clinical_state_on_medication"] = encoded_clinical["upd23b_clinical_state_on_medication"]
clinical.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,2
1,55_3,55,3,10.0,7.0,25.0,,2
2,55_6,55,6,8.0,10.0,34.0,,2
3,55_9,55,9,8.0,9.0,30.0,0.0,1
4,55_12,55,12,10.0,10.0,41.0,0.0,1


In [16]:
#join dataframes 

peptides_clinical = peptides.merge(clinical,how = "left", on=['visit_id', 'visit_month', 'patient_id'])
merge_df = peptides_clinical.merge(proteins,on=['visit_id', 'visit_month', 'patient_id'])

In [17]:
peptides_clinical.shape

(981834, 11)

In [18]:
merge_df.shape

(206138191, 13)

In [19]:
merge_df.head(20)

Unnamed: 0,visit_id,visit_month,patient_id,UniProt_x,Peptide,PeptideAbundance,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt_y,NPX
0,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,0,11254.3
1,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,1,732430.0
2,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,2,39585.8
3,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,3,41526.9
4,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,4,31238.0
5,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,5,4202.71
6,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,6,177775.0
7,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,7,62898.2
8,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,8,333376.0
9,55_0,0,55,0,592,11254.3,10.0,6.0,15.0,,2.0,9,166850.0


In [20]:
# merge_df.set_index("visit_id")
# merge_df.tail(10)

In [21]:
# for i in merge_df.columns:
#     print(merge_df[i].value_counts())

In [27]:
sample = merge_df.sample(10000)

In [28]:
sample

Unnamed: 0,visit_id,visit_month,patient_id,UniProt_x,Peptide,PeptideAbundance,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt_y,NPX
67538168,16347_12,12,16347,22,690,156265.00,2.0,0.0,1.0,,2.0,13,26175.60
168873216,34182_48,48,34182,126,56,7417.50,11.0,6.0,10.0,4.0,1.0,75,154458.00
66242431,14450_12,12,14450,9,833,154675.00,0.0,0.0,2.0,,2.0,152,37760.40
138042540,27464_36,36,27464,101,622,9407.27,8.0,5.0,20.0,0.0,1.0,200,4700.94
103194232,17727_24,24,17727,49,88,506869.00,0.0,0.0,0.0,0.0,2.0,170,13980.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105045955,20581_24,24,20581,27,725,185109.00,0.0,0.0,0.0,,2.0,205,1271620.00
172030308,46837_48,48,46837,148,90,89856.30,17.0,7.0,27.0,0.0,1.0,65,20999000.00
191216203,47881_60,60,47881,84,549,21377.80,3.0,18.0,20.0,0.0,1.0,94,99676.70
47027300,26104_6,6,26104,60,779,57639.40,4.0,3.0,22.0,,2.0,118,167924.00


In [29]:
# PCA

X = sample
std = preprocessing.StandardScaler()
X_std = pd.DataFrame(std.fit_transform(X), columns=X.columns)
X_std

Unnamed: 0,visit_id,visit_month,patient_id,UniProt_x,Peptide,PeptideAbundance,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt_y,NPX
0,-0.297728,-0.614720,-0.875932,-1.160499,0.733670,-0.143067,-0.849083,-0.984829,-1.084907,,0.777098,-1.506336,-0.119460
1,0.320045,0.963981,0.086206,0.690331,-1.523014,-0.188792,0.838053,0.030141,-0.485596,0.715868,-0.463412,-0.560478,-0.113731
2,-0.363436,-0.614720,-0.978269,-1.391852,1.242670,-0.143556,-1.224002,-0.984829,-1.018317,,0.777098,0.614218,-0.118943
3,0.087346,0.437748,-0.276208,0.245420,0.491628,-0.188181,0.275675,-0.139021,0.180305,-0.616597,-0.463412,1.346495,-0.120420
4,-0.249924,-0.088486,-0.801486,-0.679995,-1.409112,-0.035365,-1.224002,-0.984829,-1.151497,-0.616597,0.777098,0.888822,-0.120005
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.151069,-0.088486,-0.647522,-1.071516,0.858251,-0.134207,-1.224002,-0.984829,-1.151497,,0.777098,1.422774,-0.063833
9996,0.758384,0.963981,0.768900,1.081853,-1.401994,-0.163467,1.962810,0.199302,0.646436,-0.616597,-0.463412,-0.713036,0.817283
9997,0.794550,1.490215,0.825220,-0.057119,0.231789,-0.184503,-0.661623,2.060079,0.180305,-0.616597,-0.463412,-0.270618,-0.116178
9998,-0.773533,-0.877837,-0.349575,-0.484234,1.050460,-0.173364,-0.474163,-0.477344,0.313486,,0.777098,0.095521,-0.113129


In [30]:
pca = decomposition.PCA()
pca_X = pd.DataFrame(pca.fit_transform(X_std), columns=[f'PC{i+1}' for i in range(len(X.columns))])
pca_X

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').