In [2]:
!pip install dask

Collecting dask
  Downloading dask-2022.2.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting cloudpickle>=1.1.1 (from dask)
  Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting fsspec>=0.6.0 (from dask)
  Downloading fsspec-2023.1.0-py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting partd>=0.3.10 (from dask)
  Downloading partd-1.4.0-py3-none-any.whl (18 kB)
Collecting toolz>=0.8.2 (from dask)
  Downloading toolz-0.12.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting locket (from partd>=0.3.10->dask)
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: toolz, locket, fsspec, cloudpickle, partd, da

In [3]:
import pandas as pd
from sklearn import decomposition, preprocessing, cluster, tree
import dask.dataframe as dd

In [5]:
# Open the peptide-level abundance table as a dask DataFrame.
peptide_file = "PD-datasets/train_peptides.csv"
peptides = dd.read_csv(peptide_file)
# Display the dask frame's structure (column names, dtypes, partition count);
# the values themselves are not shown until the frame is computed.
peptides

Unnamed: 0_level_0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,object,int64,int64,object,object,float64
,...,...,...,...,...,...


In [6]:
# Open the protein-level expression (NPX) table as a dask DataFrame.
proteins_file = "PD-datasets/train_proteins.csv"
proteins = dd.read_csv(proteins_file)
# Display the dask frame's structure (column names, dtypes, partition count);
# the values themselves are not shown until the frame is computed.
proteins

Unnamed: 0_level_0,visit_id,visit_month,patient_id,UniProt,NPX
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,object,int64,int64,object,float64
,...,...,...,...,...


In [4]:
# Load the clinical assessment table with pandas and index it by visit_id so
# it can later be joined onto other visit_id-indexed frames.
clinical_file = "PD-datasets/train_clinical_data.csv"
clinical = pd.read_csv(clinical_file)
# pd.read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is
# needed. set_index returns a new frame instead of mutating in place, which
# leaves the raw `clinical` table untouched and keeps this cell re-runnable.
clinical_df = clinical.set_index("visit_id")
clinical_df.head()

Unnamed: 0_level_0,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
55_0,55,0,10.0,6.0,15.0,,
55_3,55,3,10.0,7.0,25.0,,
55_6,55,6,8.0,10.0,34.0,,
55_9,55,9,8.0,9.0,30.0,0.0,On
55_12,55,12,10.0,10.0,41.0,0.0,On


In [5]:
# Merge the peptide- and protein-level tables on visit_id.
# `peptides` and `proteins` are dask frames; materialize them as pandas
# DataFrames here, because no visible cell defines `peptides_df` or
# `proteins_df` and the merge would raise NameError on a fresh kernel.
peptides_df = peptides.compute()
proteins_df = proteins.compute()
# Columns present in both tables (visit_month, patient_id, UniProt) receive
# the default "_x" (peptides) / "_y" (proteins) suffixes.
# NOTE(review): joining on visit_id alone pairs every peptide row with every
# protein row of the same visit (many-to-many), which multiplies the row
# count - confirm this is intended, or also join on UniProt.
protpep = pd.merge(left=peptides_df, right=proteins_df, how="outer", on="visit_id")

In [6]:
# Remove the protein-side ("_y") copies of visit month and patient id; the
# peptide-side ("_x") versions of those columns are kept.
protpep = protpep.drop(labels=["visit_month_y", "patient_id_y"], axis="columns")

In [7]:
# List the columns that remain on the merged peptide/protein frame.
protpep.columns

Index(['visit_id', 'visit_month_x', 'patient_id_x', 'UniProt_x', 'Peptide',
       'PeptideAbundance', 'UniProt_y', 'NPX'],
      dtype='object')

In [8]:
# Index the merged frame by visit_id so it can be joined with the clinical
# table on that key. Rebinding (rather than inplace=True) avoids mutating the
# frame in place, and the guard makes the cell safe to re-run: once visit_id
# is already the index, a second set_index call would raise KeyError.
if protpep.index.name != "visit_id":
    protpep = protpep.set_index("visit_id")

In [9]:
# Attach the clinical columns to every peptide/protein row by matching on the
# index; the left join keeps all rows of protpep.
merged_df = protpep.join(other=clinical_df, how="left")

In [10]:
# Preview the first 25 merged rows to identify what still has to be cleaned
# or transformed before the data is ready for ML.
merged_df.head(n=25)

Unnamed: 0_level_0,visit_month_x,patient_id_x,UniProt_x,Peptide,PeptideAbundance,UniProt_y,NPX,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O00391,9104.27,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O00533,402321.0,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O14773,7150.57,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O14791,2497.84,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O15240,83002.9,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O15394,15113.6,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O43505,167327.0,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O60888,129048.0,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O75144,53069.5,10053.0,0.0,3.0,0.0,13.0,0.0,
10053_0,0,10053,O00391,NEQEQPLGQWHLS,9104.27,O94919,11074.6,10053.0,0.0,3.0,0.0,13.0,0.0,


In [None]:
# Drop the visit_month / patient_id columns brought in by the clinical table;
# the merged frame already carries visit_month_x and patient_id_x.
cleaned_df = merged_df.drop(labels=["visit_month", "patient_id"], axis="columns")

In [None]:
# Inspect each column's dtype, e.g. to spot non-numeric (object) columns.
cleaned_df.dtypes

In [None]:
# One-hot encode the categorical medication-state column.
# The previous call passed columns=["", ""] (unfilled placeholders that name
# no real column) and threw the result away; name the column explicitly and
# keep the encoded frame.
# NOTE(review): UniProt_x, Peptide and UniProt_y are also non-numeric, but
# one-hot encoding adds one column per distinct value - decide separately
# how (or whether) to encode them.
categorical_cols = ["upd23b_clinical_state_on_medication"]
encoded_df = pd.get_dummies(cleaned_df, columns=categorical_cols)
encoded_df.head()

In [None]:
# PCA, step 1: build the feature matrix and standardize it.

# The previous `X = #cleaned data will go here` was a SyntaxError. Use the
# numeric columns of the cleaned frame and drop rows with missing values,
# since PCA cannot be fitted on NaN.
# NOTE(review): this keeps identifier-like numeric columns (patient_id_x,
# visit_month_x) and drops every row with any missing score - confirm the
# intended feature set and missing-value strategy.
X = cleaned_df.select_dtypes(include="number").dropna()
std = preprocessing.StandardScaler()
# Keep X's index so standardized rows stay traceable to their visit_id after
# the dropna above.
X_std = pd.DataFrame(std.fit_transform(X), columns=X.columns, index=X.index)
X_std.head()

In [None]:
# PCA, step 2: project the standardized features onto principal components.
pca = decomposition.PCA()
pca_scores = pca.fit_transform(X_std)
# With the default n_components, PCA keeps min(n_samples, n_features)
# components, so label the columns from the transformed array's width rather
# than assuming one component per input column.
pc_labels = [f"PC{i}" for i in range(1, pca_scores.shape[1] + 1)]
pca_X = pd.DataFrame(pca_scores, columns=pc_labels, index=X_std.index)
pca_X

In [None]:
# TODO: plot the principal components and a hierarchical-clustering dendrogram