# Explore the Raw Data for Initial Analysis

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Read the three raw training CSVs (clinical, protein, peptide) into DataFrames.
# Paths are relative to the notebook's working directory.
clin_df, prot_df, pep_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/train_clinical_data.csv',
        '../data/raw/train_proteins.csv',
        '../data/raw/train_peptides.csv',
    )
)

In [5]:
# Preview the first five clinical rows to check the columns that were loaded.
clin_df.head(n=5)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [8]:
# Preview the first five protein rows (long format: one row per visit_id/UniProt pair).
prot_df.head(n=5)

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [10]:
# Reshape the protein table from long to wide: one row per visit_id, one column
# per UniProt, cells holding NPX. visit/protein pairs with no measurement become NaN.
# Note: pivot raises ValueError if a (visit_id, UniProt) pair appears more than once.
temp_prot = prot_df.pivot(index='visit_id', columns='UniProt', values='NPX')
temp_prot.head()

UniProt,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,129048.0,...,,9469.45,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.4
10053_12,10464.2,435586.0,,,,,197117.0,15099.1,164268.0,108114.0,...,,14408.4,,,28537.0,171733.0,65668.1,,9295.65,25697.8
10053_18,13235.7,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,163776.0,...,317477.0,38667.2,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.7
10138_12,12600.2,494581.0,9165.06,27193.5,22506.1,6015.9,156313.0,54546.4,204013.0,56725.0,...,557904.0,44556.9,155619.0,14647.9,36927.7,229232.0,106564.0,26077.7,21441.8,7642.42
10138_24,12003.2,522138.0,4498.51,17189.8,29112.4,2665.15,151169.0,52338.1,240892.0,85767.1,...,,47836.7,177619.0,17061.1,25510.4,176722.0,59471.4,12639.2,15091.4,6168.55


In [9]:
# Preview the first five peptide rows (long format: one row per visit_id/UniProt/Peptide).
pep_df.head(n=5)

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [11]:
# Build a "<UniProt>_<Peptide>" key so each peptide column is unique across proteins.
# The key is stored on pep_df itself as the new 'prot_pep' column.
pep_df['prot_pep'] = pep_df['UniProt'].str.cat(pep_df['Peptide'], sep='_')

# Reshape from long to wide: one row per visit_id, one column per prot_pep key,
# cells holding PeptideAbundance (NaN where a peptide was not measured at a visit).
temp_pep = pep_df.pivot(
    index='visit_id',
    columns='prot_pep',
    values='PeptideAbundance',
)

temp_pep.head()

prot_pep,O00391_NEQEQPLGQWHLS,O00533_GNPEPTFSWTK,O00533_IEIPSSVQQVPTIIK,O00533_KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,O00533_SMEQNGPGLEYR,O00533_TLKIENVSYQDKGNYR,O00533_VIAVNEVGR,O00533_VMTPAVYAPYDVK,O00533_VNGSPVDNHPFAGDVVFPR,O00584_ELDLNSVLLK,...,Q9UBX5_DQPFTILYR,Q9UHG2_AEAQEAEDQQAR,Q9UHG2_ARAEAQEAEDQQAR,Q9UHG2_GEAAGAVQELAR,Q9UHG2_ILAGSADSEGVAAPR,Q9UKV8_SGNIPAGTTVDTK,Q9UNU6_KNM(UniMod_35)FEFLK,Q9Y646_LALLVDTVGPR,Q9Y6R7_AGC(UniMod_4)VAESTAVC(UniMod_4)R,Q9Y6R7_GATTSPGVYELSSR
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,9104.27,60615.1,134795.0,24299.0,17715.2,,79119.1,53294.9,32483.1,,...,23016.0,,,31990.3,145993.0,65900.0,15382.0,,,19017.4
10053_12,10464.2,69318.8,171397.0,20462.6,15614.2,,20647.9,99466.8,38678.4,,...,28537.0,,,16099.4,155634.0,65668.1,,9295.65,,25697.8
10053_18,13235.7,85129.6,135717.0,21854.4,24246.1,,99758.3,98788.3,41892.1,,...,33160.9,5481.36,43275.5,66275.1,130156.0,59986.1,10813.3,,10438.7,18664.0
10138_12,12600.2,71619.8,114132.0,17616.5,20517.5,21857.2,98198.3,109087.0,41552.2,,...,31362.3,5538.51,47975.1,59557.7,116160.0,106564.0,26077.7,21441.8,,7642.42
10138_24,12003.2,92180.5,130398.0,27983.8,18453.9,14048.0,66186.7,125747.0,47139.4,,...,23527.3,3652.49,33256.6,49768.7,90044.6,59471.4,12639.2,15091.4,,6168.55


In [14]:
# Join clinical rows with the wide protein and peptide tables on visit_id.
# Both joins are inner joins, so only visits present in all three tables are
# kept; clinical visits without protein/peptide measurements are dropped.
clin_prot_df = clin_df.merge(temp_prot, how='inner', on='visit_id')
full_train_df = clin_prot_df.merge(temp_pep, how='inner', on='visit_id')

full_train_df.head()


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,O00391,O00533,...,Q9UBX5_DQPFTILYR,Q9UHG2_AEAQEAEDQQAR,Q9UHG2_ARAEAQEAEDQQAR,Q9UHG2_GEAAGAVQELAR,Q9UHG2_ILAGSADSEGVAAPR,Q9UKV8_SGNIPAGTTVDTK,Q9UNU6_KNM(UniMod_35)FEFLK,Q9Y646_LALLVDTVGPR,Q9Y6R7_AGC(UniMod_4)VAESTAVC(UniMod_4)R,Q9Y6R7_GATTSPGVYELSSR
0,55_0,55,0,10.0,6.0,15.0,,,11254.3,732430.0,...,51727.6,9481.83,81839.4,98654.3,218723.0,,29758.8,23833.7,,18953.5
1,55_6,55,6,8.0,10.0,34.0,,,13163.6,630465.0,...,44178.7,10315.9,75124.2,84806.6,199623.0,,22935.2,17722.5,,16642.7
2,55_12,55,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,...,52840.7,7138.93,61087.2,81764.2,168563.0,65762.6,29193.4,28536.1,,19290.9
3,55_36,55,36,17.0,18.0,51.0,0.0,On,13530.8,753832.0,...,58775.8,9959.98,75518.6,90704.0,201367.0,74976.1,31732.6,22186.5,,21717.1
4,942_6,942,6,8.0,2.0,21.0,,,11218.7,399518.0,...,18595.6,5938.09,56768.1,76512.3,167402.0,82335.5,24018.7,18939.5,5138.82,10112.4


## How many total patients are there?

In [15]:
# Count distinct patients in the clinical table, which holds every recorded visit.
# full_train_df is an inner join with the protein/peptide tables, so counting there
# would only include patients with at least one visit that has all three data types.
clin_df['patient_id'].nunique()

248

## How many patients have protein data?