# Predicting Parkinson’s Disease Progression Data Analysis

### Import Libraries

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import matplotlib as mpl
from matplotlib import pyplot as plt

### Import Data

In [9]:
# Locations of the three training CSV files (relative paths).
train_clinical_path = "PD-datasets/train_clinical_data.csv"
train_peptides_path = "PD-datasets/train_peptides.csv"
train_proteins_path = "PD-datasets/train_proteins.csv"

# Read each file into its own DataFrame.
train_clinical = pd.read_csv(train_clinical_path)
train_peptides = pd.read_csv(train_peptides_path)
train_proteins = pd.read_csv(train_proteins_path)

### Study Data

In [10]:
# Report (rows, columns) for each loaded table, one line per table.
for label, frame in (
    ('train_clinical shape :', train_clinical),
    ('train_peptide shape :', train_peptides),
    ('train_protein shape :', train_proteins),
):
    print(label, frame.shape)

train_clinical shape : (2615, 8)
train_peptide shape : (981834, 6)
train_protein shape : (232741, 5)


In [11]:
# Print a heading, then the clinical table's summary
# (column names, non-null counts, dtypes, memory usage).
print('train_clinical info :\n \n')
train_clinical.info()

train_clinical info :
 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2615 entries, 0 to 2614
Data columns (total 8 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   visit_id                             2615 non-null   object 
 1   patient_id                           2615 non-null   int64  
 2   visit_month                          2615 non-null   int64  
 3   updrs_1                              2614 non-null   float64
 4   updrs_2                              2613 non-null   float64
 5   updrs_3                              2590 non-null   float64
 6   updrs_4                              1577 non-null   float64
 7   upd23b_clinical_state_on_medication  1288 non-null   object 
dtypes: float64(4), int64(2), object(2)
memory usage: 163.6+ KB


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the visit month.
train_clinical.loc[:, "visit_month"].describe()

count    2615.000000
mean       31.190822
std        25.199053
min         0.000000
25%        10.500000
50%        24.000000
75%        48.000000
max       108.000000
Name: visit_month, dtype: float64

In [13]:
# Summary statistics for the updrs_1 score column.
train_clinical.loc[:, "updrs_1"].describe()

count    2614.000000
mean        7.110559
std         5.525955
min         0.000000
25%         3.000000
50%         6.000000
75%        10.000000
max        33.000000
Name: updrs_1, dtype: float64

In [14]:
# Summary statistics for the updrs_2 score column.
train_clinical.loc[:, "updrs_2"].describe()

count    2613.00000
mean        6.74359
std         6.32323
min         0.00000
25%         1.00000
50%         5.00000
75%        10.00000
max        40.00000
Name: updrs_2, dtype: float64

In [15]:
# Summary statistics for the updrs_3 score column.
train_clinical.loc[:, "updrs_3"].describe()

count    2590.000000
mean       19.421236
std        15.000289
min         0.000000
25%         6.000000
50%        19.000000
75%        29.000000
max        86.000000
Name: updrs_3, dtype: float64

In [16]:
# Summary statistics for the updrs_4 score column.
train_clinical.loc[:, "updrs_4"].describe()

count    1577.000000
mean        1.861763
std         3.022112
min         0.000000
25%         0.000000
50%         0.000000
75%         3.000000
max        20.000000
Name: updrs_4, dtype: float64

In [17]:
# Print a heading, then the peptide table's summary
# (column names, non-null counts, dtypes, memory usage).
print('train_peptides info :\n \n')
train_peptides.info()

train_peptides info :
 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981834 entries, 0 to 981833
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   visit_id          981834 non-null  object 
 1   visit_month       981834 non-null  int64  
 2   patient_id        981834 non-null  int64  
 3   UniProt           981834 non-null  object 
 4   Peptide           981834 non-null  object 
 5   PeptideAbundance  981834 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 44.9+ MB


In [18]:
# Number of distinct peptide sequences (missing values are not counted).
train_peptides["Peptide"].nunique(dropna=True)

968

In [19]:
# Number of distinct patients present in the peptide table.
train_peptides["patient_id"].nunique(dropna=True)

248

In [20]:
# Print a heading, then the protein table's summary
# (column names, non-null counts, dtypes, memory usage).
print('train_proteins info :\n \n')
train_proteins.info()

train_proteins info :
 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232741 entries, 0 to 232740
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   visit_id     232741 non-null  object 
 1   visit_month  232741 non-null  int64  
 2   patient_id   232741 non-null  int64  
 3   UniProt      232741 non-null  object 
 4   NPX          232741 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 8.9+ MB


In [21]:
# Number of distinct patients present in the protein table.
train_proteins["patient_id"].nunique(dropna=True)

248

## Cleaning column names

In [22]:
# Map the raw snake_case column names to human-readable display labels.
clinical_column_labels = {
    'visit_id': 'Visit ID',
    'patient_id': 'Patient ID',
    'visit_month': 'Visit Month',
    'updrs_1': 'UPDRS 1',
    'updrs_2': 'UPDRS 2',
    'updrs_3': 'UPDRS 3',
    'updrs_4': 'UPDRS 4',
    'upd23b_clinical_state_on_medication': 'Clinical State on Medication UPD23B',
}

# `train_clinical` is rebound here: after this cell its columns must be
# referenced by the display labels above, not the original names.
train_clinical = train_clinical.rename(columns=clinical_column_labels)
train_clinical.head()

Unnamed: 0,Visit ID,Patient ID,Visit Month,UPDRS 1,UPDRS 2,UPDRS 3,UPDRS 4,Clinical State on Medication UPD23B
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


### Are there differences in the UPDRS scores of PD patients receiving medication versus those who do not receive medication?

In [38]:
# Histogram of visit months, with bars coloured by the medication-state column.
fig = px.histogram(
    train_clinical,
    x='Visit Month',
    color='Clinical State on Medication UPD23B',
    title="Count of Visit Month",
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=1000,
    height=500,
).update_layout(title_font_size=15)  # update_layout returns the same figure
fig.show()

In [33]:
 fig=px.histogram(train_clinical,
                     x = 'UPDRS 1',
                     color = 'Clinical State on Medication UPD23B',
                     title= 'Count of UPDR 1 Scores of Parkinson’s Disease Patients',
                     color_discrete_sequence=px.colors.qualitative.Vivid,
                     width=800, height=500)
align = "left"
fig.update_layout(title_font_size=15)
fig.show()

In [34]:
 fig=px.histogram(train_clinical,
                     x = 'UPDRS 2',
                     color = 'Clinical State on Medication UPD23B',
                     title= 'Count of UPDR 2 Scores of Parkinson’s Disease Patients',
                     color_discrete_sequence=px.colors.qualitative.Vivid,
                     width=800, height=500)
fig.update_layout(title_font_size=15)
fig.show()

In [35]:
 fig=px.histogram(train_clinical,
                     x = 'UPDRS 3',
                     color = 'Clinical State on Medication UPD23B',
                     title= 'Count of UPDR 3 Scores of Parkinson’s Disease Patients',
                     color_discrete_sequence=px.colors.qualitative.Vivid,
                     width=800, height=500)
fig.update_layout(title_font_size=15)
fig.show()

In [36]:
 fig=px.histogram(train_clinical,
                     x = 'UPDRS 4',
                     color = 'Clinical State on Medication UPD23B',
                     title= 'Count of UPDR 4 Scores of Parkinson’s Disease Patients',
                     color_discrete_sequence=px.colors.qualitative.Vivid,
                     width=800, height=500)
fig.update_layout(title_font_size=15)
fig.show()