In [None]:
import math
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the competition datasets into pandas DataFrames.
# The data directory can be overridden with the AMP_PD_DATA_DIR environment
# variable so the notebook is not tied to a single machine; the original local
# path is kept as the fallback so existing setups keep working unchanged.
path = Path(
    os.environ.get(
        "AMP_PD_DATA_DIR",
        "/Users/13392/Documents/amp-parkinsons-disease-progression-prediction",
    )
)
train_proteins = pd.read_csv(path / "train_proteins.csv")
train_peptides = pd.read_csv(path / "train_peptides.csv")
train_clinical = pd.read_csv(path / "train_clinical_data.csv")
supplemental_clinical = pd.read_csv(path / "supplemental_clinical_data.csv")

In [None]:
# Print the (rows, columns) shape of each loaded frame, one per line.
for frame in (train_clinical, train_peptides, train_proteins, supplemental_clinical):
    print(frame.shape)

In [None]:
# Preview the first 10 rows of the clinical data.
train_clinical.head(n=10)

In [None]:
# Preview the first 20 rows of the peptide data.
train_peptides.head(n=20)

In [None]:
# Preview the first 10 rows of the protein data.
train_proteins.head(n=10)

In [None]:
# Preview the first 10 rows of the supplemental clinical data.
supplemental_clinical.head(n=10)

Observations: 
1) visit_id is a combination of patient_id and visit_month.
2) NaN values will need to be dealt with.
3) In the "clinical data", visits seem to be recorded every 3 or 6 months, while in the "supplemental data" a visit seems to be recorded only every 36 months.


In [None]:
# Count the distinct patient_id values in each dataset and report each count
# right after it is computed.
report_template = 'There are {} number of unique patients in "{}"'

clinical_patient_count = train_clinical['patient_id'].nunique()
print(report_template.format(clinical_patient_count, 'train_clinical_data.csv'))

patient_count1 = train_peptides['patient_id'].nunique()
print(report_template.format(patient_count1, 'train_peptides.csv'))

patient_count2 = train_proteins['patient_id'].nunique()
print(report_template.format(patient_count2, 'train_proteins.csv'))

supplemental_patient_count = supplemental_clinical['patient_id'].nunique()
print(report_template.format(supplemental_patient_count, 'supplemental_clinical_data.csv'))

As we can see, all 248 clinical patients have their peptide and protein data recorded. 

In [None]:
# Show every distinct patient_id present in the clinical data.
train_clinical.loc[:, 'patient_id'].unique()

In [None]:
# All available clinical rows for a single patient: patient_id = 55
train_clinical.loc[train_clinical['patient_id'].eq(55)]

In [None]:
# All available clinical rows for another patient: patient_id = 942
train_clinical.loc[train_clinical['patient_id'].eq(942)]

In [None]:
# All available clinical rows for another patient: patient_id = 1517
train_clinical.loc[train_clinical['patient_id'].eq(1517)]

I notice a lot of NaN values in the patient data shown above, especially in the 'upd23b_clinical_state_on_medication' column, so let's count the NaN values in each column.

In [None]:
# Number of missing values in each column of the clinical data.
train_clinical.isnull().sum()

Out of 2615 entries for each column, "upd23b_clinical_state_on_medication" has over 50% of its values as NaN. Since it indicates whether a patient is "on" or "off" medication, there is no straightforward way to interpolate the missing values. I'm going to drop the column entirely, especially considering that the testing data will not have any medication data.

In [None]:
# Distinct 'updrs_4' scores and how many times each one occurs.
train_clinical.loc[:, 'updrs_4'].value_counts()

In [None]:
# Distinct values of the medication-state column and how many times each one occurs.
train_clinical.loc[:, 'upd23b_clinical_state_on_medication'].value_counts()

In [None]:
# Keep an untouched copy of the clinical frame for later access, then remove
# the medication-state column (over 50% NaN) from the working frame.
train_clinical_copy = train_clinical.copy(deep=True)
train_clinical.drop(columns=['upd23b_clinical_state_on_medication'], inplace=True)


In [None]:
# Display the clinical frame after the medication-state column was dropped.
train_clinical

In [None]:
# Fill the gaps in the four UPDRS score columns by linear interpolation;
# limit_direction='both' also fills leading and trailing NaNs.
# The interpolated column is assigned back to the frame instead of calling
# interpolate(inplace=True) on a column selection: that chained in-place form
# acts on a temporary Series, is deprecated, and leaves the frame unchanged
# under pandas Copy-on-Write.
# NOTE(review): interpolation runs down the rows of the whole frame, so values
# may be interpolated between consecutive rows of different patients — confirm
# this is acceptable.
for score_col in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
    train_clinical[score_col] = train_clinical[score_col].interpolate(
        method='linear', limit_direction='both'
    )

In [None]:
# Re-count missing values per column to check the result of the interpolation.
train_clinical.isnull().sum()

In [None]:
# Missing values per column in the peptide data.
train_peptides.isnull().sum()

In [None]:
# Missing values per column in the protein data.
train_proteins.isnull().sum()

In [None]:
# Missing values per column in the supplemental clinical data.
supplemental_clinical.isnull().sum()

In [None]:
# Remove the medication-state column from the supplemental data as well; the
# remaining columns are interpolated in the next cell.
supplemental_clinical.drop(columns=['upd23b_clinical_state_on_medication'], inplace=True)


In [None]:
# Fill the gaps in the four UPDRS score columns of the supplemental data by
# linear interpolation; limit_direction='both' also fills leading and trailing
# NaNs.
# The interpolated column is assigned back to the frame instead of calling
# interpolate(inplace=True) on a column selection: that chained in-place form
# acts on a temporary Series, is deprecated, and leaves the frame unchanged
# under pandas Copy-on-Write.
# NOTE(review): interpolation runs down the rows of the whole frame, so values
# may be interpolated between consecutive rows of different patients — confirm
# this is acceptable.
for score_col in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
    supplemental_clinical[score_col] = supplemental_clinical[score_col].interpolate(
        method='linear', limit_direction='both'
    )

In [None]:
# Re-count missing values per column in the supplemental data.
supplemental_clinical.isna().sum()

In [None]:
# Summary statistics for the visit month and the four UPDRS scores, shown with
# one row per column.
cols = ['visit_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
train_clinical.loc[:, cols].describe().transpose()

In [None]:
# Same summary statistics for the supplemental data (uses `cols` from the previous cell).
supplemental_clinical.loc[:, cols].describe().transpose()

Max follow-up for patients in "clinical Data" is 108 months, and max follow-up for patients in "supplemental data" is 36 months.

Let's see how many of the 'clinical' patients are followed up for 56 months (mean + 1 std) or more, and how many of the 'supplemental' patients are followed up for 36 months (max).

In [None]:
# Number of distinct clinical patients with at least one visit at or after the
# follow-up threshold (in months).
months_followed = 56
unique_pt_count = train_clinical.loc[
    train_clinical['visit_month'].ge(months_followed), 'patient_id'
].nunique()

In [None]:
# Number of distinct supplemental patients with a visit at month 36 or later.
unique_pt_count_suppl = supplemental_clinical.loc[
    supplemental_clinical['visit_month'].ge(36), 'patient_id'
].nunique()

In [None]:
# Report the share of patients with long follow-up in each dataset.
# The clinical threshold in the message is taken from `months_followed` (the
# value used for the filter) rather than a hard-coded 56, so the text cannot
# drift from the computation if the threshold is changed.
print(f'{unique_pt_count/clinical_patient_count*100:.2f}% in the clinical data are follwed up for {months_followed} months or more.')
print(f'{unique_pt_count_suppl/supplemental_patient_count*100:.2f}% in the supplemental data are follwed up for 36 months.')

In [None]:
# Tag every row with the dataset it came from, then stack the two frames into
# one with a fresh 0..n-1 index.
train_clinical['source'] = 'Clinical Data'
supplemental_clinical['source'] = 'Supplemental Data'

combined_data = pd.concat(
    objs=[train_clinical, supplemental_clinical],
    axis=0,
    ignore_index=True,
)

In [None]:
# Display the combined clinical + supplemental frame.
combined_data

In [None]:
# Total number of distinct patients across both sources.
combined_data.loc[:, 'patient_id'].nunique()

In [None]:
# Histogram (with KDE) of recorded visit months, split by data source.
# The style is set BEFORE the figure is created: seaborn styles work through
# matplotlib rcParams, which axes read when they are created, so calling
# set_style after plt.subplots would leave this figure unstyled.
sns.set_style('dark')
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,5))
sns.histplot(data=combined_data, x='visit_month', hue='source', kde=True, ax=ax, element='step')
ax.set_title('Recorded Visiting Months')
ax.set_ylabel('Count')
ax.set_xlabel('Visiting Month');

In [None]:
# One histogram (with KDE) per UPDRS score, split by data source.
# The style is set BEFORE the figure is created so the axes pick it up at
# creation time; this keeps the cell correct even when it is run without an
# earlier cell having already set the style.
cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
sns.set_style('dark')
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(15,20))

for x, col in enumerate(cols):
    ax = axs[x]
    sns.histplot(data=combined_data, x=col, hue='source', kde=True, ax=ax, element='step')
    ax.set_title(f'{col} Score')
    ax.set_ylabel('Count')
    ax.set_xlabel(f'{col} Score')

UPDRS_1 and UPDRS_4 are similarly distributed across the clinical data and the supplemental data, while UPDRS_2 and UPDRS_3 scores are not. UPDRS_2 and UPDRS_3 scores have a much higher percentage of '0'. Since the two datasets are so different (in score distribution, duration followed, and frequency of follow-up visits), should we be combining them and training a model on the combined data? My intuition tells me that a model should only be trained on the "clinical" dataset.

In [None]:
# Summary statistics of the protein data, one row per numeric column.
train_proteins.describe().transpose()

In [None]:
# Histogram of NPX values drawn on a log-scaled axis.
fig, ax = plt.subplots(figsize=(15, 5))
data = train_proteins['NPX']
sns.histplot(data, log_scale=True, color='g', ax=ax)
ax.set_title('Logarithmic Distribution of Normalized Protein Expression');

In [None]:
# Summary statistics of the peptide data, one row per numeric column.
train_peptides.describe().transpose()

In [None]:
# Histogram of PeptideAbundance values drawn on a log-scaled axis.
fig, ax = plt.subplots(figsize=(15, 5))
data = train_peptides['PeptideAbundance']
sns.histplot(data, log_scale=True, color='g', ax=ax)
ax.set_title('Logarithmic Distribution of Peptide Abundance');

In [None]:
# For each UPDRS score: individual points per visit month from the working
# clinical frame, overlaid with a red dotted point plot computed from the
# copy of the clinical frame taken before the column drop.
cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(15, 25))

for i, col in enumerate(cols):
    ax = axs[i]
    sns.stripplot(x='visit_month', y=col, data=train_clinical, ax=ax)
    sns.pointplot(
        x='visit_month',
        y=col,
        data=train_clinical_copy,
        color='r',
        linestyles=':',
        ax=ax,
    )


The trend in the data is not very encouraging: it shows relatively little of the disease progression one would expect with Parkinson's, i.e. symptoms worsening over time and UPDRS scores increasing. There is only a slight upward trend in UPDRS_1, _2, and _3.

What if we combine all four UPDRS scores?

In [None]:
# Total UPDRS score per row: the four part scores added left to right.
train_clinical['updrs_total'] = (
    train_clinical['updrs_1']
    .add(train_clinical['updrs_2'])
    .add(train_clinical['updrs_3'])
    .add(train_clinical['updrs_4'])
)

In [None]:
# Preview the first 5 rows, now including the 'updrs_total' column.
train_clinical.head(n=5)

In [None]:
# Box plot of the total UPDRS score for each visit month.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(x='visit_month', y='updrs_total', data=train_clinical, ax=ax)

Again, there is only a very slight upward trend of the total UPDRS score over 108 months of follow-up.

Let's now look at the protein data. We'll take the natural log of NPX, since the range of values is too wide otherwise.

In [None]:
# Add a natural-log NPX column, then draw its distribution for each visit month.
train_proteins['log(NPX)'] = train_proteins['NPX'].transform(np.log)

fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(x='visit_month', y='log(NPX)', data=train_proteins, ax=ax)

This likely indicates that the total protein expressed (NPX) shows very little correlation with disease progression, but maybe certain types of peptides and their amounts can be telling?