## Libraries and Packages
 - Python version: 3.7.7
 - IPython version: 7.16.1
 - Pandas version: 1.0.5
 - scikit-learn version: 0.23.1

In [1]:
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier

## Data Import

In [2]:
# Columns pulled from the flat file: the three cohort-filter columns, the
# target (parktype_case), and the feature columns used later on.
columns_to_load = [
    'excluded', 'research_auth', 'case_control',
    'sex', 'race', 'parktype_case',
    'cardinal_sym___tremor', 'cardinal_sym___brady', 'cardinal_sym___reflex',
    'cardinal_sym___rigid', 'cardinal_sym___asymmetry',
    'synucleinopathy_type', 'dlb_dementia',
    'dyskinesia', 'rem_sleep', 'rem_sleep_without_atonia',
    'restless_legs_syndrome', 'obstructive_sleep_apnea',
    'primary_snoring', 'narcolepsy', 'insomnia',
    'hypersomnolence', 'hypnotic_med_use', 'erdys',
    'constipation', 'constipation_longstanding',
    'anosmia', 'anosmia_longstanding', 'neurogenic_bladder',
    'neurogenic_longstanding',
]

flatfile_df = pd.read_csv('2021-02-17_full_data_flatfile.csv',
                          usecols=columns_to_load,
                          low_memory=False)

# Row-level conditions, each evaluated once on the freshly loaded frame
has_target = flatfile_df['parktype_case'].notna()              # y must be present
flagged_excluded = flatfile_df['excluded'].isin([1, 2])
lacks_research_auth = flatfile_df['research_auth'].isin(['N'])
is_control = flatfile_df['case_control'].isin(['control'])

rows_to_keep = has_target & ~flagged_excluded & ~lacks_research_auth & ~is_control

# Keep the qualifying rows, then drop the columns that were only needed for filtering
original_df = (
    flatfile_df[rows_to_keep]
    .drop(columns=['excluded', 'research_auth', 'case_control'])
)

In [3]:
# Sanity check: (rows, columns) left after the row filters and column drops above
original_df.shape

(658, 27)

## Analysis Plan

<img src='Figure1_Pipeline_4.png'>

# Preprocessing

### Encoding of Categoricals

In [4]:
# Binary encoding and re-encoding, expressed as lookup tables.
# Columns are added in the order: sex, dlb_dementia, dyskinesia,
# constipation, anosmia, parktype_case.
df_encoded = pd.DataFrame()

# sex: 'M' -> 1, 'F' -> 0
sex_codes = {'M': 1, 'F': 0}
df_encoded['sex'] = original_df['sex'].replace(sex_codes)

# dlb_dementia: these three specific date strings are collapsed to the flag 1.
# NOTE(review): any other date string would pass through unchanged - confirm
# these are the only dates present in the column.
dlb_dementia_dates = ['1995-12-07', '1998-12-11', '2009-01-07']
df_encoded['dlb_dementia'] = original_df['dlb_dementia'].replace(dlb_dementia_dates, 1)

# Code 3.0 is folded into 1.0 for dyskinesia, and into 0.0 for constipation / anosmia
df_encoded['dyskinesia'] = original_df['dyskinesia'].replace(3.0, 1.0)
for recoded_col in ('constipation', 'anosmia'):
    df_encoded[recoded_col] = original_df[recoded_col].replace(3.0, 0.0)

# Target: diagnosis string -> integer class code (1..11)
parktype_codes = {
    'als_parkinsonism': 1,
    'corticobasal_syndrome': 2,
    'drug_induced_parkinsons_disease': 3,
    'lewy_body_disease': 4,
    'multi_system_atrophy': 5,
    'parkinsons_disease': 6,
    'parkinsons_disease_in_dementia': 7,
    'parkinsons_disease_unspecified_insufficient_data': 8,
    'progressive_supranuclear_palsy': 9,
    'surgical_parkinsons_disease': 10,
    'vascular_parkinsons_disease': 11,
}
df_encoded['parktype_case'] = original_df['parktype_case'].replace(parktype_codes)

### Missing Data

In [5]:
# Custom NAs (unique because of the hand data curation nuances and assumptions)
#
# Start from an explicit copy of the encoded frame. A plain
# `df_filled = df_encoded` only creates a second name for the same object,
# so every column added below would silently appear in df_encoded as well.
df_filled = df_encoded.copy()

# Columns that were re-encoded earlier: a missing value is treated as 0.0
encoded_fill_zero = ['dlb_dementia', 'dyskinesia', 'constipation', 'anosmia']
for col in encoded_fill_zero:
    df_filled[col] = df_encoded[col].fillna(0.0)

# Columns taken straight from the source frame: a missing value is treated as 0.0.
# The list order determines the column order of df_filled.
original_fill_zero = [
    'synucleinopathy_type',
    'rem_sleep', 'rem_sleep_without_atonia', 'restless_legs_syndrome',
    'obstructive_sleep_apnea', 'primary_snoring', 'narcolepsy', 'insomnia',
    'hypersomnolence', 'hypnotic_med_use', 'erdys',
    'constipation_longstanding', 'anosmia_longstanding',
    'neurogenic_bladder', 'neurogenic_longstanding',
]
for col in original_fill_zero:
    df_filled[col] = original_df[col].fillna(0.0)

# Add in remaining columns to active dataframe. Their missing values are left
# in place here (no fillna) for the iterative imputer in the next cell.
passthrough_cols = [
    'race',
    'cardinal_sym___tremor', 'cardinal_sym___brady', 'cardinal_sym___reflex',
    'cardinal_sym___rigid', 'cardinal_sym___asymmetry',
]
for col in passthrough_cols:
    df_filled[col] = original_df[col]

In [6]:
# Iterative Imputer for remaining missing data points.
#
# Both the random forest and the imputer are stochastic, so they share a fixed
# seed to make the imputed values reproducible across kernel restarts.
RANDOM_STATE = 42

itimp = IterativeImputer(estimator=RandomForestClassifier(random_state=RANDOM_STATE),
                         max_iter=50,
                         random_state=RANDOM_STATE)

# NOTE(review): df_filled still contains the target column 'parktype_case' at
# this point, so the label is used as a predictor while imputing the features.
# Confirm this is intended before using the result for predictive modelling.

# Fit and transform exactly once: the fit is expensive (up to max_iter rounds,
# one forest per incomplete column per round) and the result is all we need.
df_imputed = pd.DataFrame(itimp.fit_transform(df_filled),
                          columns=list(df_filled),
                          index=df_filled.index)

In [7]:
# Per-column count of values still missing after imputation
df_imputed.isna().sum()

sex                          0
dlb_dementia                 0
dyskinesia                   0
constipation                 0
anosmia                      0
parktype_case                0
synucleinopathy_type         0
rem_sleep                    0
rem_sleep_without_atonia     0
restless_legs_syndrome       0
obstructive_sleep_apnea      0
primary_snoring              0
narcolepsy                   0
insomnia                     0
hypersomnolence              0
hypnotic_med_use             0
erdys                        0
constipation_longstanding    0
anosmia_longstanding         0
neurogenic_bladder           0
neurogenic_longstanding      0
race                         0
cardinal_sym___tremor        0
cardinal_sym___brady         0
cardinal_sym___reflex        0
cardinal_sym___rigid         0
cardinal_sym___asymmetry     0
dtype: int64

### Class Balance Check

In [8]:
# Separate the target column from the feature columns
y = df_imputed['parktype_case']
X = df_imputed.drop(columns=['parktype_case'])

# Report how many instances carry each class code 1..11, one "code: count" line each
y_list = list(y)
for class_code in range(1, 12):
    print(f'{class_code}: {y_list.count(class_code)}')

1: 1
2: 4
3: 47
4: 81
5: 16
6: 304
7: 56
8: 115
9: 19
10: 2
11: 13


Classes are significantly imbalanced. For Clustering Analysis, drop all classes with 10 or fewer instances. For developing a Predictive Model, leave all instances in initially to enable a one-vs-rest classification approach. 

### Categorical Variable Encoding

In [9]:
# One-hot encode the listed categorical columns.
# Note, binary categoricals were not encoded.
categoricals_to_encode = [
    'dyskinesia',
    'constipation',
    'anosmia',
    'synucleinopathy_type',
    'race',
]

final_df = pd.get_dummies(data=df_imputed, columns=categoricals_to_encode)

# Serialize final dataframe

In [10]:
# NOTE(review): this rebinds final_df to df_imputed, discarding the
# pd.get_dummies result assigned to final_df in the previous cell, so the
# frame serialized below is NOT one-hot encoded. Confirm this is intentional.
final_df = df_imputed

In [11]:
# Write final_df to 'preprocessed_df.pkl' in the current working directory
final_df.to_pickle('preprocessed_df.pkl')