# 1 - Load data

In [1]:
import pandas as pd
import numpy as np

# Load original data
original_data = pd.read_csv("./original_data.csv", sep=";")
# Replace ? with NaN
original_data = original_data.replace("?", np.nan)
original_data.shape

(402, 412)

In [2]:
original_data["moca"].dropna().astype("int64").describe()

count    400.000000
mean      26.737500
std        2.480721
min       21.000000
25%       25.000000
50%       27.000000
75%       29.000000
max       30.000000
Name: moca, dtype: float64

# 2 - Select original attributes

## 2.1 - Socio-demographic

In [3]:
original_socio_demographic_cols = ["patnum", "age", "sex", "pdonset", "durat_pd"]
original_socio_demographic = original_data[original_socio_demographic_cols]

## 2.2 - Hoehn-Yahr

In [4]:
original_hoehn_yahr_cols = ["hy"]
original_hoehn_yahr = original_data[original_hoehn_yahr_cols]

## 2.3 - NMSS

First we need to convert nmss variables from "object" to "int". Then, we generate the NMSS domains.

In [5]:
original_nmss_cols = [
    "nmss1f", "nmss1s",
    "nmss2f", "nmss2s",
    "nmss3f", "nmss3s",
    "nmss4f", "nmss4s",
    "nmss5f", "nmss5s",
    "nmss6f", "nmss6s",
    "nmss7f", "nmss7s",
    "nmss8f", "nmss8s",
    "nmss9f", "nmss9s",
    "nmss10f", "nmss10s",
    "nmss11f", "nmss11s",
    "nmss12f", "nmss12s",
    "nmss13f", "nmss13s",
    "nmss14f", "nmss14s",
    "nmss15f", "nmss15s",
    "nmss16f", "nmss16s",
    "nmss17f", "nmss17s",
    "nmss18f", "nmss18s",
    "nmss19f", "nmss19s",
    "nmss20f", "nmss20s",
    "nmss21f", "nmss21s",
    "nmss22f", "nmss22s",
    "nmss23f", "nmss23s",
    "nmss24f", "nmss24s",
    "nmss25f", "nmss25s",
    "nmss26f", "nmss26s",
    "nmss27f", "nmss27s",
    "nmss28f", "nmss28s",
    "nmss29f", "nmss29s",
    "nmss30f", "nmss30s",
]

original_nmss = original_data[original_nmss_cols]

for col in original_nmss_cols:
    original_nmss[col] = original_data[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## 2.4 - UPDRS - Motor

Manually select those variables that represent motor symptoms corresponding to the "Part 2: Part II: Motor Aspects of Experiences of Daily Living (M-EDL)", "Part 3: Motor examination", and "Part 4: Motor complications" domains.

In [6]:
original_updrs_motor_cols = [
    # M-EDL
    "mdsupdrs2_1", "mdsupdrs2_2", "mdsupdrs2_3", "mdsupdrs2_4", "mdsupdrs2_5", "mdsupdrs2_6", "mdsupdrs2_7", "mdsupdrs2_8", "mdsupdrs2_9",
    "mdsupdrs2_10", "mdsupdrs2_11", "mdsupdrs2_12", "mdsupdrs2_13",
    # Motor examination
    "mdsupdrs3_1","mdsupdrs3_2","mdsupdrs3_3a","mdsupdrs3_3b","mdsupdrs3_3c","mdsupdrs3_3d","mdsupdrs3_3e","mdsupdrs3_4a","mdsupdrs3_4b",
    "mdsupdrs3_5a","mdsupdrs3_5b","mdsupdrs3_6a","mdsupdrs3_6b","mdsupdrs3_7a","mdsupdrs3_7b","mdsupdrs3_8a","mdsupdrs3_8b","mdsupdrs3_9",
    "mdsupdrs3_10","mdsupdrs3_11","mdsupdrs3_12","mdsupdrs3_13","mdsupdrs3_14","mdsupdrs3_15a","mdsupdrs3_15b","mdsupdrs3_16a","mdsupdrs3_16b",
    "mdsupdrs3_17a","mdsupdrs3_17b","mdsupdrs3_17c","mdsupdrs3_17d","mdsupdrs3_17e","mdsupdrs3_18",
    # Motor complications
    "mdsupdrs4_1","mdsupdrs4_2","mdsupdrs4_3","mdsupdrs4_4","mdsupdrs4_5","mdsupdrs4_6"
]
original_updrs_motor = original_data[original_updrs_motor_cols]

for col in original_updrs_motor.columns:
    original_updrs_motor[col] = pd.to_numeric(original_updrs_motor[col])
    
for col in original_updrs_motor.columns:
    original_updrs_motor[col] = original_updrs_motor[col].astype("float64")

#original_updrs_motor.dtypes
#original_updrs_motor.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## 2.5 - Combine them all

Once selected, combine all data frames in a single one, which we are going to use to locate missing values

In [7]:
data_combined = pd.concat([original_socio_demographic, original_hoehn_yahr, original_nmss, original_updrs_motor], axis = 1)

# 3 - Missing data

Count missing values in each column, then filter those rows with misssing

In [8]:
missing_data = data_combined.isnull()
true_counts = [(column, missing_data[column].values.sum()) for column in missing_data.columns]
false_counts = [(column, (~missing_data[column].values).sum()) for column in missing_data.columns]
true_counts.sort(key=lambda x:x[1], reverse = True)

We observe a concentration of missing values on certain variables of the MDS-NMS and MDS-UPDRS scales

In [9]:
data_no_missing = data_combined.dropna()
data_no_missing.shape

(347, 118)

In [10]:
data_combined.dtypes

patnum          object
age              int64
sex              int64
pdonset          int64
durat_pd         int64
                ...   
mdsupdrs4_2    float64
mdsupdrs4_3    float64
mdsupdrs4_4    float64
mdsupdrs4_5    float64
mdsupdrs4_6    float64
Length: 118, dtype: object

# 4 - Generate final attributes
We have two datasets: 
* One with missing values <code>data_combined</code> (402, 96)
* One without them <code>data_no_missing</code> (352, 96)

By default we are going to select the dataset with missing values because our multidimensional clustering method can deal with them.

In [11]:
data = data_combined
#data = data_no_missing
data.shape

(402, 118)

## 4.1 - NMSS

For this, we are going to combine all the attributes of the same subscale (A, B, C, etc) into a single attribute, following the work of Chaudhuri et al. (2007). 

* **Cardiovascular** (2 attributes)
* **Sleep/Fatigue** (4 attributes)
* **Mood/Cognition** (6 attributes)
* **Perception/Hallucinations** (3 attributes)
* **Attention/Memory** (3 attributes)
* **Gastrointestinal** (3 attributes)
* **Urinary** (3 attributes)
* **Sexual** (2 attributes)
* **Smell** (1 attribute)
* **Weight change** (1 attribute)
* **Sweating** (1 attribute)

We are going to divide the Miscellaneous domain intro 3 attributes, which are easier to analyze. Reason being, each one of them represent a different aspect.

In [12]:
nmss_domain_names=["cardiovascular", "sleep_fatigue", "mood_cognition", "hallucinations", "attention_memory", 
            "gastrointestinal", "urinary", "sexual", "smell", "weight_change", "sweating"]

nmss = pd.DataFrame(columns=nmss_domain_names)

nmss["cardiovascular"] = ((data["nmss1f"] * data["nmss1s"]) + 
                          (data["nmss2f"] * data["nmss2s"])) / 2

nmss["sleep_fatigue"] = ((data["nmss3f"] * data["nmss3s"]) +
                        (data["nmss4f"] * data["nmss4s"]) +
                        (data["nmss5f"] * data["nmss5s"]) +
                        (data["nmss6f"] * data["nmss6s"])) / 4

nmss["mood_cognition"] = ((data["nmss7f"] * data["nmss7s"]) + 
           (data["nmss8f"] * data["nmss8s"]) +
           (data["nmss9f"] * data["nmss9s"]) +
           (data["nmss10f"] * data["nmss10s"]) +
           (data["nmss11f"] * data["nmss11s"]) +
           (data["nmss12f"] * data["nmss12s"])) / 6

nmss["hallucinations"] = ((data["nmss13f"] * data["nmss13s"]) + 
                         (data["nmss14f"] * data["nmss14s"]) + 
                         (data["nmss15f"] * data["nmss15s"])) / 3

nmss["attention_memory"] = ((data["nmss16f"] * data["nmss16s"]) +
                            (data["nmss17f"] * data["nmss17s"]) +
                            (data["nmss18f"] * data["nmss18s"])) / 3

nmss["gastrointestinal"] = ((data["nmss19f"] * data["nmss19s"]) +
                            (data["nmss20f"] * data["nmss20s"]) +
                            (data["nmss21f"] * data["nmss21s"])) / 3

nmss["urinary"] = ((data["nmss22f"] * data["nmss22s"]) +
                            (data["nmss23f"] * data["nmss23s"]) +
                            (data["nmss24f"] * data["nmss24s"])) / 3

nmss["sexual"] = ((data["nmss25f"] * data["nmss25s"]) +
                            (data["nmss26f"] * data["nmss26s"])) / 2

nmss["smell"] = data["nmss28f"] * data["nmss28s"]

nmss["weight_change"] = data["nmss29f"] * data["nmss29s"]

nmss["sweating"] = data["nmss30f"] * data["nmss30s"]

## 4.2 - MDS-UPDRS

There are multiple motor variables, in this case we are going to select the following key motor aspects:

* Speech (mdsupdrs3_1)
* Rigidity (max of mdsupdrs3_3a-e)
* Gait (mdsupdrs3_10)
* Freezing (mdsupdrs3_11)
* Postural stability (mdsuprds3_12)
* Postural tremor (max of mdsupdrs15a-b)
* Kinetic tremor (max of mdsupdrs16a-b)
* Rest tremor (max of mdsupdrs3_17a-d)
* Bradykinesia (max of mdsupdrs3_4,mdsupdrs3_8, mdsupdrs3_14)
* Dyskinesias (max of mdsupdrs4_1-4_2)
* Motor fluctuations (max of mdsupdrs4_3-4_6)

In [13]:
mds_updrs_names=["speech", "rigidity", "gait", "freezing", 
                 "postural_stability", "tremor_post", "tremor_kin","tremor_rest",
                 "bradykinesia", "dyskinesias", "motor_fluctuations"]

mds_updrs= pd.DataFrame(columns=mds_updrs_names)

mds_updrs["speech"] = data["mdsupdrs3_1"]

rigidity_cols = ["mdsupdrs3_3a", "mdsupdrs3_3b", "mdsupdrs3_3c", "mdsupdrs3_3d", "mdsupdrs3_3e"]
mds_updrs["rigidity"] = data[rigidity_cols].max(axis=1)

mds_updrs["gait"] = data["mdsupdrs3_10"]

mds_updrs["freezing"] = data["mdsupdrs3_11"]

mds_updrs["postural_stability"] = data["mdsupdrs3_12"]

postural_tremor_cols = ["mdsupdrs3_15a", "mdsupdrs3_15b"]
mds_updrs["tremor_post"] = data[postural_tremor_cols].max(axis=1)

kinetic_tremor_cols = ["mdsupdrs3_16a", "mdsupdrs3_16b"]
mds_updrs["tremor_kin"] = data[kinetic_tremor_cols].max(axis=1)

rest_tremor_cols = ["mdsupdrs3_17a", "mdsupdrs3_17b", "mdsupdrs3_17c", "mdsupdrs3_17d"]
mds_updrs["tremor_rest"] = data[rest_tremor_cols].max(axis=1)

bradykinesia_cols = ["mdsupdrs3_4a", "mdsupdrs3_4b", "mdsupdrs3_5a", "mdsupdrs3_5b", "mdsupdrs3_6a", "mdsupdrs3_6b", 
                     "mdsupdrs3_7a", "mdsupdrs3_7b", "mdsupdrs3_8a", "mdsupdrs3_8b", "mdsupdrs3_14"]
mds_updrs["bradykinesia"] = data[bradykinesia_cols].max(axis=1)

dyskinesias_cols = ["mdsupdrs4_1", "mdsupdrs4_2"]
mds_updrs["dyskinesias"] = data[dyskinesias_cols].max(axis=1)

motor_fluctuations_cols = ["mdsupdrs4_3", "mdsupdrs4_4", "mdsupdrs4_5", "mdsupdrs4_6"]
mds_updrs["motor_fluctuations"] = data[motor_fluctuations_cols].max(axis=1)

In [14]:
mds_updrs

Unnamed: 0,speech,rigidity,gait,freezing,postural_stability,tremor_post,tremor_kin,tremor_rest,bradykinesia,dyskinesias,motor_fluctuations
0,3.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,1.0,2.0
1,2.0,3.0,1.0,0.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0
2,1.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0
3,2.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,3.0
4,2.0,1.0,2.0,0.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
397,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
398,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,1.0,0.0,1.0
399,1.0,0.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0
400,2.0,0.0,1.0,0.0,0.0,2.0,0.0,2.0,3.0,0.0,0.0


## 4.3 - Socio-demographic

In [15]:
socio_demographic = pd.DataFrame(columns=original_socio_demographic_cols)
socio_demographic["patnum"] = data["patnum"].values
socio_demographic["age"] = data["age"].values
socio_demographic["sex"] = data["sex"].values
socio_demographic["pdonset"] = data["pdonset"].values
socio_demographic["durat_pd"] = data["durat_pd"].values

# Change sex codes from (0,1) to (male, female)
socio_demographic["sex"] = socio_demographic["sex"].astype("category")
socio_demographic["sex"].cat.categories = ["male", "female"]

socio_demographic.shape

(402, 5)

## 4.4 - Hoehn-Yahr

In [16]:
hoehn_yahr = pd.DataFrame(columns=original_hoehn_yahr_cols)
hoehn_yahr = data["hy"]
hoehn_yahr.shape

(402,)

# 5 - Impute missing data

First, we need to transform MDS_UPDRS columns to categorical format to allow the imputer to recognize them as such. Then combine both the nmss and mds_updrs variables data in a single dataset and impute all their missing values.

In [17]:
# categorical
from sklearn.impute import SimpleImputer

cat_mds_updrs = pd.DataFrame(columns=mds_updrs.columns)
for col in mds_updrs.columns:
    cat_mds_updrs[col] = mds_updrs[col].astype("category")
    
imputer = SimpleImputer(strategy="most_frequent")
imputed_mds_updrs = imputer.fit_transform(cat_mds_updrs)
imputed_mds_updrs = pd.DataFrame(imputed_mds_updrs, columns = mds_updrs.columns)

# numerical
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(random_state=0)
imputed_nmss = imputer.fit_transform(nmss)
imputed_nmss = pd.DataFrame(imputed_nmss, columns = nmss.columns)

# 6 - Combine and generate the final data frame

In [18]:
final_df = pd.concat([socio_demographic, hoehn_yahr], axis = 1)
final_df.to_csv("./mds_parkinson_info.csv", index=False, na_rep="?")
final_df.shape

(402, 6)

In [19]:
train_df = pd.concat([imputed_nmss, imputed_mds_updrs], axis = 1)
train_df.to_csv("./mds_parkinson_train.csv", index=False, na_rep="?")
train_df.shape

(402, 22)