In [None]:
import pandas as pd
import numpy as np


PPMI_CLINICAL

In [None]:
# The PPMI exports have to be downloaded from https://www.ppmi-info.org/ and stored
# in the PPMI_clinical and PPMI_remote folders respectively.
# The date suffix in each file name must be adapted to match your own download.

# Read the five clinical tables, one DataFrame per export (same order as the names on the left).
age, demographics, pat_status, socio_eco, upsit = map(
    pd.read_csv,
    (
        "data/PPMI_clinical/Age_at_visit_24May2024.csv",
        "data/PPMI_clinical/Demographics_10Jun2024.csv",
        "data/PPMI_clinical/Participant_Status_24May2024.csv",
        "data/PPMI_clinical/Socio-Economics_24May2024.csv",
        "data/PPMI_clinical/University_of_Pennsylvania_Smell_Identification_Test_UPSIT_24May2024.csv",
    ),
)

In [None]:
# Column names that appear in every one of the five clinical tables.
intersect = set(age.columns).intersection(
    *(set(table.columns) for table in (demographics, pat_status, socio_eco, upsit))
)
print(intersect)


In [None]:
# Show the column index of each clinical table under a short heading.
for label, table in (
    ("Age", age),
    ("Demographics", demographics),
    ("Participant Status", pat_status),
    ("Socio-Economics", socio_eco),
    ("UPSIT", upsit),
):
    print(label)
    print(table.columns)



In [None]:
# Tables that are merged visit by visit.
dfs = [age, demographics, socio_eco, upsit]

# Outer-join them one after another on PATNO and EVENT_ID, so a row is kept
# whenever the (PATNO, EVENT_ID) pair occurs in at least one table.
remaining_tables = iter(dfs)
merged_df = next(remaining_tables)
for table in remaining_tables:
    merged_df = merged_df.merge(table, how='outer', on=['PATNO', 'EVENT_ID'])


In [None]:
# Participant status has no EVENT_ID, so it is joined on PATNO alone.
# Columns that clash keep the left-hand name; the right-hand copy gets a '_y' suffix.
merged_df = merged_df.merge(pat_status, how='outer', on=['PATNO'], suffixes=('', '_y'))

# Remove every column whose name ends in '_y'.
suffixed_cols = merged_df.filter(regex='_y$').columns
merged_df = merged_df.drop(columns=suffixed_cols)

In [None]:
# Save the merged clinical table as CSV; index=False keeps the DataFrame index out of the file.
merged_df.to_csv("data/merged_ppmi_clinical.csv", index=False)


PPMI_remote

In [None]:
# Read the remote-screening tables. Note that `upsit` is rebound here to the
# remote UPSIT export.
high_int, pat_prog, screen = map(
    pd.read_csv,
    (
        "data/PPMI_remote/Remote_Screening_High_Interest_24May2024.csv",
        "data/PPMI_remote/Remote_Screening_Participant_Progress_24May2024.csv",
        "data/PPMI_remote/Remote_Screening_Screener_24May2024.csv",
    ),
)
# The smell-test file is the only one read with low_memory=False.
smell = pd.read_csv(
    "data/PPMI_remote/Remote_Screening_Smell_Test_Direct_Screener_24May2024.csv",
    low_memory=False,
)
upsit_screen, upsit = map(
    pd.read_csv,
    (
        "data/PPMI_remote/Remote_Screening_UPSIT_Screening_24May2024.csv",
        "data/PPMI_remote/Remote_University_of_Pennsylvania_Smell_Identification_Test_24May2024.csv",
    ),
)


In [None]:
# Give the UPSIT entry timestamp its own column name (UPSIT_ORIG_ENTRY);
# this is the column later used to derive age.
upsit = upsit.rename(columns={"ORIG_ENTRY": "UPSIT_ORIG_ENTRY"})

# Show the column index of each remote table under a short heading.
for label, table in (
    ("High Interest", high_int),
    ("Participant Progress", pat_prog),
    ("Screen", screen),
    ("Smell Test", smell),
    ("UPSIT Screening", upsit_screen),
    ("UPSIT", upsit),
):
    print(label)
    print(table.columns)

In [None]:
# Outer-join all remote-screening tables on PATNO and EVENT_ID, one after another.
# Clashing columns keep the left-hand name; the right-hand copy gets a '_y' suffix.
remote_dfs = [high_int, pat_prog, screen, smell, upsit_screen, upsit]
remaining_tables = iter(remote_dfs)
remote_merged_df = next(remaining_tables)
for table in remaining_tables:
    remote_merged_df = remote_merged_df.merge(
        table, how='outer', on=['PATNO', 'EVENT_ID'], suffixes=('', '_y')
    )

# Remove every column whose name ends in '_y'.
suffixed_cols = remote_merged_df.filter(regex='_y$').columns
remote_merged_df = remote_merged_df.drop(columns=suffixed_cols)

print(remote_merged_df.shape)

In [None]:
# Save the merged remote-screening table as CSV; index=False keeps the DataFrame index out of the file.
remote_merged_df.to_csv("data/merged_ppmi_remote.csv", index=False)

In [None]:
# #save columns in txt
# with open("merged_ppmi_remote_columns.txt", "w") as f:
#     for col in remote_merged_df.columns:
#         f.write(col + "\n")
        
# with open("merged_ppmi_clinical_columns.txt", "w") as f:
#     for col in merged_df.columns:
#         f.write(col + "\n")

adding year of birth and age in years

In [None]:
import pandas as pd

In [None]:
# Reload the merged remote table saved above.
remote = pd.read_csv("data/merged_ppmi_remote.csv", low_memory=False)

# From the birth-year export only PATNO and BIRTHDT are needed.
year = pd.read_csv("data/PPMI_remote/Remote_Screening_BirthYear.csv")[["PATNO", "BIRTHDT"]]

# Distinct BIRTHDT values, as a quick look at what the column contains.
print(year["BIRTHDT"].unique())


In [None]:
# Attach BIRTHDT to the remote table by PATNO.
# The join is an inner join, so only PATNOs present in both frames are kept;
# the two printed shapes show the row count before and after.
# NOTE(review): if `year` holds more than one row per PATNO the merge will
# multiply rows — confirm PATNO is unique in the birth-year export.
print(remote.shape)
remote = remote.merge(year, on=["PATNO"], how="inner")

print(remote.shape)


In [None]:
# Parse the UPSIT entry timestamp and derive the calendar year of the test.
entry_dates = pd.to_datetime(remote["UPSIT_ORIG_ENTRY"])
test_year = entry_dates.dt.year

# AGE is the difference between the test year and BIRTHDT.
remote = remote.assign(
    UPSIT_ORIG_ENTRY=entry_dates,
    YEAR=test_year,
    AGE=test_year - remote["BIRTHDT"],
)

print(remote["YEAR"].unique())
print(remote["AGE"].unique())

In [None]:
# Save the remote table, now including BIRTHDT, YEAR and AGE.
# This writes to the same path the table was read from at the start of this
# section, so the file on disk is overwritten.
print(remote.shape)
remote.to_csv("data/merged_ppmi_remote.csv", index=False)

PART 2 - Computing Hyposmia Score

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the two merged tables produced in part 1.
remote, clinical = (
    pd.read_csv(path, low_memory=False)
    for path in ("data/merged_ppmi_remote.csv", "data/merged_ppmi_clinical.csv")
)

# Report (rows, columns) for each, remote first.
for frame in (remote, clinical):
    print(frame.shape)



In [None]:
def compute_hyposmia_ppmi(df):
    """Label hyposmia using the old, sex-specific PPMI UPSIT cutoffs.

    The frame is modified in place (``BIRTHSEX`` is cast to float and the
    ``HYPOSMIA_PPMI`` column is added or overwritten) and then returned.

    Labelling rule, per row:

    * ``BIRTHSEX == 1`` (male):   1 if ``TOTAL_CORRECT <= 33`` else 0
    * ``BIRTHSEX == 0`` (female): 1 if ``TOTAL_CORRECT <= 34`` else 0
    * any other or missing ``BIRTHSEX``: NaN

    A missing ``TOTAL_CORRECT`` never satisfies ``<=`` and is therefore
    labelled 0 when the sex is known.

    The labels are computed column-wise rather than row by row, so every row
    gets its own label even when index labels are repeated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``BIRTHSEX`` and ``TOTAL_CORRECT``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df["BIRTHSEX"] = df["BIRTHSEX"].astype(float)

    sex = df["BIRTHSEX"].to_numpy()
    score = df["TOTAL_CORRECT"].to_numpy(dtype=float, na_value=np.nan)

    is_male = sex == 1.0
    is_female = sex == 0.0

    # Male cutoff is 33, female cutoff is 34; rows with unknown sex are masked out below.
    cutoff = np.where(is_male, 33.0, 34.0)
    hyposmic = score <= cutoff  # comparisons against NaN are False

    df["HYPOSMIA_PPMI"] = np.where(is_male | is_female, hyposmic.astype(float), np.nan)
    return df

def compute_hyposmia(df):
    """Label hyposmia from age, sex and UPSIT total score.

    Cutoffs follow https://doi.org/10.1212/wnl.0000000000207077. A row is
    hyposmic (1) when ``TOTAL_CORRECT`` is at or below the cutoff of its age
    bracket:

        age bracket   female (BIRTHSEX 0)   male (BIRTHSEX 1)
        60 to <65            30                   26
        65 to <70            28                   24
        70 to <75            26                   23
        75 to <80            24                   21
        80 and older         22                   18

    Brackets are half-open, so fractional ages such as 64.5 or 79.9 belong to
    the bracket that starts at or below them. (With closed integer ranges like
    60-64 / 65-69 such ages match no bracket and are always labelled 0.)

    Other cases:

    * age below 60: a message with the row's ``PATNO`` is printed and the row
      is labelled 0 (no cutoff is applied);
    * missing age or missing score with a known sex: labelled 0;
    * ``BIRTHSEX`` other than 0 or 1 (including missing): NaN.

    The frame is modified in place (``AGE``, ``BIRTHSEX`` and ``TOTAL_CORRECT``
    are cast to float, ``HYPOSMIA`` is added or overwritten) and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AGE``, ``BIRTHSEX`` and ``TOTAL_CORRECT``; ``PATNO`` is
        read only when at least one row has an age below 60.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Inclusive lower bound of each age bracket, with the matching cutoffs.
    bracket_start = np.array([60.0, 65.0, 70.0, 75.0, 80.0])
    female_cutoff = np.array([30.0, 28.0, 26.0, 24.0, 22.0])
    male_cutoff = np.array([26.0, 24.0, 23.0, 21.0, 18.0])

    df["AGE"] = df["AGE"].astype(float)
    df["BIRTHSEX"] = df["BIRTHSEX"].astype(float)
    df["TOTAL_CORRECT"] = df["TOTAL_CORRECT"].astype(float)

    age = df["AGE"].to_numpy()
    sex = df["BIRTHSEX"].to_numpy()
    score = df["TOTAL_CORRECT"].to_numpy()

    # Report participants younger than the first bracket.
    under_60 = age < 60.0
    if under_60.any():
        for patno in df.loc[under_60, "PATNO"]:
            print(f"Age is less than 60: {patno}")

    # Index of the bracket each age falls into; NaN and under-60 ages are not
    # in any bracket (their placeholder index 0 is masked out by `in_bracket`).
    in_bracket = age >= bracket_start[0]
    bracket = np.zeros(len(age), dtype=int)
    bracket[in_bracket] = np.searchsorted(bracket_start, age[in_bracket], side="right") - 1

    is_female = sex == 0.0
    is_male = sex == 1.0
    cutoff = np.where(is_female, female_cutoff[bracket], male_cutoff[bracket])
    hyposmic = in_bracket & (score <= cutoff)  # comparisons against NaN are False

    df["HYPOSMIA"] = np.where(is_female | is_male, hyposmic.astype(float), np.nan)
    return df

In [None]:
# Show how often each SEX value occurs in the clinical table
# (value_counts leaves out missing values by default).
print(clinical["SEX"].value_counts())

In [None]:
# Collect the columns of each table whose name ends in the merge suffix "_x".
remote_cols = remote.columns
clinical_cols = clinical.columns
remote_suf = {name for name in remote_cols if name.endswith("_x")}
clinical_suf = {name for name in clinical_cols if name.endswith("_x")}
print(remote_suf)
print(clinical_suf)

# Only the remote table is cleaned here: every column ending in "_x" is dropped.
# The clinical columns are reported above but left in place.
remote = remote.drop(columns=remote.filter(regex='_x$').columns)

CLINICAL

In [None]:
from tqdm import tqdm

# Build one synthetic "BLSC" row per participant (PATNO):
#   * BL row present            -> copy of the first BL row; when an SC row also
#                                  exists, every missing value is filled from the
#                                  first SC row
#   * only an SC row present    -> copy of the first SC row
#   * neither BL nor SC present -> participant is skipped
# If SEX is still missing afterwards it is taken from the participant's first
# TRANS row, when there is one.
clinical = clinical.sort_values(by=["PATNO", "EVENT_ID"])

# Number of distinct participants.
patnos = clinical["PATNO"].unique()
print(len(patnos))

new_rows = []
# groupby hands over each participant's visits in a single pass over the table
# instead of re-filtering the whole frame once per PATNO. Groups come out in
# ascending PATNO order and keep the row order established by the sort above.
for patno, patno_rows in tqdm(clinical.groupby("PATNO")):
    bl_rows = patno_rows[patno_rows["EVENT_ID"] == "BL"]
    sc_rows = patno_rows[patno_rows["EVENT_ID"] == "SC"]
    trans_rows = patno_rows[patno_rows["EVENT_ID"] == "TRANS"]

    if bl_rows.shape[0] > 0:
        new_row = bl_rows.iloc[0].copy()
        new_row["EVENT_ID"] = "BLSC"
        if sc_rows.shape[0] > 0:
            # Fill whatever the baseline row is missing from the screening row.
            sc_row = sc_rows.iloc[0]
            missing = new_row.isna()
            new_row[missing] = sc_row[missing]
    elif sc_rows.shape[0] > 0:
        new_row = sc_rows.iloc[0].copy()
        new_row["EVENT_ID"] = "BLSC"
    else:
        continue

    # SEX may be absent from both BL and SC; fall back to the first TRANS row.
    if pd.isnull(new_row["SEX"]) and trans_rows.shape[0] > 0:
        new_row["SEX"] = trans_rows["SEX"].iloc[0]
    new_rows.append(new_row)

print(len(new_rows))

# Turn the collected rows into a DataFrame.
clinical_new = pd.DataFrame(new_rows)
print(clinical_new.shape)

In [None]:
from collections import Counter

# Continue with the one-row-per-participant BLSC table.
clinical = clinical_new
print(clinical.shape)

# Distribution of EVENT_ID values before any filtering.
print(clinical["EVENT_ID"].value_counts())
print("cohort and concohort")

# Keep rows whose CONCOHORT is 1; where CONCOHORT is missing, COHORT must be 1 instead.
clinical = clinical.loc[(clinical['CONCOHORT'] == 1) | ((clinical['CONCOHORT'].isna()) & (clinical['COHORT'] == 1))]

print(clinical.shape)
print("removed screen failed")
# Remove screen failures.
clinical = clinical[clinical["ENROLL_STATUS"] != "Screen failed"]

# Keep rows with AGE_AT_VISIT of at least 60.
print(clinical.shape)
print("AGE")
clinical = clinical[clinical["AGE_AT_VISIT"] >= 60]
print(clinical.shape)
print("UPSITFORM")
# Keep rows whose UPSITFORM equals 2.
clinical = clinical[clinical["UPSITFORM"] == 2.0]
print(clinical.shape)

# Use the column names that compute_hyposmia / compute_hyposmia_ppmi read.
clinical = clinical.rename(columns={"AGE_AT_VISIT": "AGE", "SEX": "BIRTHSEX"})

# All columns ending in _CORRECT except the TOTAL_CORRECT sum itself.
clinical_cols = [col for col in clinical.columns if col.endswith("_CORRECT")]
clinical_cols.remove("TOTAL_CORRECT")

print(clinical.shape)

# Number of missing values across clinical_cols for each PATNO, computed in one
# vectorized pass (row-wise null count, summed per participant).
missing_per_row = clinical[clinical_cols].isnull().sum(axis=1)
missing_dict = missing_per_row.groupby(clinical["PATNO"]).sum().to_dict()

# How many participants have 0, 1, 2, ... missing values.
missing_counter = Counter(missing_dict.values())
print(missing_counter)



In [None]:
# Independent copy of the filtered clinical table, taken before the
# missing-value filter and the KNN imputation that follow; it is processed
# and saved separately as the non-imputed version.
clinical_orig = clinical.copy()

In [None]:
# Keep only participants with fewer than 3 missing values across the
# _CORRECT item columns (i.e. at most 2; the original note gives this as 5%).
# missing_dict maps PATNO -> number of missing values.
print(clinical.shape)
print("missing values")
clinical = clinical[clinical["PATNO"].map(missing_dict) < 3]
print(clinical.shape)


In [None]:
from sklearn.impute import KNNImputer

# Impute the remaining missing item values with k-nearest-neighbours (k = 4),
# using only the item columns themselves as features.
print(clinical.shape)
print("imput missing values")
imputer = KNNImputer(n_neighbors=4)

# `clinical` is the result of boolean filtering; take an explicit copy so the
# column assignment below writes to an independent frame instead of a possible
# view of the unfiltered data (avoids pandas' SettingWithCopyWarning).
clinical = clinical.copy()
# NOTE(review): imputed values are derived from neighbouring rows and may be
# fractional — confirm that non-integer item scores are acceptable downstream.
clinical[clinical_cols] = imputer.fit_transform(clinical[clinical_cols])

In [None]:

# Imputed table: recompute TOTAL_CORRECT as the row-wise sum of the item
# columns, then drop any row that still has a missing item value.
clinical = clinical.assign(
    TOTAL_CORRECT=clinical[clinical_cols].sum(axis=1)
).dropna(subset=clinical_cols, how='any')

# Add both hyposmia labels (age/sex-based first, then the old PPMI cutoffs).
print(clinical.shape)
clinical = compute_hyposmia_ppmi(compute_hyposmia(clinical))
print(clinical.shape)

for label_col in ("HYPOSMIA", "HYPOSMIA_PPMI"):
    print(clinical[label_col].value_counts(dropna=False))

# Non-imputed table: same steps, so rows with any missing item are removed.
clinical_orig = clinical_orig.assign(
    TOTAL_CORRECT=clinical_orig[clinical_cols].sum(axis=1)
).dropna(subset=clinical_cols, how='any')
clinical_orig = compute_hyposmia_ppmi(compute_hyposmia(clinical_orig))
print(clinical_orig.shape)
 


In [None]:
# Label distribution in the imputed clinical table, missing values included.
print(clinical["HYPOSMIA"].value_counts(dropna=False))
print(clinical["HYPOSMIA_PPMI"].value_counts(dropna=False))
# Optional check (disabled): distribution of TOTAL_CORRECT
#print(clinical["TOTAL_CORRECT"].value_counts(dropna=False))
# Optional check (disabled): distribution of AGE
#age_count =clinical["AGE"].value_counts(dropna=False)
# Distribution of BIRTHSEX, missing values included.
print(clinical["BIRTHSEX"].value_counts(dropna=False))


In [None]:
import os

# Make sure the output folder exists. exist_ok=True turns the call into a
# no-op when the folder is already there, so no separate existence check
# (and no gap between checking and creating) is needed.
os.makedirs("data/processed", exist_ok=True)

# Save the imputed and the non-imputed clinical tables.
clinical.to_csv("data/processed/clinical_processed_imputed.csv", index=False)
clinical_orig.to_csv("data/processed/clinical_processed.csv", index=False)

REMOTE

In [None]:
# Keep an independent copy of the remote table so the filtering cell that
# follows can start again from it.
remotecopy=remote.copy()

In [None]:

# Start from a fresh copy of the backup. A plain `remote = remotecopy` would
# only alias the backup, so the TOTAL_CORRECT assignment below would modify
# the backup as well.
# Selection for remote: HIQ1 = 0; HIQ2 = 0; HIQ4 = 0 (or 1 if not enough); check hyposmia
remote = remotecopy.copy()

# All columns ending in _CORRECT except the TOTAL_CORRECT sum itself.
remote_cols = [col for col in remote.columns if col.endswith("_CORRECT")]
remote_cols.remove("TOTAL_CORRECT")
print(remote_cols)
print(len(remote_cols))

# Recompute TOTAL_CORRECT as the row-wise sum of the item columns.
remote["TOTAL_CORRECT"] = remote[remote_cols].sum(axis=1)

# Keep participants aged 60 or older.
print(remote.shape)
print("AGE")
remote = remote[remote["AGE"] >= 60]

# Keep rows where HIQ1, HIQ2 and HIQ4 are all 0, reporting the shape after each step.
print(remote.shape)
print("HIQ1")
remote = remote[remote["HIQ1"] == 0]
print(remote.shape)
print("HIQ2")
remote = remote[remote["HIQ2"] == 0]
print(remote.shape)
print("HIQ4")
remote = remote[remote["HIQ4"] == 0]

# Drop rows with a missing value in any item column.
print(remote.shape)
print("removing missing data from any column of UPSIT")
remote = remote.dropna(subset=remote_cols, how='any')
print(remote.shape)

In [None]:
# Add both hyposmia labels to the remote table: HYPOSMIA (age/sex-based
# cutoffs) and HYPOSMIA_PPMI (old PPMI cutoffs), then report the shape.
remote = compute_hyposmia(remote)
remote = compute_hyposmia_ppmi(remote)
print(remote.shape)

In [None]:
# Label distribution in the remote table, missing values included.
print(remote["HYPOSMIA"].value_counts(dropna=False))
print(remote["HYPOSMIA_PPMI"].value_counts(dropna=False))

In [None]:


# Save the processed remote table. The data/processed folder must already
# exist; it is created by the cell that saves the clinical tables.
remote.to_csv("data/processed/remote_processed.csv", index=False)

HYPOSMIA

In [None]:
# Hyposmia cutoffs on the UPSIT total score, by sex and age bracket.
#
# Keys are the inclusive lower bound of each age bracket:
#   50 -> 50-54, 55 -> 55-59, 60 -> 60-64, 65 -> 65-69,
#   70 -> 70-74, 75 -> 75-79, 80 -> 80 and older.
# Integer lower bounds are used because a key written as `50-54` is a
# subtraction that evaluates to -4; every bracket would then share the key -4
# and overwrite the previous one.

# Female participants.
Fem_Hyposmia_cutoff = {
    50: 32,
    55: 32,
    60: 30,
    65: 28,
    70: 26,
    75: 24,
    80: 22,
}

# Male participants.
Mal_Hyposmia_cutoff = {
    50: 30,
    55: 28,
    60: 26,
    65: 24,
    70: 23,
    75: 21,
    80: 18,
}

GETTING DURATION OF DISEASE AT TIME OF UPSIT

In [None]:
import pandas as pd

In [None]:
# Read the clinical UPSIT export and the PD diagnosis history.
# `upsit` is rebound here to the clinical (not the remote) UPSIT table.
upsit = pd.read_csv("data/PPMI_clinical/University_of_Pennsylvania_Smell_Identification_Test_UPSIT_24May2024.csv")
diag_hist = pd.read_csv('data/PPMI_clinical/PD_Diagnosis_History_11Jun2024.csv')

In [None]:
# ORIG_ENTRY in the UPSIT table is used as the date of the test.
upsit = upsit.assign(ORIG_ENTRY=pd.to_datetime(upsit["ORIG_ENTRY"]))
print(upsit["ORIG_ENTRY"])

# PDDXDT is stored as abbreviated month and two-digit year (mon-yy).
diag_hist = diag_hist.assign(PDDXDT=pd.to_datetime(diag_hist["PDDXDT"], format="%b-%y"))
print(diag_hist["PDDXDT"])

# Pair every UPSIT record with the diagnosis history of the same participant;
# clashing column names from diag_hist get a '_y' suffix.
merged = upsit.merge(diag_hist, how="inner", on=["PATNO"], suffixes=('', '_y'))
for summary in (merged.shape, merged.columns, merged["PDDXDT"], merged["ORIG_ENTRY"]):
    print(summary)

# Disease duration at the time of the test, in days (shown in years below).
time_since_diagnosis = merged["ORIG_ENTRY"] - merged["PDDXDT"]
merged["PD_Duration"] = time_since_diagnosis.dt.days
print(merged["PD_Duration"]/365)

In [None]:
# Reload the processed clinical tables (imputed and non-imputed).
clinical = pd.read_csv("data/processed/clinical_processed_imputed.csv")
clinical_orig = pd.read_csv("data/processed/clinical_processed.csv")
print(clinical.shape)

# Attach PD_Duration by PATNO; the left join keeps every clinical row.
# NOTE(review): `merged` can hold several rows per PATNO (one per UPSIT record),
# in which case this join repeats clinical rows — compare the shapes printed
# before and after to confirm.
duration = merged[["PATNO", "PD_Duration"]]
clinical = clinical.merge(duration, how="left", on=["PATNO"])
clinical_orig = clinical_orig.merge(duration, how="left", on=["PATNO"])
for frame in (clinical, clinical_orig):
    print(frame.shape)

# Distribution of PD_Duration, missing values included.
print(clinical["PD_Duration"].value_counts(dropna=False))

# Write both tables back to the files they were read from.
clinical.to_csv("data/processed/clinical_processed_imputed.csv", index=False)
clinical_orig.to_csv("data/processed/clinical_processed.csv", index=False)
