In [None]:
import numpy as np, pandas as pd

In [None]:
# --- Input locations ---------------------------------------------------------
baseline_datadir = '../../../datasets/ppmi/visit_feature_inputs_asof_2019Jan24_using_CMEDTM/'
baseline_filepath = baseline_datadir + 'PD_baseline.csv'
screening_filepath = baseline_datadir + 'PD_screening.csv'
questions_filepath = baseline_datadir + 'PD_questions_across_time.csv'
totals_filepath = baseline_datadir + 'PD_totals_across_time.csv'
other_filepath = baseline_datadir + 'PD_other_across_time.csv'
treatment_filepath = '../../../datasets/ppmi/treatment_pipeline_output_asof_2019Jan24/PD_treatment_between_visits.csv'

# --- Per-patient tables ------------------------------------------------------
baseline_df = pd.read_csv(baseline_filepath)
screening_df = pd.read_csv(screening_filepath)
questions_df = pd.read_csv(questions_filepath)

# --- Assessment totals -------------------------------------------------------
totals_df = pd.read_csv(totals_filepath)
# STAI is the sum of the state and trait anxiety totals. Plain addition
# propagates NaN, so STAI is missing whenever either component is missing
# (unlike DataFrame.sum(axis=1), which would silently skip the NaN).
totals_df['STAI'] = totals_df['STATE_ANXIETY'] + totals_df['TRAIT_ANXIETY']
# The `sc_` / `bl_` prefixes select the rows with EVENT_ID_DUR == 0 and
# EVENT_ID_DUR == 0.125 respectively (presumably the screening and baseline
# visits -- confirm against the pipeline that writes EVENT_ID_DUR).
sc_totals_df = totals_df.loc[totals_df['EVENT_ID_DUR'] == 0]
bl_totals_df = totals_df.loc[totals_df['EVENT_ID_DUR'] == 0.125]

# --- Other measurements ------------------------------------------------------
other_df = pd.read_csv(other_filepath)
# BMI = weight (kg) / height (m)^2. Units are inferred from the column names
# WGTKG / HTCM, so the height is converted from centimetres to metres before
# squaring. The previous expression divided by the height only once, which
# inflated every value by a factor of HTCM. NaN in either input yields NaN.
other_df['BMI'] = other_df['WGTKG'] / (other_df['HTCM'] / 100.) ** 2
sc_other_df = other_df.loc[other_df['EVENT_ID_DUR'] == 0]
bl_other_df = other_df.loc[other_df['EVENT_ID_DUR'] == 0.125]

# --- Treatments --------------------------------------------------------------
# Only the rows labelled with the 'SC' event are kept.
treatment_df = pd.read_csv(treatment_filepath)
treatment_df = treatment_df.loc[treatment_df['EVENT_ID'] == 'SC']

In [None]:
# Maps each derived subtotal column to the individual question columns that
# are summed to produce it.
subtotal_maps = {
    'NUPDRS3_TREMOR': [
        'NP3RTALL', 'NP3RTALU', 'NP3KTRML', 'NP3PTRML', 'NP3KTRMR', 'NP3PTRMR',
        'NP3RTARU', 'NP3RTALJ', 'NP3RTARL', 'NP2TRMR', 'NP3RTCON',
    ],
    'NUPDRS3_RIGID_LEFT': [
        'NP3RIGLU', 'NP3RIGLL', 'NP3PRSPL', 'NP3FTAPL', 'NP3HMOVL', 'NP3LGAGL',
        'NP3TTAPL',
    ],
    'NUPDRS3_RIGID_RIGHT': [
        'NP3RIGRL', 'NP3RIGRU', 'NP3PRSPR', 'NP3FTAPR', 'NP3HMOVR', 'NP3LGAGR',
        'NP3TTAPR',
    ],
    'NUPDRS3_FACE': ['NP3SPCH', 'NP3RIGN', 'NP3BRADY', 'NP3FACXP'],
    'NUPDRS3_GAIT': ['NP3FRZGT', 'NP3PSTBL', 'NP3RISNG', 'NP3GAIT', 'NP3POSTR'],
    'NUPDRS2_DAILYACT': [
        'NP2HWRT', 'NP2FREZ', 'NP2HYGN', 'NP2EAT', 'NP2HOBB', 'NP2WALK',
        'NP2DRES', 'NP2RISE', 'NP2TURN', 'NP2SWAL', 'NP2SALV', 'NP2SPCH',
    ],
}

# Add one summed column per subtotal. Components whose name starts with 'NP3'
# are read from their '<name>_untreated' column; all other components are read
# from the column as named. DataFrame.sum(axis=1) skips NaN, so a row with
# every component missing gets a subtotal of 0.
for subtotal_name, component_names in subtotal_maps.items():
    source_columns = [
        name + '_untreated' if name.startswith('NP3') else name
        for name in component_names
    ]
    questions_df[subtotal_name] = questions_df[source_columns].sum(axis=1)

# Split by visit only after the subtotal columns exist, so both slices carry them.
sc_questions_df = questions_df.loc[questions_df['EVENT_ID_DUR'] == 0]
bl_questions_df = questions_df.loc[questions_df['EVENT_ID_DUR'] == 0.125]


In [None]:
# Feature column names, grouped by clinical domain and by the input table each
# one is read from (the suffix: totals / questions / other / screening /
# baseline).

# Cognitive
cognitive_totals = [
    'MOCA',
    'HVLT_discrim_recog',
    'HVLT_immed_recall',
    'HVLT_retent',
    'BJLO',
    'LNS',
    'SEMANTIC_FLUENCY',
]
cognitive_questions = ['NP1COG']

# Autonomic
autonomic_totals = ['SCOPA-AUT']
autonomic_other = ['DIASUP', 'DIASTND', 'SYSSTND', 'SYSSUP', 'BMI']
autonomic_questions = ['NP1URIN', 'NP1LTHD', 'NP1FATG', 'NP1PAIN', 'NP1CNST']
autonomic_screening = ['MedHist_1i. gastrointestinal']

# Standard covariates
standard_other = ['GENETIC_RISK_SCORE', 'AGE']
standard_baseline = ['UPSIT', 'MALE']
standard_screening = ['DIS_DUR_BY_CONSENTDT']

# Psychiatric
psychiatric_baseline = ['RAWHITE']
psychiatric_totals = ['STAI', 'QUIP', 'GDSSHORT']
psychiatric_screening = ['MedHist_1q. psychiatric', 'PhysExam_Psychiatric']
psychiatric_questions = ['NP1HALL', 'NP1DPRS', 'NP1ANXS', 'NP1APAT']

# Sleep
sleep_totals = ['EPWORTH', 'REMSLEEP']
sleep_questions = ['NP1SLPN', 'NP1SLPD']

# Treatment indicator columns
treatments = ['DIGESTIVE AID', 'SLEEP AID', 'ANTIDEPRESSANT', 'ANXIOLYTIC']

In [None]:
# Concatenate the per-domain lists into one list per source table.
all_totals = cognitive_totals + autonomic_totals + psychiatric_totals + sleep_totals
# The derived subtotal columns live in the questions table, so their names are
# included here. list(subtotal_maps) is used instead of subtotal_maps.keys():
# on Python 3 keys() returns a view object that cannot be concatenated to a
# list (TypeError); list(...) works on both Python 2 and Python 3.
all_questions = (cognitive_questions + autonomic_questions + psychiatric_questions
                 + list(subtotal_maps) + sleep_questions)
all_screening = autonomic_screening + standard_screening + psychiatric_screening
all_other = autonomic_other + standard_other
all_baseline = standard_baseline + psychiatric_baseline

In [None]:
# Print the column names of both visit slices of the totals table.
for totals_slice in (sc_totals_df, bl_totals_df):
    print(totals_slice.columns.values)

In [None]:
# Build one row per patient (PATNO) holding every selected feature.
# Every merge is keyed explicitly on PATNO and validated as one-to-one, so a
# duplicated patient in any input raises instead of silently multiplying rows.
selected_df = baseline_df[['PATNO'] + all_baseline].merge(
    screening_df[['PATNO'] + all_screening], on=['PATNO'], how='outer', validate='one_to_one')

# For each longitudinal table, merge the `sc_` slice and then the `bl_` slice.
# The second merge of a pair collides on the feature names, so pandas tags the
# existing columns with '_sc' and the incoming ones with '_bl'.
selected_df = selected_df.merge(sc_totals_df[['PATNO'] + all_totals], on=['PATNO'], how='outer',
                                validate='one_to_one')
selected_df = selected_df.merge(bl_totals_df[['PATNO'] + all_totals], on=['PATNO'], how='outer',
                                suffixes=['_sc', '_bl'], validate='one_to_one')
selected_df = selected_df.merge(sc_questions_df[['PATNO'] + all_questions], on=['PATNO'], how='outer',
                                validate='one_to_one')
selected_df = selected_df.merge(bl_questions_df[['PATNO'] + all_questions], on=['PATNO'], how='outer',
                                suffixes=['_sc', '_bl'], validate='one_to_one')
selected_df = selected_df.merge(sc_other_df[['PATNO'] + all_other], on=['PATNO'], how='outer',
                                validate='one_to_one')
selected_df = selected_df.merge(bl_other_df[['PATNO'] + all_other], on=['PATNO'], how='outer',
                                suffixes=['_sc', '_bl'], validate='one_to_one')

# Collapse each '<feature>_sc' / '<feature>_bl' pair into a single '<feature>'
# column: keep the '_sc' value and fall back to the '_bl' value only where the
# '_sc' value is missing. Filling in place keeps the original column position.
paired_features = all_totals + all_questions + all_other
for feature in paired_features:
    selected_df[feature + '_sc'] = selected_df[feature + '_sc'].fillna(selected_df[feature + '_bl'])
selected_df = selected_df.drop(columns=[feature + '_bl' for feature in paired_features])
selected_df = selected_df.rename(columns={feature + '_sc': feature for feature in paired_features})

# Attach the treatment indicators; patients without a treatment row get 0.
selected_df = selected_df.merge(treatment_df[['PATNO'] + treatments], on=['PATNO'], how='left',
                                validate='one_to_one')
# Assign the filled result back: selected_df[treatments] is a copy, so calling
# fillna(..., inplace=True) on it leaves the NaNs in selected_df untouched.
selected_df[treatments] = selected_df[treatments].fillna(0)

In [None]:
# Preview the first rows; the column-display limit is lifted only while
# rendering so that no columns of the wide frame are elided.
preview_rows = selected_df.head()
with pd.option_context('display.max_columns', None):
    display(preview_rows)

In [None]:
# Number of rows in the assembled table.
print(selected_df.shape[0])

In [None]:
# Write the assembled feature table to the working directory, without the
# DataFrame index.
selected_df.to_csv(path_or_buf='survival_baseline_data_diff_feats.csv', index=False)