In [1]:
import numpy as np, pandas as pd

In [2]:
baseline_datadir = '../../../datasets/ppmi/visit_feature_inputs_asof_2019Jan24_using_CMEDTM/'
baseline_filepath = baseline_datadir + 'PD_baseline.csv'
screening_filepath = baseline_datadir + 'PD_screening.csv'
questions_filepath = baseline_datadir + 'PD_questions_across_time.csv'
totals_filepath = baseline_datadir + 'PD_totals_across_time.csv'
other_filepath = baseline_datadir + 'PD_other_across_time.csv'
baseline_df = pd.read_csv(baseline_filepath)
screening_df = pd.read_csv(screening_filepath)
questions_df = pd.read_csv(questions_filepath)
totals_df = pd.read_csv(totals_filepath)
totals_df['STAI'] = np.where(np.logical_and(~pd.isnull(totals_df['STATE_ANXIETY']), \
                                            ~pd.isnull(totals_df['TRAIT_ANXIETY'])), \
                             totals_df[['STATE_ANXIETY','TRAIT_ANXIETY']].sum(axis=1), float('NaN'))
sc_totals_df = totals_df.loc[totals_df['EVENT_ID_DUR']==0]
bl_totals_df = totals_df.loc[totals_df['EVENT_ID_DUR']==0.125]
other_df = pd.read_csv(other_filepath)
other_df['BMI'] = np.where(np.logical_and(~pd.isnull(other_df['WGTKG']), ~pd.isnull(other_df['HTCM'])), \
                           other_df['WGTKG']/other_df['HTCM']*10000., float('NaN'))
sc_other_df = other_df.loc[other_df['EVENT_ID_DUR']==0]
bl_other_df = other_df.loc[other_df['EVENT_ID_DUR']==0.125]
treatment_filepath = '../../../datasets/ppmi/treatment_pipeline_output_asof_2019Jan24/PD_treatment_between_visits.csv'
treatment_df = pd.read_csv(treatment_filepath)
treatment_df = treatment_df.loc[treatment_df['EVENT_ID']=='SC']

In [3]:
subtotal_maps = {'NUPDRS3_TREMOR': ['NP3RTALL', 'NP3RTALU', 'NP3KTRML', 'NP3PTRML', 'NP3KTRMR', 'NP3PTRMR', 'NP3RTARU', \
                                    'NP3RTALJ', 'NP3RTARL', 'NP2TRMR', 'NP3RTCON'], \
                 'NUPDRS3_RIGID_LEFT': ['NP3RIGLU', 'NP3RIGLL', 'NP3PRSPL', 'NP3FTAPL', 'NP3HMOVL', 'NP3LGAGL', \
                                        'NP3TTAPL'], \
                 'NUPDRS3_RIGID_RIGHT': ['NP3RIGRL', 'NP3RIGRU', 'NP3PRSPR', 'NP3FTAPR', 'NP3HMOVR', 'NP3LGAGR', \
                                         'NP3TTAPR'], \
                 'NUPDRS3_FACE': ['NP3SPCH', 'NP3RIGN', 'NP3BRADY', 'NP3FACXP'], \
                 'NUPDRS3_GAIT': ['NP3FRZGT', 'NP3PSTBL', 'NP3RISNG', 'NP3GAIT', 'NP3POSTR'], \
                 'NUPDRS2_DAILYACT': ['NP2HWRT', 'NP2FREZ', 'NP2HYGN', 'NP2EAT', 'NP2HOBB', 'NP2WALK', 'NP2DRES', \
                                      'NP2RISE', 'NP2TURN', 'NP2SWAL', 'NP2SALV', 'NP2SPCH']
                }
for subtotal in subtotal_maps:
    untreated_components = []
    for col in subtotal_maps[subtotal]:
        if col.startswith('NP3'):
            untreated_components.append(col + '_untreated')
        else:
            untreated_components.append(col)
    questions_df[subtotal] = questions_df[untreated_components].sum(axis=1)
sc_questions_df = questions_df.loc[questions_df['EVENT_ID_DUR']==0]
bl_questions_df = questions_df.loc[questions_df['EVENT_ID_DUR']==0.125]


In [4]:
cognitive_totals = ['MOCA', 'HVLT_discrim_recog', 'HVLT_immed_recall', 'HVLT_retent', 'BJLO', 'LNS', \
                    'SEMANTIC_FLUENCY']
cognitive_questions = ['NP1COG']
autonomic_totals = ['SCOPA-AUT']
autonomic_other = ['DIASUP', 'DIASTND', 'SYSSTND', 'SYSSUP', 'BMI']
autonomic_questions = ['NP1URIN', 'NP1LTHD', 'NP1FATG', 'NP1PAIN', 'NP1CNST']
autonomic_screening = ['MedHist_1i. gastrointestinal']
standard_other = ['GENETIC_RISK_SCORE', 'AGE']
standard_baseline = ['UPSIT', 'MALE']
standard_screening = ['DIS_DUR_BY_CONSENTDT']
psychiatric_baseline = ['RAWHITE']
psychiatric_totals = ['STAI','QUIP','GDSSHORT']
psychiatric_screening = ['MedHist_1q. psychiatric', 'PhysExam_Psychiatric']
psychiatric_questions = ['NP1HALL', 'NP1DPRS', 'NP1ANXS', 'NP1APAT']
sleep_totals = ['EPWORTH', 'REMSLEEP']
sleep_questions = ['NP1SLPN', 'NP1SLPD']
treatments = ['DIGESTIVE AID', 'SLEEP AID', 'ANTIDEPRESSANT', 'ANXIOLYTIC']

In [5]:
all_totals = cognitive_totals + autonomic_totals + psychiatric_totals + sleep_totals
all_questions = cognitive_questions + autonomic_questions + psychiatric_questions + subtotal_maps.keys() \
    + sleep_questions
all_screening = autonomic_screening + standard_screening + psychiatric_screening
all_other = autonomic_other + standard_other
all_baseline = standard_baseline + psychiatric_baseline

In [6]:
print(sc_totals_df.columns.values)
print(bl_totals_df.columns.values)

['PATNO' 'EVENT_ID' 'SCOPA-AUT' 'HVLT_discrim_recog' 'HVLT_immed_recall'
 'NUPDRS3_untreated' 'QUIP' 'NUPDRS3_on' 'EPWORTH' 'STATE_ANXIETY'
 'NUPDRS3_off' 'GDSSHORT' 'NUPDRS1' 'NUPDRS2' 'HVLT_retent' 'BJLO' 'MOCA'
 'LNS' 'TRAIT_ANXIETY' 'SEMANTIC_FLUENCY' 'NUPDRS3_maob' 'REMSLEEP'
 'NUPDRS4' 'INFODT' 'INFODT_DIS_DUR' 'INFODT_TIME_SINCE_ENROLL'
 'EVENT_ID_DUR' 'DIS_DUR_BY_CONSENTDT' 'STAI']
['PATNO' 'EVENT_ID' 'SCOPA-AUT' 'HVLT_discrim_recog' 'HVLT_immed_recall'
 'NUPDRS3_untreated' 'QUIP' 'NUPDRS3_on' 'EPWORTH' 'STATE_ANXIETY'
 'NUPDRS3_off' 'GDSSHORT' 'NUPDRS1' 'NUPDRS2' 'HVLT_retent' 'BJLO' 'MOCA'
 'LNS' 'TRAIT_ANXIETY' 'SEMANTIC_FLUENCY' 'NUPDRS3_maob' 'REMSLEEP'
 'NUPDRS4' 'INFODT' 'INFODT_DIS_DUR' 'INFODT_TIME_SINCE_ENROLL'
 'EVENT_ID_DUR' 'DIS_DUR_BY_CONSENTDT' 'STAI']


In [7]:
selected_df = baseline_df[['PATNO']+all_baseline].merge(screening_df[['PATNO']+all_screening], how='outer', \
                                                        validate='one_to_one')
selected_df = selected_df.merge(sc_totals_df[['PATNO']+all_totals], how='outer', validate='one_to_one')
selected_df = selected_df.merge(bl_totals_df[['PATNO']+all_totals], on=['PATNO'], how='outer', \
                                suffixes=['_sc', '_bl'], validate='one_to_one')
selected_df = selected_df.merge(sc_questions_df[['PATNO']+all_questions], on=['PATNO'], how='outer', \
                                validate='one_to_one')
selected_df = selected_df.merge(bl_questions_df[['PATNO']+all_questions], on=['PATNO'], how='outer', \
                                suffixes=['_sc', '_bl'], validate='one_to_one')
selected_df = selected_df.merge(sc_other_df[['PATNO']+all_other], on=['PATNO'], how='outer', validate='one_to_one')
selected_df = selected_df.merge(bl_other_df[['PATNO']+all_other], on=['PATNO'], how='outer', \
                                suffixes=['_sc', '_bl'], validate='one_to_one')
for total_col in all_totals + all_questions + all_other:
    bl_col_df = selected_df[[total_col + '_bl']]
    del selected_df[total_col + '_bl']
    selected_df.rename(columns={total_col + '_sc': total_col}, inplace=True)
    bl_col_df.rename(columns={total_col + '_bl': total_col}, inplace=True)
    selected_df.update(bl_col_df, overwrite=False)
selected_df = selected_df.merge(treatment_df[['PATNO'] + treatments], how='left', validate='one_to_one')
selected_df[treatments].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [8]:
with pd.option_context('display.max_columns', None):
    display(selected_df.head())

Unnamed: 0,PATNO,UPSIT,MALE,RAWHITE,MedHist_1i. gastrointestinal,DIS_DUR_BY_CONSENTDT,MedHist_1q. psychiatric,PhysExam_Psychiatric,MOCA,HVLT_discrim_recog,HVLT_immed_recall,HVLT_retent,BJLO,LNS,SEMANTIC_FLUENCY,SCOPA-AUT,STAI,QUIP,GDSSHORT,EPWORTH,REMSLEEP,NP1COG,NP1URIN,NP1LTHD,NP1FATG,NP1PAIN,NP1CNST,NP1HALL,NP1DPRS,NP1ANXS,NP1APAT,NUPDRS2_DAILYACT,NUPDRS3_GAIT,NUPDRS3_RIGID_RIGHT,NUPDRS3_FACE,NUPDRS3_TREMOR,NUPDRS3_RIGID_LEFT,NP1SLPN,NP1SLPD,DIASUP,DIASTND,SYSSTND,SYSSUP,BMI,GENETIC_RISK_SCORE,AGE,DIGESTIVE AID,SLEEP AID,ANTIDEPRESSANT,ANXIOLYTIC
0,3001,25.0,1.0,1.0,0.0,0.8378,1.0,0.0,29.0,10.0,27.0,1.2,15.0,16.0,42.0,12.0,51.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,8.0,5.0,5.0,0.0,0.0,1.0,90.0,90.0,136.0,152.0,4054.644809,-0.018314,65.085525,0,0,0,0
1,3002,17.0,0.0,1.0,1.0,1.075997,0.0,0.0,29.0,9.0,28.0,0.916667,13.0,12.0,62.0,22.0,69.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,6.0,2.0,5.0,2.0,7.0,8.0,0.0,1.0,91.0,84.0,152.0,146.0,4468.35443,-0.017087,67.582497,1,0,0,0
2,3003,23.0,0.0,1.0,1.0,1.998672,0.0,0.0,25.0,11.0,29.0,0.916667,13.0,12.0,47.0,16.0,51.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,3.0,12.0,5.0,4.0,3.0,3.0,1.0,70.0,60.0,90.0,102.0,4767.44186,,56.666461,0,0,0,0
3,3006,13.0,0.0,1.0,0.0,0.328549,0.0,0.0,27.0,10.0,24.0,0.5,9.0,15.0,66.0,10.0,49.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9.0,1.0,6.0,5.0,1.0,8.0,0.0,0.0,80.0,80.0,118.0,124.0,4162.650602,-0.025484,57.41391,1,0,1,1
4,3007,18.0,1.0,1.0,1.0,0.246412,1.0,0.0,29.0,10.0,26.0,0.8,9.0,10.0,41.0,12.0,60.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,5.0,4.0,7.0,5.0,1.0,2.0,90.0,88.0,138.0,150.0,4505.617978,,64.414738,0,0,0,0


In [9]:
print(len(selected_df))

423


In [10]:
selected_df.to_csv('survival_baseline_data_diff_feats.csv', index=False)