In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

In [None]:
# Input directories, relative to this notebook. All three names carry the same
# "asof_2019Jan24" suffix.
# Source of the <cohort>_totals_across_time.csv, <cohort>_other_across_time.csv
# and <cohort>_baseline.csv tables read in the cells below.
datadir = '../../../datasets/ppmi/visit_feature_inputs_asof_2019Jan24_using_CMEDTM/'
# Source of the PD/HC "treatment_between_visits" tables.
treatment_dir = '../../../datasets/ppmi/treatment_pipeline_output_asof_2019Jan24/'
# Source of MDS_UPDRS_Part_III.csv.
raw_datadir = '../../../datasets/ppmi/raw_data_asof_2019Jan24/'

### Treatment frequency table

In [None]:
# Per-cohort treatment tables (PD and HC). The frequency-table cell below reads
# PATNO plus one column per treatment class from each of them.
pd_treatment_df = pd.read_csv(treatment_dir + 'PD_treatment_between_visits.csv')
hc_treatment_df = pd.read_csv(treatment_dir + 'HC_treatment_between_visits.csv')

In [None]:
# Inspect the available treatment-class column names.
pd_treatment_df.columns.values

In [None]:
# Coarse treatment groups in the order they are printed in the frequency table.
treatment_groupings_list = [
    'Dopamine replacement', 'Muscle', 'Pain', 'Urinary', 'Anxiety', 'Depression',
    'Other psychiatric', 'Cognitive', 'Sleep', 'Digestive', 'Cardiovascular',
    'Anti-inflammatory', 'Immune', 'Respiratory', 'Thyroid', 'Supplement', 'Eye', 'Other',
]
# Coarse group -> treatment-class columns of the treatment tables that are
# summed to form it.
treatment_groupings = {
    'Dopamine replacement': ['DOPAMINE REPLACEMENT'],
    'Muscle': ['ANTICONVULSANT', 'ANTITREMOR', 'ANTISPASMODIC', 'MUSCLE RELAXER'],
    'Pain': ['ANESTHETIC', 'ANALGESIC'],
    'Urinary': ['BLADDER CONTROL'],
    'Anxiety': ['ANXIOLYTIC'],
    'Depression': ['ANTIDEPRESSANT'],
    'Other psychiatric': ['ANTIPSYCHOTIC', 'MOOD STABILIZER'],
    'Cognitive': ['COGNITIVE ENHANCER'],
    'Sleep': ['SLEEP AID'],
    'Digestive': ['ANTIEMETIC', 'ANTIDIARRHEAL', 'DIGESTIVE AID', 'ANTACID'],
    'Cardiovascular': ['ANTIHYPERTENSIVE', 'ANTIARRHYTHMIC', 'ANTIHYPOTENSIVE',
                       'ANTICOAGULANT/BLOOD THINNER'],
    'Anti-inflammatory': ['ANTIINFLAMMATORY', 'NSAID'],
    'Immune': ['IMMUNOSUPPRESSANT', 'ANTIHISTAMINE', 'ANTIVIRAL', 'ANTIBIOTIC', 'ANTIFUNGAL',
               'ANTICANCER', 'VACCINE', 'ADRENALCORTICAL REPLACEMENT'],
    'Respiratory': ['DECONGESTANT', 'MUCOLYTIC', 'BRONCHODILATOR'],
    'Thyroid': ['THYROID', 'ANTITHYROID AGENT', 'THYROID HORMONE'],
    'Supplement': ['SUPPLEMENT', 'PD SUPPLEMENT', 'BONE/JOINT HEALTH'],
    'Eye': ['OPTHALAMIC', 'ANTIGLAUCOMA'],
    'Other': ['CONTRACEPTIVE', 'ANTI BPH', 'NEUROTOXIN', 'DERMATOLOGIC', 'STIMULANT',
              'HORMONE REPLACEMENT', 'URIC ACID REDUCER', 'OTHER'],
}
# The ordered list and the mapping must name exactly the same groups.
assert set(treatment_groupings_list) == set(treatment_groupings.keys())

# Denominators: number of distinct participants in each treatment table.
pd_num_patnos = float(pd_treatment_df.PATNO.nunique())
hc_num_patnos = float(hc_treatment_df.PATNO.nunique())
for grouping in treatment_groupings_list:
    member_cols = treatment_groupings[grouping]
    group_fractions = []
    for cohort_df, cohort_total in ((pd_treatment_df, pd_num_patnos),
                                    (hc_treatment_df, hc_num_patnos)):
        # Adds the group total as a new column on the treatment table itself.
        cohort_df[grouping] = cohort_df[member_cols].sum(axis=1)
        # Participants with a positive group total in at least one row.
        treated_patnos = cohort_df.loc[cohort_df[grouping] > 0, 'PATNO'].nunique()
        group_fractions.append(treated_patnos / cohort_total)
    print('{0}: PD: {1:.3f}, HC: {2:.3f}'.format(grouping, group_fractions[0], group_fractions[1]))

### MDS-UPDRS treatment initiation + venn diagram

In [None]:
# PD cohort per-visit totals; used below for the treated MDS-UPDRS III columns
# and for the list of PD participant ids.
pd_totals_df = pd.read_csv(datadir + 'PD_totals_across_time.csv')

In [None]:
# Visits with at least one of the three treated-state MDS-UPDRS III totals
# recorded, ordered by time since enrollment (EVENT_ID_DUR).
treated_score_cols = ['NUPDRS3_on', 'NUPDRS3_off', 'NUPDRS3_maob']
nupdrs3_treated_df = (
    pd_totals_df[['PATNO', 'EVENT_ID_DUR'] + treated_score_cols]
    .dropna(subset=treated_score_cols, how='all')
    .sort_values(by=['EVENT_ID_DUR'])
)
# Earliest such visit per participant.
nupdrs3_first_treated_df = nupdrs3_treated_df.drop_duplicates(subset=['PATNO'], keep='first')
first_treated_dur = nupdrs3_first_treated_df['EVENT_ID_DUR']
print(first_treated_dur.mean())
print(first_treated_dur.std())
# Participants whose first treated visit is at EVENT_ID_DUR <= 1 vs. > 1.
print(int((first_treated_dur <= 1).sum()))
print(int((first_treated_dur > 1).sum()))
first_treated_dur.plot.hist(bins=20)

In [None]:
# Raw MDS-UPDRS Part III rows restricted to participants present in the PD
# totals table, keeping the first row per (participant, visit).
raw_mdsupdrs3_df = (
    pd.read_csv(raw_datadir + 'MDS_UPDRS_Part_III.csv')
    .loc[lambda visits_df: visits_df['PATNO'].isin(set(pd_totals_df.PATNO.unique().tolist()))]
    .drop_duplicates(subset=['PATNO', 'EVENT_ID'])
)
raw_mdsupdrs3_df.PD_MED_USE.value_counts()

In [None]:
from matplotlib_venn import venn3
# Medication-combination codes (presumably the PD_MED_USE values counted in the
# previous cell - TODO confirm):
# 1: levodopa, 2: dopamine agonist, 3: MAO-B, 4: 1 + 3, 5: 1 + 2, 6: 2 + 3, 7: 1 + 2 + 3
# venn3 expects its seven region sizes in the order
# (Abc, aBc, ABc, abC, AbC, aBC, ABC)
# with A = Levodopa, B = Dopamine agonist, C = MAO-B inhibitors.
# NOTE(review): these counts are hard-coded; verify they are listed in venn3
# region order rather than in code order 1..7, and that they still match the
# current data.
venn_region_counts = (1169, 449, 326, 446, 367, 264, 248)
venn_set_labels = ('Levodopa', 'Dopamine agonist', 'MAO-B inhibitors')
venn3(subsets=venn_region_counts, set_labels=venn_set_labels)
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('mdsupdrs3_treatment_venn.pdf')
plt.show()

### Baseline and year 3 stats

In [None]:
# Screening (falling back to baseline) and year-3 summaries of GDS, QUIP and
# STAI for every cohort.
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
mood_cols = ['GDSSHORT', 'QUIP', 'STAI']
for cohort in cohorts:
    cohort_totals_df = pd.read_csv(datadir + cohort + '_totals_across_time.csv')
    # Binarize QUIP (any positive score -> 1) while keeping missing scores
    # missing. A plain np.where(QUIP > 0, 1, 0) would turn NaN into 0, which
    # lowers the reported proportion and inflates the reported N.
    quip_raw = cohort_totals_df['QUIP']
    cohort_totals_df['QUIP'] = (quip_raw > 0).astype(float).where(quip_raw.notna())
    # STAI total = state + trait, defined only when both parts are present.
    cohort_totals_df['STAI'] \
        = cohort_totals_df[['STATE_ANXIETY', 'TRAIT_ANXIETY']].sum(axis=1, skipna=False)
    # EVENT_ID_DUR == 0 is used as the screening visit and 0.125 as baseline.
    sc_visit_df = cohort_totals_df.loc[cohort_totals_df['EVENT_ID_DUR'] == 0, ['PATNO'] + mood_cols]
    bl_visit_df = cohort_totals_df.loc[cohort_totals_df['EVENT_ID_DUR'] == 0.125, ['PATNO'] + mood_cols]
    merged_df = sc_visit_df.merge(bl_visit_df, on=['PATNO'], suffixes=('_sc', '_bl'), how='outer',
                                  validate='one_to_one')
    # Prefer the screening value; use the baseline value only where screening
    # is missing. Built as a fresh frame instead of renaming/updating slices
    # in place, which raises SettingWithCopyWarning.
    sc_df = merged_df[['PATNO']].copy()
    for col in mood_cols:
        sc_df[col] = merged_df[col + '_sc'].fillna(merged_df[col + '_bl'])
    yr3_df = cohort_totals_df.loc[cohort_totals_df['EVENT_ID_DUR'] == 3.125]
    print(cohort)
    # Output layout: <start mean>, <year-3 mean>, <start N>, <year-3 N>
    print('GDS: {0:.4f}, {1:.4f}, {2}, {3}'.format(
        np.nanmean(sc_df['GDSSHORT'].values), np.nanmean(yr3_df['GDSSHORT'].values),
        sc_df['GDSSHORT'].count(), yr3_df['GDSSHORT'].count()))
    print('QUIP: {0:.4f}, {1:.4f}, {2}, {3}'.format(
        np.nanmean(sc_df['QUIP'].values), np.nanmean(yr3_df['QUIP'].values),
        sc_df['QUIP'].count(), yr3_df['QUIP'].count()))
    # STAI additionally reports the (population) standard deviation.
    print('STAI: {0:.4f} ({1:.4f}), {2:.4f} ({3:.4f}), {4}, {5}'.format(
        np.nanmean(sc_df['STAI'].values), np.nanstd(sc_df['STAI'].values),
        np.nanmean(yr3_df['STAI'].values), np.nanstd(yr3_df['STAI'].values),
        sc_df['STAI'].count(), yr3_df['STAI'].count()))

In [None]:
# PD cohort "other" per-visit table; list its columns for reference.
pd_other_df = pd.read_csv(datadir + 'PD_other_across_time.csv')
pd_other_df.columns.values

In [None]:
# Distribution of COGSTATE codes; the cognitive-stats cell treats COGSTATE > 1
# as MCI/dementia.
pd_other_df.COGSTATE.value_counts()

In [None]:
# Screening (falling back to baseline) cognitive summaries for every cohort:
# mean (population SD), N for each measure.
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
cog_cols = ['HVLT_retent', 'HVLT_discrim_recog', 'HVLT_immed_recall', 'LNS', 'BJLO', 'SEMANTIC_FLUENCY']
other_cols = ['MCI_dementia', 'DVT_SDM']
for cohort in cohorts:
    cohort_totals_df = pd.read_csv(datadir + cohort + '_totals_across_time.csv')
    cohort_other_df = pd.read_csv(datadir + cohort + '_other_across_time.csv')
    # COGSTATE > 1 -> 1, otherwise 0, but keep missing COGSTATE missing.
    # np.where(COGSTATE > 1, 1, 0) would count unassessed visits as 0 and
    # include them in N.
    cogstate = cohort_other_df['COGSTATE']
    cohort_other_df['MCI_dementia'] = (cogstate > 1).astype(float).where(cogstate.notna())

    # For each source table, take the screening value (EVENT_ID_DUR == 0) and
    # fall back to the baseline value (EVENT_ID_DUR == 0.125) where screening
    # is missing. Built as fresh frames instead of in-place rename/update on
    # slices, which raises SettingWithCopyWarning.
    start_frames = []
    for visits_df, value_cols in ((cohort_totals_df, cog_cols), (cohort_other_df, other_cols)):
        sc_visit_df = visits_df.loc[visits_df['EVENT_ID_DUR'] == 0, ['PATNO'] + value_cols]
        bl_visit_df = visits_df.loc[visits_df['EVENT_ID_DUR'] == 0.125, ['PATNO'] + value_cols]
        merged_df = sc_visit_df.merge(bl_visit_df, on=['PATNO'], suffixes=('_sc', '_bl'), how='outer',
                                      validate='one_to_one')
        start_df = merged_df[['PATNO']].copy()
        for col in value_cols:
            start_df[col] = merged_df[col + '_sc'].fillna(merged_df[col + '_bl'])
        start_frames.append(start_df)
    sc_totals_df, sc_other_df = start_frames

    print(cohort)
    for col in cog_cols:
        print('{0}: {1:.4f} ({2:.4f}), {3}'.format(
            col, np.nanmean(sc_totals_df[col].values), np.nanstd(sc_totals_df[col].values),
            sc_totals_df[col].count()))
    # Both lines use the same "mean (sd), N" layout; the DVT_SDM line
    # previously lacked the ", " before N.
    for col in other_cols:
        print('{0}: {1:.4f} ({2:.4f}), {3}'.format(
            col, np.nanmean(sc_other_df[col].values), np.nanstd(sc_other_df[col].values),
            sc_other_df[col].count()))

In [None]:
# UPSIT per cohort from the baseline table: mean (population SD), N non-missing.
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
for cohort in cohorts:
    cohort_baseline_df = pd.read_csv(datadir + cohort + '_baseline.csv')
    upsit_scores = cohort_baseline_df['UPSIT']
    upsit_summary = '{0}: {1:.4f} ({2:.4f}), {3}'.format(
        cohort,
        np.nanmean(upsit_scores.values),
        np.nanstd(upsit_scores.values),
        int(upsit_scores.notna().sum()),
    )
    print(upsit_summary)

In [None]:
# Re-read the PD totals table and show the value distributions of EPWORTH and
# REMSLEEP before summarising them in the next cell.
pd_totals_df = pd.read_csv(datadir + 'PD_totals_across_time.csv')
print(pd_totals_df.EPWORTH.value_counts())
print(pd_totals_df.REMSLEEP.value_counts())

In [None]:
# Screening (falling back to baseline) and year-3 summaries of SCOPA-AUT,
# EPWORTH and REMSLEEP for every cohort.
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
sleep_cols = ['SCOPA-AUT', 'REMSLEEP', 'EPWORTH']
for cohort in cohorts:
    cohort_totals_df = pd.read_csv(datadir + cohort + '_totals_across_time.csv')
    # EVENT_ID_DUR == 0 is used as the screening visit and 0.125 as baseline.
    sc_visit_df = cohort_totals_df.loc[cohort_totals_df['EVENT_ID_DUR'] == 0, ['PATNO'] + sleep_cols]
    bl_visit_df = cohort_totals_df.loc[cohort_totals_df['EVENT_ID_DUR'] == 0.125, ['PATNO'] + sleep_cols]
    merged_df = sc_visit_df.merge(bl_visit_df, on=['PATNO'], suffixes=('_sc', '_bl'), how='outer',
                                  validate='one_to_one')
    # Prefer the screening value; use baseline only where screening is missing.
    # Built as a fresh frame instead of in-place rename/update on slices,
    # which raises SettingWithCopyWarning.
    sc_df = merged_df[['PATNO']].copy()
    for col in sleep_cols:
        sc_df[col] = merged_df[col + '_sc'].fillna(merged_df[col + '_bl'])
    yr3_df = cohort_totals_df.loc[cohort_totals_df['EVENT_ID_DUR'] == 3.125]
    print(cohort)
    # SCOPA-AUT: mean (population SD) at start and year 3, then the two Ns.
    print('SCOPA-AUT: {0:.4f} ({1:.4f}), {2:.4f} ({3:.4f}), {4}, {5}'.format(
        np.nanmean(sc_df['SCOPA-AUT'].values), np.nanstd(sc_df['SCOPA-AUT'].values),
        np.nanmean(yr3_df['SCOPA-AUT'].values), np.nanstd(yr3_df['SCOPA-AUT'].values),
        sc_df['SCOPA-AUT'].count(), yr3_df['SCOPA-AUT'].count()))
    # EPWORTH / REMSLEEP: <start mean>, <year-3 mean>, <start N>, <year-3 N>.
    # Each N is taken from the measure's own column; the REMSLEEP line
    # previously reported the N of SCOPA-AUT and GDSSHORT.
    for col in ('EPWORTH', 'REMSLEEP'):
        print('{0}: {1:.4f}, {2:.4f}, {3}, {4}'.format(
            col, np.nanmean(sc_df[col].values), np.nanmean(yr3_df[col].values),
            sc_df[col].count(), yr3_df[col].count()))

In [None]:
# PD baseline table; list its columns for reference.
pd_baseline_df = pd.read_csv(datadir + 'PD_baseline.csv')
pd_baseline_df.columns.values

In [None]:
# TODO: baseline features (table 1.1)
# Per-cohort demographics from the baseline table: proportions for sex, race
# and family history, plus mean (population SD) and N of DIS_DUR_BY_CONSENTDT.
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
family_history_cols = ['BIOMOMPD', 'BIODADPD', 'PATAUPD', 'KIDSPD']
proportion_rows = (('Male', 'MALE'), ('White', 'RAWHITE'), ('Fam hist', 'FAMHIST'))
for cohort in cohorts:
    cohort_baseline_df = pd.read_csv(datadir + cohort + '_baseline.csv')
    # Family history is positive when any of the four relative flags sums > 0.
    # NOTE(review): a row where all four flags are missing sums to 0 and is
    # therefore counted as "no family history" - confirm this is intended.
    relatives_with_pd = cohort_baseline_df[family_history_cols].sum(axis=1)
    cohort_baseline_df['FAMHIST'] = (relatives_with_pd > 0).astype(int)
    print(cohort)
    for row_label, source_col in proportion_rows:
        print('{0}: {1:.3f}'.format(row_label, np.nanmean(cohort_baseline_df[source_col].values)))
    disease_duration = cohort_baseline_df['DIS_DUR_BY_CONSENTDT']
    print('Time since diag: {0:.3f} ({1:.3f}), {2}'.format(
        np.nanmean(disease_duration.values),
        np.nanstd(disease_duration.values),
        len(disease_duration.dropna())))

In [None]:
# Age at screening (falling back to baseline) per cohort: mean (population SD).
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
for cohort in cohorts:
    cohort_other_df = pd.read_csv(datadir + cohort + '_other_across_time.csv')
    # EVENT_ID_DUR == 0 is used as the screening visit and 0.125 as baseline.
    sc_visit_df = cohort_other_df.loc[cohort_other_df['EVENT_ID_DUR'] == 0, ['PATNO', 'AGE']]
    bl_visit_df = cohort_other_df.loc[cohort_other_df['EVENT_ID_DUR'] == 0.125, ['PATNO', 'AGE']]
    merged_df = sc_visit_df.merge(bl_visit_df, on=['PATNO'], suffixes=('_sc', '_bl'), how='outer',
                                  validate='one_to_one')
    # Prefer the screening age; use the baseline age only where screening is
    # missing. Built as a fresh frame instead of in-place rename/update on
    # slices, which raises SettingWithCopyWarning.
    sc_df = merged_df[['PATNO']].copy()
    sc_df['AGE'] = merged_df['AGE_sc'].fillna(merged_df['AGE_bl'])
    print(cohort)
    print('AGE: {0:.4f} ({1:.4f})'.format(np.nanmean(sc_df['AGE'].values),
                                          np.nanstd(sc_df['AGE'].values)))

In [None]:
# Number of visits with a recorded NUPDRS2 per participant, summarised per
# cohort as mean (population SD).
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
print('Num visits')
for cohort in cohorts:
    cohort_totals_df = pd.read_csv(datadir + cohort + '_totals_across_time.csv')
    cohort_totals_df = cohort_totals_df.dropna(subset=['NUPDRS2'])
    # One pass over the table instead of one boolean scan per participant.
    # Participants with no NUPDRS2 rows are absent from the table after the
    # dropna above, so they do not contribute a zero.
    num_visits = cohort_totals_df['PATNO'].value_counts().values
    print(cohort + ': {0:.4f}'.format(np.nanmean(num_visits)) \
          + ' ({0:.4f})'.format(np.nanstd(num_visits)))

In [None]:
# EVENT_ID_DUR of each participant's last row after sorting by EVENT_ID_DUR,
# summarised per cohort as mean (population SD).
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
print('Enroll time')
for cohort in cohorts:
    cohort_totals_df = (
        pd.read_csv(datadir + cohort + '_totals_across_time.csv')
        .sort_values(by=['EVENT_ID_DUR'])
        .drop_duplicates(subset=['PATNO'], keep='last')
    )
    last_visit_dur = cohort_totals_df.EVENT_ID_DUR.values
    print('{0}: {1:.4f} ({2:.4f})'.format(cohort, np.nanmean(last_visit_dur), np.nanstd(last_visit_dur)))

In [None]:
# Collect every TD_PIGD* column of the PD "other" table, in file order.
pd_other_df = pd.read_csv(datadir + 'PD_other_across_time.csv')
td_pigd_cols = [col for col in pd_other_df.columns if col.startswith('TD_PIGD')]
td_pigd_cols

In [None]:
# TD / PIGD proportions in the PD cohort at EVENT_ID_DUR == 0 and == 3.125.
sc_other_df = pd_other_df[pd_other_df['EVENT_ID_DUR'] == 0]
yr3_other_df = pd_other_df[pd_other_df['EVENT_ID_DUR'] == 3.125]

# At EVENT_ID_DUR == 0 only the "untreated" classification columns are used;
# the denominator is the number of rows classified as tremor, posture or indet.
sc_td_count = int((sc_other_df['TD_PIGD_untreated:tremor'] == 1).sum())
sc_pigd_count = int((sc_other_df['TD_PIGD_untreated:posture'] == 1).sum())
sc_indet_count = int((sc_other_df['TD_PIGD_untreated:indet'] == 1).sum())
sc_total_count = float(sc_td_count + sc_pigd_count + sc_indet_count)
print(sc_td_count/sc_total_count, sc_pigd_count/sc_total_count)

# At EVENT_ID_DUR == 3.125 a participant counts as tremor (posture) when any of
# the TD_PIGD* columns ending in "tremor" ("posture") is positive.
tremor_cols = [col for col in td_pigd_cols if col.endswith('tremor')]
posture_cols = [col for col in td_pigd_cols if col.endswith('posture')]
yr3_other_df = yr3_other_df.dropna(subset=td_pigd_cols, how='all')
any_tremor = np.nansum(yr3_other_df[tremor_cols].values, axis=1) > 0
any_posture = np.nansum(yr3_other_df[posture_cols].values, axis=1) > 0
yr3_other_df['TD_PIGD:tremor'] = any_tremor.astype(int)
yr3_other_df['TD_PIGD:posture'] = any_posture.astype(int)
yr3_other_df['TD_PIGD:both'] = np.logical_and(any_tremor, any_posture).astype(int)
print(yr3_other_df['TD_PIGD:tremor'].mean(), yr3_other_df['TD_PIGD:posture'].mean(),
      yr3_other_df['TD_PIGD:both'].mean())

In [None]:
from scipy.stats import pearsonr
# Pool every visit (all cohorts) that has both a state and a trait anxiety
# score, then report their Pearson correlation and a scatter plot.
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
state_anxiety = []
trait_anxiety = []
for cohort in cohorts:
    cohort_totals_df = (
        pd.read_csv(datadir + cohort + '_totals_across_time.csv')
        .dropna(subset=['STATE_ANXIETY', 'TRAIT_ANXIETY'])
    )
    state_anxiety.extend(cohort_totals_df['STATE_ANXIETY'].values.tolist())
    trait_anxiety.extend(cohort_totals_df['TRAIT_ANXIETY'].values.tolist())
print(pearsonr(state_anxiety, trait_anxiety))
plt.scatter(state_anxiety, trait_anxiety)
# Font size is raised after the scatter call, as in the original cell order.
plt.rcParams.update({'font.size':18})
plt.xlabel('State anxiety')
plt.ylabel('Trait anxiety')
plt.savefig('state_trait_anxiety_corr.pdf')
plt.show()

In [None]:
# Heatmap of how many visits (all cohorts pooled) fall on each
# (state anxiety, trait anxiety) score pair.
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
# Read each cohort once and keep only visits where both scores are present.
state_chunks = []
trait_chunks = []
for cohort in cohorts:
    cohort_totals_df = pd.read_csv(datadir + cohort + '_totals_across_time.csv')
    cohort_totals_df = cohort_totals_df.dropna(subset=['STATE_ANXIETY','TRAIT_ANXIETY'])
    # Builtin int instead of np.int: the alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where it raises AttributeError.
    state_chunks.append(cohort_totals_df.STATE_ANXIETY.values.astype(int))
    trait_chunks.append(cohort_totals_df.TRAIT_ANXIETY.values.astype(int))
state_arr = np.concatenate(state_chunks)
trait_arr = np.concatenate(trait_chunks)

# Axis bounds. Both axes are anchored at 0: the range always contains 0 and is
# only widened by the observed scores (same range as seeding min and max with
# 0). Bounds are plain ints so they can be used as array offsets and labels.
state_anxiety_min = min(0, int(state_arr.min())) if state_arr.size else 0
state_anxiety_max = max(0, int(state_arr.max())) if state_arr.size else 0
trait_anxiety_min = min(0, int(trait_arr.min())) if trait_arr.size else 0
trait_anxiety_max = max(0, int(trait_arr.max())) if trait_arr.size else 0

# Rows = state score, columns = trait score, cell = number of visits.
state_trait_arr = np.zeros((state_anxiety_max - state_anxiety_min + 1,
                            trait_anxiety_max - trait_anxiety_min + 1))
for state_val, trait_val in zip(state_arr, trait_arr):
    state_trait_arr[state_val - state_anxiety_min, trait_val - trait_anxiety_min] += 1

state_trait_arr_df = pd.DataFrame(
    state_trait_arr,
    index=pd.Index(np.arange(state_anxiety_min, state_anxiety_max + 1), name='index'),
    columns=np.arange(trait_anxiety_min, trait_anxiety_max + 1))
from seaborn import heatmap
plt.rcParams.update({'font.size':14})
heatmap(state_trait_arr_df)
plt.ylabel('State anxiety')
plt.xlabel('Trait anxiety')
plt.show()

### Prodromal diagnoses

In [None]:
# PRODROMA cohort "other" per-visit table; list its columns for reference.
prodromal_other_df = pd.read_csv(datadir + 'PRODROMA_other_across_time.csv')
prodromal_other_df.columns.values

In [None]:
# All indicator columns whose name starts with "PRODROMAL_DIAG:", in file order.
prodromal_diag_cols = [col for col in prodromal_other_df.columns if col.startswith('PRODROMAL_DIAG:')]
prodromal_diag_cols

In [None]:
# For each diagnosis indicator, the participants with that indicator equal to 1
# on at least one row.
phenoconv_patnos, motor_patnos, nonmotor_patnos, no_neuro_patnos = [
    prodromal_other_df.loc[prodromal_other_df[diag_col] == 1].PATNO.unique()
    for diag_col in ('PRODROMAL_DIAG:PHENOCONV',
                     'PRODROMAL_DIAG:MOTOR_PRODROMA',
                     'PRODROMAL_DIAG:NONMOTOR_PRODROMA',
                     'PRODROMAL_DIAG:NO_NEURO')
]

In [None]:
# Print the participant ids of each diagnosis group, one array per line.
for group_patnos in (phenoconv_patnos, motor_patnos, nonmotor_patnos, no_neuro_patnos):
    print(group_patnos)

In [None]:
def display_patno_prodromal_diags(patno1):
    """Display one participant's prodromal diagnosis label at each visit.

    Reads the notebook-level ``prodromal_other_df`` and ``prodromal_diag_cols``.
    Only rows where at least one diagnosis indicator is positive are kept, and
    they are ordered by ``EVENT_ID_DUR``.

    When several indicators are set on the same row, the label is chosen with
    this precedence: NONMOTOR_PRODROMA, then MOTOR_PRODROMA, then PHENOCONV,
    then NO_NEURO; rows matching none of these four are labelled 'None'.

    Args:
        patno1: value to match against the PATNO column.
    """
    is_patient = prodromal_other_df['PATNO'] == patno1
    patno1_df = prodromal_other_df.loc[is_patient]
    has_any_diag = np.nansum(patno1_df[prodromal_diag_cols], axis=1) > 0
    patno1_df = patno1_df.loc[has_any_diag].sort_values(by=['EVENT_ID_DUR'])
    # np.select takes the first matching condition, so the list is written in
    # precedence order (highest first).
    label_by_precedence = [
        ('PRODROMAL_DIAG:NONMOTOR_PRODROMA', 'NONMOTOR_PRODROMA'),
        ('PRODROMAL_DIAG:MOTOR_PRODROMA', 'MOTOR_PRODROMA'),
        ('PRODROMAL_DIAG:PHENOCONV', 'PHENOCONV'),
        ('PRODROMAL_DIAG:NO_NEURO', 'NO_NEURO'),
    ]
    conditions = [(patno1_df[flag_col] == 1).values for flag_col, _ in label_by_precedence]
    labels = [label for _, label in label_by_precedence]
    patno1_df['PRODROMAL_DIAG'] = np.select(conditions, labels, default='None')
    display(patno1_df[['EVENT_ID_DUR','PRODROMAL_DIAG']])

In [None]:
# Show the diagnosis history of up to five randomly chosen participants from
# each diagnosis group, interleaving the groups.
#
# A private RandomState with the same seed is used on copies of the id arrays.
# This draws the same permutations as np.random.seed(28037) followed by
# np.random.shuffle on freshly built arrays, but leaves the id arrays and the
# global RNG untouched, so re-running the cell shows the same participants.
example_rng = np.random.RandomState(28037)
shuffled_groups = []
# Shuffle order (phenoconv, nonmotor, motor, no_neuro) determines which random
# draws each group receives; keep it fixed.
for group_patnos in (phenoconv_patnos, nonmotor_patnos, motor_patnos, no_neuro_patnos):
    shuffled_patnos = np.array(group_patnos, copy=True)
    example_rng.shuffle(shuffled_patnos)
    shuffled_groups.append(shuffled_patnos)
for idx in range(5):
    for shuffled_patnos in shuffled_groups:
        # A group with fewer than five participants simply shows fewer examples.
        if idx >= len(shuffled_patnos):
            continue
        patno = shuffled_patnos[idx]
        print(patno)
        display_patno_prodromal_diags(patno)

In [None]:
# conversion patterns
# For every participant, build the time-ordered sequence of prodromal diagnosis
# labels with consecutive repeats collapsed (e.g. "NONMOTOR_PRODROMA, PHENOCONV")
# and count how many participants share each sequence.

# Label every row once, up front. Only rows with at least one positive
# diagnosis indicator are kept. When several indicators are set on a row the
# precedence is NONMOTOR_PRODROMA > MOTOR_PRODROMA > PHENOCONV > NO_NEURO;
# rows matching none of these four are labelled 'None'.
has_any_diag = np.nansum(prodromal_other_df[prodromal_diag_cols].values, axis=1) > 0
diag_rows_df = prodromal_other_df.loc[has_any_diag, ['PATNO', 'EVENT_ID_DUR']].copy()
diag_rows_df['PRODROMAL_DIAG'] = np.select(
    [(prodromal_other_df.loc[has_any_diag, 'PRODROMAL_DIAG:NONMOTOR_PRODROMA'] == 1).values,
     (prodromal_other_df.loc[has_any_diag, 'PRODROMAL_DIAG:MOTOR_PRODROMA'] == 1).values,
     (prodromal_other_df.loc[has_any_diag, 'PRODROMAL_DIAG:PHENOCONV'] == 1).values,
     (prodromal_other_df.loc[has_any_diag, 'PRODROMAL_DIAG:NO_NEURO'] == 1).values],
    ['NONMOTOR_PRODROMA', 'MOTOR_PRODROMA', 'PHENOCONV', 'NO_NEURO'],
    default='None')

diag_seq_counts = dict()
for patno in prodromal_other_df.PATNO.unique():
    patno_df = diag_rows_df.loc[diag_rows_df['PATNO']==patno].sort_values(by=['EVENT_ID_DUR'])
    diag_vals = patno_df.PRODROMAL_DIAG.values
    # A participant without any diagnosis row has no sequence; skip instead of
    # failing on diag_vals[0].
    if len(diag_vals) == 0:
        continue
    # Keep the first label and every label that differs from its predecessor.
    collapsed_diags = [diag_vals[0]]
    for prev_diag, diag_val in zip(diag_vals[:-1], diag_vals[1:]):
        if diag_val != prev_diag:
            collapsed_diags.append(diag_val)
    diag_seq = ', '.join(collapsed_diags)
    diag_seq_counts[diag_seq] = diag_seq_counts.get(diag_seq, 0) + 1
diag_seq_counts

In [None]:
# Participants whose sequence contains PHENOCONV at all, and those where
# PHENOCONV is followed by another label ("PHENOCONV, " appears in the sequence).
phenoconv_seq_counts = [(diag_seq, seq_count) for diag_seq, seq_count in diag_seq_counts.items()
                        if 'PHENOCONV' in diag_seq]
has_phenoconv = sum(seq_count for _, seq_count in phenoconv_seq_counts)
has_phenoconv_then_something_else = sum(seq_count for diag_seq, seq_count in phenoconv_seq_counts
                                        if 'PHENOCONV, ' in diag_seq)
has_phenoconv, has_phenoconv_then_something_else

In [None]:
# Participants whose diagnosis sequence contains NO_NEURO at any point.
has_no_neuro = sum(seq_count for diag_seq, seq_count in diag_seq_counts.items()
                   if 'NO_NEURO' in diag_seq)
has_no_neuro

In [None]:
# 30 only nonmotor
# 15 phenoconverted, 3 oscillated with something else
# 11 nonmotor to motor without oscillating
# 8 oscillate between nonmotor and motor
# 11 oscillate with no neuro

### Hoehn and Yahr

In [None]:
# Distribution of the NHY values in the raw MDS-UPDRS Part III table; the next
# cell tabulates NHY values 0-5 per cohort.
# raw_datadir is set again here to the same path as at the top of the notebook.
raw_datadir = '../../../datasets/ppmi/raw_data_asof_2019Jan24/'
mds3_df = pd.read_csv(raw_datadir + 'MDS_UPDRS_Part_III.csv')
mds3_df.NHY.value_counts()

In [None]:
# Distribution of NHY values 0-5 per cohort at screening (EVENT_ID 'SC') and at
# year 3 (EVENT_ID 'V12' or 'PV12'). Each line is
# "<cohort> at yr <k>: <N>, <freq NHY=0>, ..., <freq NHY=5>".
raw_datadir = '../../../datasets/ppmi/raw_data_asof_2019Jan24/'
mds3_df = pd.read_csv(raw_datadir + 'MDS_UPDRS_Part_III.csv').dropna(subset=['NHY'])
# NOTE(review): this rebinds the notebook-level `datadir` to a directory
# WITHOUT the "_using_CMEDTM" suffix used at the top of the notebook. Cells
# re-run after this one will read from the new location - confirm this is
# intended.
datadir = '../../../datasets/ppmi/visit_feature_inputs_asof_2019Jan24/'
cohorts = ['PD','HC','PRODROMA','GENPD','GENUN','REGPD','REGUN','SWEDD']
for cohort in cohorts:
    # Cohort membership comes from the cohort's totals table.
    cohort_patnos = pd.read_csv(datadir + cohort + '_totals_across_time.csv').PATNO.unique()
    cohort_mds3_df = mds3_df.loc[mds3_df['PATNO'].isin(cohort_patnos)]
    for year_label, event_ids in (('0', ['SC']), ('3', ['V12', 'PV12'])):
        # First matching row per participant for this time point.
        visit_df = cohort_mds3_df.loc[cohort_mds3_df['EVENT_ID'].isin(event_ids)]
        visit_df = visit_df.drop_duplicates(subset=['PATNO'])
        num_patnos = len(visit_df)
        if num_patnos == 0:
            # No data: frequencies are undefined. Year 0 still reports N = 0
            # (it previously failed with a division by zero); year 3 prints
            # nothing, as before.
            if year_label == '0':
                print(cohort + ' at yr 0: 0')
            continue
        output_str = cohort + ' at yr ' + year_label + ': ' + str(num_patnos)
        for idx in range(6):
            idx_freq = len(visit_df.loc[visit_df['NHY']==idx])/float(num_patnos)
            output_str += ', {0:.4f}'.format(idx_freq)
        print(output_str)