In [1]:
import numpy as np, pandas as pd, pickle
from lifelines.statistics import logrank_test

In [2]:
outcome_dir = '../ppmi_survival_models/survival_outcome_subtotals_using_CMEDTM/set_3.0_0.5/'
with open(outcome_dir + 'cohorts_time_event_dict.pkl', 'r') as f:
    outcome_df = pickle.load(f)['PD']
baseline_df = pd.read_csv('../gather_PD_data/selected_baseline_data_using_CMEDTM.csv')
del baseline_df['ENROLL_CAT']
longitudinal_df = pd.read_csv('../gather_PD_data/selected_longitudinal_data_using_CMEDTM.csv')
screening_longitudinal_df = longitudinal_df.loc[longitudinal_df['EVENT_ID_DUR']==0]
baseline_longitudinal_df = longitudinal_df.loc[longitudinal_df['EVENT_ID_DUR']==0.125]
screening_longitudinal_cols = ['NUPDRS1', 'MOCA', 'NUPDRS2_DAILYACT', 'NUPDRS3_GAIT', 'NUPDRS3_RIGID_RIGHT', \
                               'NUPDRS3_FACE', 'NUPDRS3_TREMOR', 'NUPDRS3_RIGID_LEFT']
baseline_longitudinal_cols = ['SCOPA-AUT', 'HVLT_discrim_recog', 'STAI', 'HVLT_immed_recall', 'QUIP', 'EPWORTH', \
                              'GDSSHORT', 'HVLT_retent', 'BJLO', 'LNS', 'SEMANTIC_FLUENCY', 'REMSLEEP']
baseline_df = baseline_df.merge(screening_longitudinal_df[['PATNO']+screening_longitudinal_cols], on=['PATNO'], \
                                validate='one_to_one')
baseline_df = baseline_df.merge(baseline_longitudinal_df[['PATNO']+baseline_longitudinal_cols], on=['PATNO'], \
                                validate='one_to_one')
baseline_df = baseline_df.dropna()
outcome_df = outcome_df.loc[outcome_df['PATNO'].isin(baseline_df.PATNO.unique())]

In [3]:
patnos = baseline_df.PATNO.values
np.random.seed(29033)
np.random.shuffle(patnos)
train_test_split_idx = int(0.8*len(patnos))
train_patnos = patnos[:train_test_split_idx]
baseline_df = baseline_df.loc[baseline_df['PATNO'].isin(set(train_patnos.tolist()))]
outcome_df = outcome_df.loc[outcome_df['PATNO'].isin(set(train_patnos.tolist()))]

In [4]:
baseline_df.columns.values

array(['PATNO', 'MALE', 'RAWHITE', 'FAMHIST', 'EDUCYRS', 'RIGHT_HANDED',
       'UPSIT', 'DIS_DUR_BY_CONSENTDT', 'Genetic PCA component 0',
       'Genetic PCA component 1', 'Genetic PCA component 2',
       'Genetic PCA component 3', 'Genetic PCA component 4',
       'Genetic PCA component 5', 'Genetic PCA component 6',
       'Genetic PCA component 7', 'Genetic PCA component 8',
       'Genetic PCA component 9', 'WGTKG', 'HTCM', 'DVT_SDM',
       'PTAU_ABETA_ratio', 'TTAU_ABETA_ratio', 'PTAU_TTAU_ratio',
       'PTAU_log', 'TTAU_log', 'ABETA_log', 'ASYNU_log', 'CSF Hemoglobin',
       'AGE', 'RIGHT_DOMSIDE', 'SYSSTND', 'SYSSUP', 'HRSTND', 'HRSUP',
       'DIASUP', 'DIASTND', 'TEMPC', 'TD_PIGD_untreated:tremor',
       'TD_PIGD_untreated:posture', 'ipsilateral_putamen',
       'ipsilateral_caudate', 'count_density_ratio_ipsilateral',
       'count_density_ratio_contralateral', 'contralateral_putamen',
       'contralateral_caudate', 'asymmetry_index_caudate',
       'asymmetry_index_p

In [5]:
len(baseline_df.columns)

68

In [6]:
outcome_df.columns.values

array(['PATNO', 'SCOPA-AUT_T', 'SCOPA-AUT_E', 'Autonomic_T',
       'Autonomic_E', 'EPWORTH_T', 'EPWORTH_E', 'REMSLEEP_T',
       'REMSLEEP_E', 'Sleep_T', 'Sleep_E', 'HVLT_discrim_recog_T',
       'HVLT_discrim_recog_E', 'HVLT_immed_recall_T',
       'HVLT_immed_recall_E', 'LNS_T', 'LNS_E', 'HVLT_retent_T',
       'HVLT_retent_E', 'BJLO_T', 'BJLO_E', 'MOCA_T', 'MOCA_E',
       'SEMANTIC_FLUENCY_T', 'SEMANTIC_FLUENCY_E', 'Cognitive_T',
       'Cognitive_E', 'QUIP_T', 'QUIP_E', 'STAI_T', 'STAI_E',
       'Psychiatric_T', 'Psychiatric_E', 'NUPDRS2_DAILYACT_T',
       'NUPDRS2_DAILYACT_E', 'NUPDRS3_GAIT_T', 'NUPDRS3_GAIT_E',
       'NUPDRS3_RIGID_RIGHT_T', 'NUPDRS3_RIGID_RIGHT_E', 'NUPDRS3_FACE_T',
       'NUPDRS3_FACE_E', 'NUPDRS3_TREMOR_T', 'NUPDRS3_TREMOR_E',
       'NUPDRS3_RIGID_LEFT_T', 'NUPDRS3_RIGID_LEFT_E', 'Motor_T',
       'Motor_E', 'hybrid_T', 'hybrid_E', 'hybrid_requiremotor_E',
       'hybrid_requiremotor_T'], dtype=object)

In [7]:
train_df = baseline_df.merge(outcome_df, validate='one_to_one')

In [8]:
def get_stratifying_feats(pval_thresh=0.05):
    outcomes = ['hybrid_requiremotor', 'Motor', 'Cognitive', 'Autonomic', 'Sleep', 'Psychiatric']
    outcome_stratifying_feats = dict()
    for outcome in outcomes:
        outcome_stratifying_feats[outcome] = set()
    for feat in baseline_df.columns.values[1:]:
        if train_df[feat].nunique() == 2:
            first_strata_df = train_df.loc[train_df[feat]==train_df[feat].min()]
            second_strata_df = train_df.loc[train_df[feat]==train_df[feat].max()]
            if len(first_strata_df) < 10 or len(second_strata_df) < 10:
                continue
            for outcome in outcomes:
                results = logrank_test(first_strata_df[outcome + '_T'], second_strata_df[outcome + '_T'], \
                                       first_strata_df[outcome + '_E'], second_strata_df[outcome + '_E'])
                if results.p_value <= pval_thresh:
                    outcome_stratifying_feats[outcome].add(feat)
        else:
            for percentile in [0.33, 0.5, 0.67]:
                first_strata_df = train_df.loc[train_df[feat]<=train_df[feat].quantile(percentile)]
                second_strata_df = train_df.loc[train_df[feat]>train_df[feat].quantile(percentile)]
                if len(first_strata_df) < 10 or len(second_strata_df) < 10:
                    continue
                for outcome in outcomes:
                    results = logrank_test(first_strata_df[outcome + '_T'], second_strata_df[outcome + '_T'], \
                                           first_strata_df[outcome + '_E'], second_strata_df[outcome + '_E'])
                    if results.p_value <= pval_thresh:
                        outcome_stratifying_feats[outcome].add(feat)
    return outcome_stratifying_feats

In [9]:
pval05_stratifying_feats = get_stratifying_feats()
pval05_stratifying_feats

{'Autonomic': {'ABETA_log',
  'AGE',
  'ASYNU_log',
  'Genetic PCA component 0',
  'Genetic PCA component 1',
  'Genetic PCA component 3',
  'Genetic PCA component 9',
  'LNS',
  'MOCA',
  'PTAU_ABETA_ratio',
  'REMSLEEP',
  'SEMANTIC_FLUENCY',
  'TTAU_ABETA_ratio'},
 'Cognitive': {'Genetic PCA component 3',
  'Genetic PCA component 6',
  'HTCM',
  'MOCA',
  'REMSLEEP',
  'asymmetry_index_putamen',
  'count_density_ratio_ipsilateral'},
 'Motor': {'ABETA_log',
  'AGE',
  'LNS',
  'NUPDRS2_DAILYACT',
  'PTAU_TTAU_ratio',
  'asymmetry_index_putamen'},
 'Psychiatric': {'BJLO', 'Genetic PCA component 8', 'RAWHITE'},
 'Sleep': {'AGE',
  'ASYNU_log',
  'DIASUP',
  'HVLT_immed_recall',
  'HVLT_retent',
  'SCOPA-AUT'},
 'hybrid_requiremotor': {'ABETA_log',
  'AGE',
  'LNS',
  'STAI',
  'TTAU_ABETA_ratio',
  'asymmetry_index_putamen',
  'ipsilateral_putamen'}}

In [11]:
with open('stratifying_feats_pval05_using_CMEDTM.pkl', 'w') as f:
    pickle.dump(pval05_stratifying_feats, f)