In [None]:
import numpy as np, pandas as pd, pickle
from lifelines.statistics import logrank_test

In [None]:
# Load the survival outcomes (time/event columns) for the 'PD' cohort.
# Pickle is a binary format, so the file must be opened in binary mode ('rb');
# text mode only happens to work on Python 2 and fails on Python 3.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
outcome_dir = '../ppmi_survival_models/survival_outcome_subtotals_using_CMEDTM/set_3.0_0.5/'
with open(outcome_dir + 'cohorts_time_event_dict.pkl', 'rb') as f:
    outcome_df = pickle.load(f)['PD']

# Static baseline features; ENROLL_CAT is dropped so it is not used as a feature.
baseline_df = pd.read_csv('../gather_PD_data/selected_baseline_data_using_CMEDTM.csv')
del baseline_df['ENROLL_CAT']

# Longitudinal assessments: the rows at EVENT_ID_DUR == 0 are used as the
# "screening" measurements and the rows at EVENT_ID_DUR == 0.125 as the
# "baseline" measurements (naming follows the variables below).
longitudinal_df = pd.read_csv('../gather_PD_data/selected_longitudinal_data_using_CMEDTM.csv')
screening_longitudinal_df = longitudinal_df.loc[longitudinal_df['EVENT_ID_DUR']==0]
baseline_longitudinal_df = longitudinal_df.loc[longitudinal_df['EVENT_ID_DUR']==0.125]
screening_longitudinal_cols = ['NUPDRS1', 'MOCA', 'NUPDRS2_DAILYACT', 'NUPDRS3_GAIT', 'NUPDRS3_RIGID_RIGHT',
                               'NUPDRS3_FACE', 'NUPDRS3_TREMOR', 'NUPDRS3_RIGID_LEFT']
baseline_longitudinal_cols = ['SCOPA-AUT', 'HVLT_discrim_recog', 'STAI', 'HVLT_immed_recall', 'QUIP', 'EPWORTH',
                              'GDSSHORT', 'HVLT_retent', 'BJLO', 'LNS', 'SEMANTIC_FLUENCY', 'REMSLEEP']

# Attach the selected screening / baseline assessments to the static features.
# These are inner joins on PATNO; validate='one_to_one' makes pandas raise if
# either side has more than one row per patient.
baseline_df = baseline_df.merge(screening_longitudinal_df[['PATNO']+screening_longitudinal_cols], on=['PATNO'],
                                validate='one_to_one')
baseline_df = baseline_df.merge(baseline_longitudinal_df[['PATNO']+baseline_longitudinal_cols], on=['PATNO'],
                                validate='one_to_one')

# Keep only patients with a complete feature vector, and restrict the outcomes
# to those same patients.
baseline_df = baseline_df.dropna()
outcome_df = outcome_df.loc[outcome_df['PATNO'].isin(baseline_df.PATNO.unique())]

In [None]:
# 80/20 train/test split over patients; only the training patients are kept.
# Shuffle a COPY of the patient ids: `.values` can return a view of the
# DataFrame's own storage, and np.random.shuffle works in place, so shuffling
# it directly would scramble the PATNO column of baseline_df (detaching ids
# from their feature rows) or fail on a read-only array in newer pandas.
patnos = baseline_df.PATNO.values.copy()
np.random.seed(29033)
np.random.shuffle(patnos)
train_test_split_idx = int(0.8*len(patnos))
train_patnos = patnos[:train_test_split_idx]
train_patno_set = set(train_patnos.tolist())
baseline_df = baseline_df.loc[baseline_df['PATNO'].isin(train_patno_set)]
outcome_df = outcome_df.loc[outcome_df['PATNO'].isin(train_patno_set)]

In [None]:
# Inspect the feature column names of the (training-only) baseline frame.
baseline_df.columns.values

In [None]:
# Number of columns in the baseline frame (identifier column included).
baseline_df.shape[1]

In [None]:
# Inspect the column names available in the outcome frame.
outcome_df.columns.values

In [None]:
# Combine features and outcomes into one training frame. No `on` is given, so
# pandas joins on every column the two frames have in common; the one-to-one
# validation raises if a key is duplicated on either side.
train_df = pd.merge(baseline_df, outcome_df, validate='one_to_one')

In [None]:
def _outcomes_separated_by(first_strata_df, second_strata_df, outcomes, pval_thresh):
    """Return the outcomes whose log-rank p-value between the two strata is <= pval_thresh."""
    separated = []
    for outcome in outcomes:
        # '<outcome>_T' columns are passed as durations, '<outcome>_E' as event indicators.
        results = logrank_test(first_strata_df[outcome + '_T'], second_strata_df[outcome + '_T'],
                               first_strata_df[outcome + '_E'], second_strata_df[outcome + '_E'])
        if results.p_value <= pval_thresh:
            separated.append(outcome)
    return separated


def get_stratifying_feats(pval_thresh=0.05, min_stratum_size=10, percentiles=(0.33, 0.5, 0.67)):
    """Find, per outcome, the baseline features that split patients into strata
    with significantly different survival according to a log-rank test.

    Reads the notebook-level frames ``baseline_df`` (for the list of candidate
    features; its first column is skipped -- presumably the PATNO identifier,
    confirm against the column listing) and ``train_df`` (feature values plus
    ``<outcome>_T`` / ``<outcome>_E`` columns).

    For each feature:
      * if it takes exactly two distinct values, patients are split into the
        group at the minimum value and the group at the maximum value;
      * otherwise it is split at each quantile in ``percentiles``
        (``<= cutoff`` versus ``> cutoff``).
    A split is ignored when either stratum has fewer than ``min_stratum_size``
    rows. A feature is recorded for an outcome as soon as any of its splits
    gives a p-value ``<= pval_thresh``. No multiple-testing correction is
    applied.

    Parameters
    ----------
    pval_thresh : float
        Significance threshold applied to each log-rank p-value.
    min_stratum_size : int
        Minimum number of rows required in each stratum for a split to be tested.
    percentiles : iterable of float
        Quantile levels (0..1) used to dichotomise non-binary features.

    Returns
    -------
    dict
        Maps each outcome name to the set of feature names that stratify it.
    """
    outcomes = ['hybrid_requiremotor', 'Motor', 'Cognitive', 'Autonomic', 'Sleep', 'Psychiatric']
    outcome_stratifying_feats = {outcome: set() for outcome in outcomes}
    for feat in baseline_df.columns.values[1:]:
        feat_values = train_df[feat]
        # Build the list of (first-stratum mask, second-stratum mask) pairs to test.
        if feat_values.nunique() == 2:
            splits = [(feat_values == feat_values.min(), feat_values == feat_values.max())]
        else:
            splits = []
            for percentile in percentiles:
                cutoff = feat_values.quantile(percentile)  # computed once per split
                splits.append((feat_values <= cutoff, feat_values > cutoff))
        for first_mask, second_mask in splits:
            first_strata_df = train_df.loc[first_mask]
            second_strata_df = train_df.loc[second_mask]
            # Skip splits where either stratum is too small to test.
            if len(first_strata_df) < min_stratum_size or len(second_strata_df) < min_stratum_size:
                continue
            for outcome in _outcomes_separated_by(first_strata_df, second_strata_df, outcomes, pval_thresh):
                outcome_stratifying_feats[outcome].add(feat)
    return outcome_stratifying_feats

In [None]:
# Run the feature screen at the 0.05 significance level and display the result.
pval05_stratifying_feats = get_stratifying_feats(pval_thresh=0.05)
pval05_stratifying_feats

In [None]:
# Persist the selected features. Pickle writes bytes, so the file must be
# opened in binary mode ('wb'); text mode fails on Python 3.
with open('stratifying_feats_pval05_using_CMEDTM.pkl', 'wb') as f:
    pickle.dump(pval05_stratifying_feats, f)