In [1]:
import numpy as np, pandas as pd, pickle

In [2]:
surv_df_filter2 = pd.read_csv('PD_outcomes_filter2yrs.csv')
surv_df_filter3 = pd.read_csv('PD_outcomes_filter3yrs.csv')
with open('../finalized_outcome_survival_models/final_all_covariate_sets.pkl', 'r') as f:
    all_covariate_sets = pickle.load(f)
baseline_df = pd.read_csv('../finalized_outcome_survival_models/final_survival_baseline_data.csv')

In [3]:
all_union_of_covariate_sets = dict()
outcomes = ['Autonomic', 'Cognitive', 'Motor', 'Sleep', 'Psychiatric']
for outcome in outcomes:
    all_union_of_covariate_sets[outcome] = set()
    for cov_set in all_covariate_sets.values():
        all_union_of_covariate_sets[outcome] = all_union_of_covariate_sets[outcome].union(cov_set[outcome])
        all_union_of_covariate_sets[outcome] = all_union_of_covariate_sets[outcome].union(cov_set['Standard'])
all_union_of_covariate_sets['hybrid_requiremotor'] = set()
for outcome in outcomes:
    all_union_of_covariate_sets['hybrid_requiremotor'] \
        = all_union_of_covariate_sets['hybrid_requiremotor'].union(all_union_of_covariate_sets[outcome])
all_union_of_covariate_sets['NUPDRS23_45'] = all_union_of_covariate_sets['Motor']
all_union_of_covariate_sets['MOCA_25'] = all_union_of_covariate_sets['Cognitive']
all_union_of_covariate_sets['MSEADLG_79'] = all_union_of_covariate_sets['hybrid_requiremotor']

In [5]:
np.random.seed(2033)
filter2_test_patnos = dict()
filter3_test_patnos = dict()
filter2_outcome_remaining_dfs = dict()
filter3_outcome_remaining_dfs = dict()
outcomes = ['Autonomic', 'Cognitive', 'Motor', 'Sleep', 'Psychiatric', 'hybrid_requiremotor', 'NUPDRS23_45', \
            'MOCA_25', 'MSEADLG_79']
for outcome in all_union_of_covariate_sets.keys():
    outcome_surv_df_filter2 = surv_df_filter2[['PATNO', outcome + '_T', outcome + '_E']]
    outcome_surv_df_filter2 = outcome_surv_df_filter2.loc[outcome_surv_df_filter2[outcome + '_T'] > 0]
    num_filter2_pats_orig = len(outcome_surv_df_filter2)
    num_filter2_obs_orig = outcome_surv_df_filter2[outcome + '_E'].sum()
    outcome_surv_df_filter2 \
        = outcome_surv_df_filter2.merge(baseline_df[['PATNO']+list(all_union_of_covariate_sets[outcome])], \
                                        on=['PATNO'], validate='one_to_one').dropna()
    filter2_obs_patnos = outcome_surv_df_filter2.loc[outcome_surv_df_filter2[outcome + '_E']==1].PATNO.values
    filter2_cens_patnos = outcome_surv_df_filter2.loc[outcome_surv_df_filter2[outcome + '_E']==0].PATNO.values
    np.random.shuffle(filter2_obs_patnos)
    np.random.shuffle(filter2_cens_patnos)
    filter2_obs_test_patnos = filter2_obs_patnos[int(.8*len(filter2_obs_patnos)):]
    filter2_cens_test_patnos = filter2_cens_patnos[int(.8*len(filter2_cens_patnos)):]
    filter2_test_patnos[outcome] = np.concatenate((filter2_obs_test_patnos, filter2_cens_test_patnos))
    filter2_outcome_remaining_dfs[outcome] \
        = outcome_surv_df_filter2.loc[~outcome_surv_df_filter2['PATNO'].isin(filter2_test_patnos[outcome])]
    
    outcome_surv_df_filter3 = surv_df_filter3[['PATNO', outcome + '_T', outcome + '_E']]
    outcome_surv_df_filter3 = outcome_surv_df_filter3.loc[outcome_surv_df_filter3[outcome + '_T'] > 0]
    num_filter3_pats_orig = len(outcome_surv_df_filter3)
    num_filter3_obs_orig = outcome_surv_df_filter3[outcome + '_E'].sum()
    outcome_surv_df_filter3 \
        = outcome_surv_df_filter3.merge(baseline_df[['PATNO']+list(all_union_of_covariate_sets[outcome])], \
                                        on=['PATNO'], validate='one_to_one').dropna()
    filter3_obs_patnos = outcome_surv_df_filter3.loc[outcome_surv_df_filter3[outcome + '_E']==1].PATNO.values
    filter3_cens_patnos = outcome_surv_df_filter3.loc[outcome_surv_df_filter3[outcome + '_E']==0].PATNO.values
    np.random.shuffle(filter3_obs_patnos)
    np.random.shuffle(filter3_cens_patnos)
    filter3_obs_test_patnos = filter3_obs_patnos[int(.8*len(filter3_obs_patnos)):]
    filter3_cens_test_patnos = filter3_cens_patnos[int(.8*len(filter3_cens_patnos)):]
    filter3_test_patnos[outcome] = np.concatenate((filter3_obs_test_patnos, filter3_cens_test_patnos))
    filter3_outcome_remaining_dfs[outcome] \
        = outcome_surv_df_filter3.loc[~outcome_surv_df_filter3['PATNO'].isin(filter3_test_patnos[outcome])]
    
    num_filter2_test_pats = len(filter2_test_patnos[outcome])
    test_filter2_df = outcome_surv_df_filter2.loc[outcome_surv_df_filter2['PATNO'].isin(filter2_test_patnos[outcome])]
    num_filter2_test_obs = test_filter2_df[outcome + '_E'].sum()
    num_filter2_pats = len(outcome_surv_df_filter2)
    num_filter2_obs = outcome_surv_df_filter2[outcome + '_E'].sum()
    print(outcome + ' in 2-year setting: ' + str(num_filter2_obs_orig) + ' of ' + str(num_filter2_pats_orig) \
          + ' w/o consider baseline feats, ' + str(num_filter2_obs) + ' of ' + str(num_filter2_pats) \
          + ' w/ consider baseline feats, ' + str(num_filter2_test_obs) + ' of ' + str(num_filter2_test_pats) \
          + ' of test observed')
    
    num_filter3_test_pats = len(filter3_test_patnos[outcome])
    test_filter3_df = outcome_surv_df_filter3.loc[outcome_surv_df_filter3['PATNO'].isin(filter3_test_patnos[outcome])]
    num_filter3_test_obs = test_filter3_df[outcome + '_E'].sum()
    num_filter3_pats = len(outcome_surv_df_filter3)
    num_filter3_obs = outcome_surv_df_filter3[outcome + '_E'].sum()
    print(outcome + ' in 3-year setting: ' + str(num_filter3_obs_orig) + ' of ' + str(num_filter3_pats_orig) \
          + ' w/o consider baseline feats, ' + str(num_filter3_obs) + ' of ' + str(num_filter3_pats) \
          + ' w/ consider baseline feats, ' + str(num_filter3_test_obs) + ' of ' + str(num_filter3_test_pats) \
          + ' of test observed')

Autonomic in 2-year setting: 168 of 368 w/o consider baseline feats, 122 of 254 w/ consider baseline feats, 25 of 52 of test observed
Autonomic in 3-year setting: 170 of 342 w/o consider baseline feats, 126 of 237 w/ consider baseline feats, 26 of 49 of test observed
Cognitive in 2-year setting: 51 of 361 w/o consider baseline feats, 38 of 292 w/ consider baseline feats, 8 of 59 of test observed
Cognitive in 3-year setting: 59 of 335 w/o consider baseline feats, 47 of 273 w/ consider baseline feats, 10 of 56 of test observed
Psychiatric in 2-year setting: 147 of 368 w/o consider baseline feats, 107 of 255 w/ consider baseline feats, 22 of 52 of test observed
Psychiatric in 3-year setting: 161 of 342 w/o consider baseline feats, 121 of 238 w/ consider baseline feats, 25 of 49 of test observed
MOCA_25 in 2-year setting: 74 of 286 w/o consider baseline feats, 50 of 230 w/ consider baseline feats, 10 of 46 of test observed
MOCA_25 in 3-year setting: 85 of 267 w/o consider baseline feats, 6

In [6]:
with open('test_patnos_filter2.pkl', 'w') as f:
    pickle.dump(filter2_test_patnos, f)
with open('test_patnos_filter3.pkl', 'w') as f:
    pickle.dump(filter3_test_patnos, f)

In [11]:
np.random.seed(2047)
def get_valid_and_covariate_dicts(filtered_outcome_remaining_dfs):
    filtered_valid_patnos_dict = dict()
    feats_to_remove_dict = dict()
    for outcome in outcomes:
        outcome_df = filtered_outcome_remaining_dfs[outcome]
        outcome_obs_patnos = outcome_df.loc[outcome_df[outcome + '_E']==1].PATNO.values
        outcome_cens_patnos = outcome_df.loc[outcome_df[outcome + '_E']==0].PATNO.values
        np.random.shuffle(outcome_obs_patnos)
        np.random.shuffle(outcome_cens_patnos)
        first_obs_splitpoint = int(.25*len(outcome_obs_patnos))
        second_obs_splitpoint =int(.5*len(outcome_obs_patnos))
        third_obs_splitpoint = int(.75*len(outcome_obs_patnos))
        first_cens_splitpoint = int(.25*len(outcome_cens_patnos))
        second_cens_splitpoint = int(.5*len(outcome_cens_patnos))
        third_cens_splitpoint = int(.75*len(outcome_cens_patnos))
        filtered_valid_patnos_dict[outcome] \
            = [np.concatenate((outcome_obs_patnos[:first_obs_splitpoint].flatten(), \
                               outcome_cens_patnos[:first_cens_splitpoint].flatten())), \
               np.concatenate((outcome_obs_patnos[first_obs_splitpoint:second_obs_splitpoint].flatten(), \
                               outcome_cens_patnos[first_cens_splitpoint:second_cens_splitpoint].flatten())), \
               np.concatenate((outcome_obs_patnos[second_obs_splitpoint:third_obs_splitpoint].flatten(), \
                               outcome_cens_patnos[second_cens_splitpoint:third_cens_splitpoint].flatten())), \
               np.concatenate((outcome_obs_patnos[third_obs_splitpoint:].flatten(), \
                               outcome_cens_patnos[third_cens_splitpoint].flatten()))]
        feats_to_remove = set()
        for col in outcome_df.columns:
            if col in {'PATNO', outcome + '_T', outcome + '_E'}:
                continue
            for fold_idx in range(len(filtered_valid_patnos_dict[outcome])):
                if outcome_df.loc[~outcome_df['PATNO'].isin(filtered_valid_patnos_dict[outcome][fold_idx])][col].std() == 0:
                    feats_to_remove.add(col)
                    break
        print(outcome)
        print(feats_to_remove)
        feats_to_remove_dict[outcome] = feats_to_remove
    return filtered_valid_patnos_dict, feats_to_remove_dict


In [12]:
filter2_valid_patnos_dict, filter2_feats_to_remove = get_valid_and_covariate_dicts(filter2_outcome_remaining_dfs)

Autonomic
set([])
Cognitive
set(['BJLOT3'])
Motor
set([])
Sleep
set([])
Psychiatric
set(['TMSEX', 'TMTRWD', 'TMGAMBLE'])
hybrid_requiremotor
set(['TMSEX', 'SCAU16', 'TMTRWD', 'CNTRLSEX', 'TMGAMBLE'])
NUPDRS23_45
set([])
MOCA_25
set(['MCAVIGIL', 'BJLOT3', 'MCADATE'])
MSEADLG_79
set(['TMSEX', 'TMTRWD', 'TMGAMBLE'])


In [13]:
filter3_valid_patnos_dict, filter3_feats_to_remove = get_valid_and_covariate_dicts(filter3_outcome_remaining_dfs)

Autonomic
set([])
Cognitive
set(['LNS1C'])
Motor
set([])
Sleep
set(['STROKE'])
Psychiatric
set(['TMSEX', 'TMTRWD', 'TMGAMBLE'])
hybrid_requiremotor
set(['TMTRWD', 'BJLOT3', 'TMSEX', 'STROKE', 'TMGAMBLE', 'LNS1C'])
NUPDRS23_45
set([])
MOCA_25
set(['BJLOT13', 'MCACLCKC', 'BJLOT3'])
MSEADLG_79
set(['TMSEX', 'TMTRWD', 'TMGAMBLE'])


In [18]:
filter2_feats_to_remove['Autonomic'].add('SCAU16')

In [20]:
filter3_feats_to_remove['Autonomic'].add('SCAU7')

In [22]:
filter3_feats_to_remove['MSEADLG_79'].add('SLEEP AID')

In [24]:
filter2_feats_to_remove['MSEADLG_79'].add('SLEEP AID')

In [None]:
with open('valid_patnos_filter2.pkl', 'w') as f:
    pickle.dump(filter2_valid_patnos_dict, f)
with open('valid_patnos_filter3.pkl', 'w') as f:
    pickle.dump(filter3_valid_patnos_dict, f)

In [2]:
import pickle
with open('feats_to_remove_filter2.pkl', 'r') as f:
    filter2_feats_to_remove = pickle.load(f)
with open('feats_to_remove_filter3.pkl', 'r') as f:
    filter3_feats_to_remove = pickle.load(f)

In [3]:
filter2_feats_to_remove['Psychiatric'].add('CNTRLSEX')
filter3_feats_to_remove['Psychiatric'].add('PhysExam_Psychiatric')
filter3_feats_to_remove['Motor'].add('NP2FREZ')

In [6]:
filter2_feats_to_remove['MSEADLG_79'].add('NP1HALL')
filter3_feats_to_remove['MSEADLG_79'].add('NP1HALL')

In [8]:
filter2_feats_to_remove['Sleep'].add('STROKE')

In [None]:
filter3_feats_to_remove['Autonomic'].add('SCAU16')

In [9]:
with open('feats_to_remove_filter2.pkl', 'w') as f:
    pickle.dump(filter2_feats_to_remove, f)
with open('feats_to_remove_filter3.pkl', 'w') as f:
    pickle.dump(filter3_feats_to_remove, f)

very low variance when conditioned on death event, i.e. completely determines whether a subject dies or not -> convergence issues (probably just too few samples and overfitting)

In [11]:
# actually run the check myself to see if there are other features to remove
import numpy as np, pandas as pd, pickle
with open('test_patnos_filter2.pkl', 'r') as f:
    filter2_test_patnos = pickle.load(f)
with open('test_patnos_filter3.pkl', 'r') as f:
    filter3_test_patnos = pickle.load(f)
with open('valid_patnos_filter2.pkl', 'r') as f:
    filter2_valid_patnos = pickle.load(f)
with open('valid_patnos_filter3.pkl', 'r') as f:
    filter3_valid_patnos = pickle.load(f)
surv_df_filter2 = pd.read_csv('PD_outcomes_filter2yrs.csv')
surv_df_filter3 = pd.read_csv('PD_outcomes_filter3yrs.csv')
with open('../finalized_outcome_survival_models/final_all_covariate_sets.pkl', 'r') as f:
    all_covariate_sets = pickle.load(f)
baseline_df = pd.read_csv('../finalized_outcome_survival_models/final_survival_baseline_data.csv')

In [16]:
outcomes = ['Motor', 'Cognitive', 'Psychiatric', 'Autonomic', 'Sleep', 'hybrid_requiremotor', 'NUPDRS23_45', \
            'MOCA_25', 'MSEADLG_79']
filtered2_covariate_sets = dict()
filtered3_covariate_sets = dict()
filtered2_covariate_sets_removed_feats = dict()
filtered3_covariate_sets_removed_feats = dict()
for cov_set in all_covariate_sets.keys():
    filtered2_covariate_sets[cov_set] = dict()
    filtered3_covariate_sets[cov_set] = dict()
    filtered2_covariate_sets_removed_feats[cov_set] = dict()
    filtered3_covariate_sets_removed_feats[cov_set] = dict()
    for outcome in outcomes:
        if outcome == 'NUPDRS23_45':
            outcome_cov_set = list(all_covariate_sets[cov_set]['Standard']) \
                + list(all_covariate_sets[cov_set]['Motor'])
        elif outcome == 'MOCA_25':
            outcome_cov_set = list(all_covariate_sets[cov_set]['Standard']) \
                + list(all_covariate_sets[cov_set]['Cognitive'])
        elif outcome == 'MSEADLG_79' or outcome == 'hybrid_requiremotor':
            outcome_cov_set = list(all_covariate_sets[cov_set]['Standard']) \
                + list(all_covariate_sets[cov_set]['Motor']) + list(all_covariate_sets[cov_set]['Cognitive']) \
                + list(all_covariate_sets[cov_set]['Autonomic']) + list(all_covariate_sets[cov_set]['Psychiatric']) \
                + list(all_covariate_sets[cov_set]['Sleep'])
        else:
            outcome_cov_set = list(all_covariate_sets[cov_set]['Standard']) \
                + list(all_covariate_sets[cov_set][outcome])
        outcome2_cov_set = list(set(outcome_cov_set))
        outcome3_cov_set = list(set(outcome_cov_set))
        outcome2_removed_feats = set()
        outcome3_removed_feats = set()
        outcome2_df = surv_df_filter2[['PATNO', outcome + '_E', \
                                       outcome + '_T']].merge(baseline_df[['PATNO']+outcome2_cov_set]).dropna()
        outcome2_df = outcome2_df.loc[outcome2_df[outcome + '_T'] > 0]
        train_valid2_df = outcome2_df.loc[~outcome2_df['PATNO'].isin(filter2_test_patnos[outcome])]
        outcome3_df = surv_df_filter3[['PATNO', outcome + '_E', \
                                       outcome + '_T']].merge(baseline_df[['PATNO']+outcome3_cov_set]).dropna()
        outcome3_df = outcome3_df.loc[outcome3_df[outcome + '_T'] > 0]
        train_valid3_df = outcome3_df.loc[~outcome3_df['PATNO'].isin(filter3_test_patnos[outcome])]
        for feat in outcome_cov_set:
            for fold_idx in range(4):
                train2_df \
                    = train_valid2_df.loc[~train_valid2_df['PATNO'].isin(filter2_valid_patnos[outcome][fold_idx])]
                obs2_df = train2_df.loc[train2_df[outcome + '_E'] == 1]
                cens2_df = train2_df.loc[train2_df[outcome + '_E'] == 0]
                obs2_std = obs2_df[feat].std()
                cens2_std = cens2_df[feat].std()
                if obs2_std == 0 or cens2_std == 0:
                    outcome2_cov_set.remove(feat)
                    outcome2_removed_feats.add(feat)
                    break
                train3_df \
                    = train_valid3_df.loc[~train_valid3_df['PATNO'].isin(filter3_valid_patnos[outcome][fold_idx])]
                obs3_df = train3_df.loc[train3_df[outcome + '_E'] == 1]
                cens3_df = train3_df.loc[train3_df[outcome + '_E'] == 0]
                obs3_std = obs3_df[feat].std()
                cens3_std = cens3_df[feat].std()
                if obs3_std == 0 or cens3_std == 0:
                    outcome3_cov_set.remove(feat)
                    outcome3_removed_feats.add(feat)
                    break
        filtered2_covariate_sets[cov_set][outcome] = outcome2_cov_set
        filtered3_covariate_sets[cov_set][outcome] = outcome3_cov_set
        filtered2_covariate_sets_removed_feats[cov_set][outcome] = outcome2_removed_feats
        filtered3_covariate_sets_removed_feats[cov_set][outcome] = outcome3_removed_feats
print(filtered2_covariate_sets_removed_feats)
print(filtered3_covariate_sets_removed_feats)

{'questions_plus_treatment_imaging_CSF_expanded_genetic': {'Autonomic': set(['SCAU16']), 'Cognitive': set(['MCAFDS', 'LNS1C', 'BJLOT3', 'MCADATE', 'MCACLCKC', 'MCAVIGIL', 'LNS2B', 'LNS1B']), 'Psychiatric': set(['TMTRWD', 'GDSWRTLS', 'TMSEX', 'PhysExam_Psychiatric', 'CNTRLSEX', 'TMGAMBLE']), 'MOCA_25': set(['BJLOT5', 'MCAFDS', 'BJLOT7', 'LNS1C', 'BJLOT3', 'MCADATE', 'BJLOT9', 'MCACLCKC', 'MCACLCKN', 'MCAVIGIL', 'BJLOT15', 'LNS2B', 'MCABDS', 'LNS2A', 'LNS1B']), 'Sleep': set([]), 'Motor': set([]), 'NUPDRS23_45': set([]), 'MSEADLG_79': set(['MCACLCKN', 'HETRA', 'GDSWRTLS', 'SLEEP AID', 'SCAU23A', 'MCAVIGIL', 'SCAU18', 'LNS3C', 'SCAU16', 'LNS1C', 'NP2FREZ', 'DRMNOCTB', 'SLPINJUR', 'GDSAFRAD', 'NP3RTALL', 'GDSEMPTY', 'CNTRLSEX', 'TMTRWD', 'RAWHITE', 'SCAU26B', 'ESS6', 'TMSEX', 'BJLOT15', 'PhysExam_Psychiatric', 'BJLOT17', 'LNS2B', 'MCARHINO', 'LNS2A', 'SCAU24', 'GDSHOPLS', 'BJLOT3', 'MCADATE', 'CNTRLEAT', 'NP1HALL', 'DRMUMV', 'STROKE', 'ESS8', 'RLS', 'TMGAMBLE']), 'hybrid_requiremotor': set(

In [20]:
# modifications based on later runs
filtered3_covariate_sets['questions_plus_treatment_imaging_CSF_expanded_genetic']['Cognitive'].remove('BJLOT3')
filtered3_covariate_sets['questions_plus_treatment_imaging_CSF_expanded_genetic']['Cognitive'].remove('LNS1B')
filtered3_covariate_sets['questions_plus_treatment_imaging_CSF_expanded']['Cognitive'].remove('LNS1B')
filtered3_covariate_sets_removed_feats['questions_plus_treatment_imaging_CSF_expanded_genetic']['Cognitive'].add('BJLOT3')
filtered3_covariate_sets_removed_feats['questions_plus_treatment_imaging_CSF_expanded_genetic']['Cognitive'].add('LNS1B')
filtered3_covariate_sets_removed_feats['questions_plus_treatment_imaging_CSF_expanded']['Cognitive'].add('LNS1B')


In [17]:
with open('feats_to_remove_filter2.pkl', 'r') as f:
    filter2_feats_to_remove = pickle.load(f)
with open('feats_to_remove_filter3.pkl', 'r') as f:
    filter3_feats_to_remove = pickle.load(f)
print(filter2_feats_to_remove)
print(filter3_feats_to_remove)

{'Autonomic': set(['SCAU16']), 'Cognitive': set(['BJLOT3']), 'Psychiatric': set(['TMSEX', 'TMTRWD', 'TMGAMBLE', 'CNTRLSEX']), 'MOCA_25': set(['MCAVIGIL', 'BJLOT3', 'MCADATE']), 'Sleep': set(['STROKE']), 'Motor': set([]), 'NUPDRS23_45': set([]), 'MSEADLG_79': set(['TMSEX', 'TMTRWD', 'TMGAMBLE', 'SLEEP AID']), 'hybrid_requiremotor': set(['TMSEX', 'SCAU16', 'TMTRWD', 'CNTRLSEX', 'TMGAMBLE'])}
{'Autonomic': set(['SCAU7']), 'Cognitive': set(['LNS1C']), 'Psychiatric': set(['TMSEX', 'TMTRWD', 'TMGAMBLE', 'PhysExam_Psychiatric']), 'MOCA_25': set(['BJLOT13', 'MCACLCKC', 'BJLOT3']), 'Sleep': set(['STROKE']), 'Motor': set(['NP2FREZ']), 'NUPDRS23_45': set([]), 'MSEADLG_79': set(['NP1HALL', 'TMSEX', 'TMTRWD', 'TMGAMBLE', 'SLEEP AID']), 'hybrid_requiremotor': set(['TMTRWD', 'BJLOT3', 'TMSEX', 'STROKE', 'TMGAMBLE', 'LNS1C'])}


In [23]:
for cov_set in filtered2_covariate_sets.keys():
    for outcome in filter2_feats_to_remove.keys():
        for feat in filter2_feats_to_remove[outcome]:
            if feat in filtered2_covariate_sets[cov_set][outcome]:
                filtered2_covariate_sets[cov_set][outcome].remove(feat)
                filtered2_covariate_sets_removed_feats[cov_set][outcome].add(feat)

for cov_set in filtered3_covariate_sets.keys():
    for outcome in filter3_feats_to_remove.keys():
        for feat in filter3_feats_to_remove[outcome]:
            if feat in filtered3_covariate_sets[cov_set][outcome]:
                filtered3_covariate_sets[cov_set][outcome].remove(feat)
                filtered3_covariate_sets_removed_feats[cov_set][outcome].add(feat)
                

In [25]:
# add PhysExam_Psychiatric to all hybrid_requiremotor
for cov_set in filtered2_covariate_sets.keys():
    if 'PhysExam_Psychiatric' in filtered2_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered2_covariate_sets[cov_set]['hybrid_requiremotor'].remove('PhysExam_Psychiatric')
        filtered2_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('PhysExam_Psychiatric')

for cov_set in filtered3_covariate_sets.keys():
    if 'PhysExam_Psychiatric' in filtered3_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered3_covariate_sets[cov_set]['hybrid_requiremotor'].remove('PhysExam_Psychiatric')
        filtered3_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('PhysExam_Psychiatric')

                                                                                   

In [2]:
import pickle
with open('finalized_covariate_sets_filter2.pkl', 'r') as f:
    filtered2_covariate_sets = pickle.load(f)
with open('finalized_covariate_sets_filter3.pkl', 'r') as f:
    filtered3_covariate_sets = pickle.load(f)
with open('finalized_removed_feats_filter2.pkl', 'r') as f:
    filtered2_covariate_sets_removed_feats = pickle.load(f)
with open('finalized_removed_feats_filter3.pkl', 'r') as f:
    filtered3_covariate_sets_removed_feats = pickle.load(f)

In [10]:
# add PhysExam_Psychiatric to all MSEADLG_79
# add 'MCACLCKC', 'MCAALTTM', 'LNS1C', 'MCADATE' to all cognitive, MOCA_25, hybrid_requiremotor, MSEADLG_79
for cov_set in filtered2_covariate_sets.keys():
    if 'PhysExam_Psychiatric' in filtered2_covariate_sets[cov_set]['MSEADLG_79']:
        filtered2_covariate_sets[cov_set]['MSEADLG_79'].remove('PhysExam_Psychiatric')
        filtered2_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('PhysExam_Psychiatric')
        
    if 'MCACLCKC' in filtered2_covariate_sets[cov_set]['Cognitive']:
        filtered2_covariate_sets[cov_set]['Cognitive'].remove('MCACLCKC')
        filtered2_covariate_sets_removed_feats[cov_set]['Cognitive'].add('MCACLCKC')
    if 'MCAALTTM' in filtered2_covariate_sets[cov_set]['Cognitive']:
        filtered2_covariate_sets[cov_set]['Cognitive'].remove('MCAALTTM')
        filtered2_covariate_sets_removed_feats[cov_set]['Cognitive'].add('MCAALTTM')
    if 'LNS1C' in filtered2_covariate_sets[cov_set]['Cognitive']:
        filtered2_covariate_sets[cov_set]['Cognitive'].remove('LNS1C')
        filtered2_covariate_sets_removed_feats[cov_set]['Cognitive'].add('LNS1C')
    if 'MCADATE' in filtered2_covariate_sets[cov_set]['Cognitive']:
        filtered2_covariate_sets[cov_set]['Cognitive'].remove('MCADATE')
        filtered2_covariate_sets_removed_feats[cov_set]['Cognitive'].add('MCADATE')
    
    if 'MCACLCKC' in filtered2_covariate_sets[cov_set]['MOCA_25']:
        filtered2_covariate_sets[cov_set]['MOCA_25'].remove('MCACLCKC')
        filtered2_covariate_sets_removed_feats[cov_set]['MOCA_25'].add('MCACLCKC')
    if 'MCAALTTM' in filtered2_covariate_sets[cov_set]['MOCA_25']:
        filtered2_covariate_sets[cov_set]['MOCA_25'].remove('MCAALTTM')
        filtered2_covariate_sets_removed_feats[cov_set]['MOCA_25'].add('MCAALTTM')
    if 'LNS1C' in filtered2_covariate_sets[cov_set]['MOCA_25']:
        filtered2_covariate_sets[cov_set]['MOCA_25'].remove('LNS1C')
        filtered2_covariate_sets_removed_feats[cov_set]['MOCA_25'].add('LNS1C')
    if 'MCADATE' in filtered2_covariate_sets[cov_set]['MOCA_25']:
        filtered2_covariate_sets[cov_set]['MOCA_25'].remove('MCADATE')
        filtered2_covariate_sets_removed_feats[cov_set]['MOCA_25'].add('MCADATE')
        
    if 'MCACLCKC' in filtered2_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered2_covariate_sets[cov_set]['hybrid_requiremotor'].remove('MCACLCKC')
        filtered2_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('MCACLCKC')
    if 'MCAALTTM' in filtered2_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered2_covariate_sets[cov_set]['hybrid_requiremotor'].remove('MCAALTTM')
        filtered2_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('MCAALTTM')
    if 'LNS1C' in filtered2_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered2_covariate_sets[cov_set]['hybrid_requiremotor'].remove('LNS1C')
        filtered2_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('LNS1C')
    if 'MCADATE' in filtered2_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered2_covariate_sets[cov_set]['hybrid_requiremotor'].remove('MCADATE')
        filtered2_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('MCADATE')
        
    if 'MCACLCKC' in filtered2_covariate_sets[cov_set]['MSEADLG_79']:
        filtered2_covariate_sets[cov_set]['MSEADLG_79'].remove('MCACLCKC')
        filtered2_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('MCACLCKC')
    if 'MCAALTTM' in filtered2_covariate_sets[cov_set]['MSEADLG_79']:
        filtered2_covariate_sets[cov_set]['MSEADLG_79'].remove('MCAALTTM')
        filtered2_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('MCAALTTM')
    if 'LNS1C' in filtered2_covariate_sets[cov_set]['MSEADLG_79']:
        filtered2_covariate_sets[cov_set]['MSEADLG_79'].remove('LNS1C')
        filtered2_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('LNS1C')
    if 'MCADATE' in filtered2_covariate_sets[cov_set]['MSEADLG_79']:
        filtered2_covariate_sets[cov_set]['MSEADLG_79'].remove('MCADATE')
        filtered2_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('MCADATE')

In [11]:
for cov_set in filtered3_covariate_sets.keys():
    if 'PhysExam_Psychiatric' in filtered3_covariate_sets[cov_set]['MSEADLG_79']:
        filtered3_covariate_sets[cov_set]['MSEADLG_79'].remove('PhysExam_Psychiatric')
        filtered3_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('PhysExam_Psychiatric')
        
    if 'MCACLCKC' in filtered3_covariate_sets[cov_set]['Cognitive']:
        filtered3_covariate_sets[cov_set]['Cognitive'].remove('MCACLCKC')
        filtered3_covariate_sets_removed_feats[cov_set]['Cognitive'].add('MCACLCKC')
    if 'MCAALTTM' in filtered3_covariate_sets[cov_set]['Cognitive']:
        filtered3_covariate_sets[cov_set]['Cognitive'].remove('MCAALTTM')
        filtered3_covariate_sets_removed_feats[cov_set]['Cognitive'].add('MCAALTTM')
    if 'LNS1C' in filtered3_covariate_sets[cov_set]['Cognitive']:
        filtered3_covariate_sets[cov_set]['Cognitive'].remove('LNS1C')
        filtered3_covariate_sets_removed_feats[cov_set]['Cognitive'].add('LNS1C')
    if 'MCADATE' in filtered3_covariate_sets[cov_set]['Cognitive']:
        filtered3_covariate_sets[cov_set]['Cognitive'].remove('MCADATE')
        filtered3_covariate_sets_removed_feats[cov_set]['Cognitive'].add('MCADATE')
    
    if 'MCACLCKC' in filtered3_covariate_sets[cov_set]['MOCA_25']:
        filtered3_covariate_sets[cov_set]['MOCA_25'].remove('MCACLCKC')
        filtered3_covariate_sets_removed_feats[cov_set]['MOCA_25'].add('MCACLCKC')
    if 'MCAALTTM' in filtered3_covariate_sets[cov_set]['MOCA_25']:
        filtered3_covariate_sets[cov_set]['MOCA_25'].remove('MCAALTTM')
        filtered3_covariate_sets_removed_feats[cov_set]['MOCA_25'].add('MCAALTTM')
    if 'LNS1C' in filtered3_covariate_sets[cov_set]['MOCA_25']:
        filtered3_covariate_sets[cov_set]['MOCA_25'].remove('LNS1C')
        filtered3_covariate_sets_removed_feats[cov_set]['MOCA_25'].add('LNS1C')
    if 'MCADATE' in filtered3_covariate_sets[cov_set]['MOCA_25']:
        filtered3_covariate_sets[cov_set]['MOCA_25'].remove('MCADATE')
        filtered3_covariate_sets_removed_feats[cov_set]['MOCA_25'].add('MCADATE')
        
    if 'MCACLCKC' in filtered3_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered3_covariate_sets[cov_set]['hybrid_requiremotor'].remove('MCACLCKC')
        filtered3_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('MCACLCKC')
    if 'MCAALTTM' in filtered3_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered3_covariate_sets[cov_set]['hybrid_requiremotor'].remove('MCAALTTM')
        filtered3_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('MCAALTTM')
    if 'LNS1C' in filtered3_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered3_covariate_sets[cov_set]['hybrid_requiremotor'].remove('LNS1C')
        filtered3_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('LNS1C')
    if 'MCADATE' in filtered3_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered3_covariate_sets[cov_set]['hybrid_requiremotor'].remove('MCADATE')
        filtered3_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('MCADATE')
        
    if 'MCACLCKC' in filtered3_covariate_sets[cov_set]['MSEADLG_79']:
        filtered3_covariate_sets[cov_set]['MSEADLG_79'].remove('MCACLCKC')
        filtered3_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('MCACLCKC')
    if 'MCAALTTM' in filtered3_covariate_sets[cov_set]['MSEADLG_79']:
        filtered3_covariate_sets[cov_set]['MSEADLG_79'].remove('MCAALTTM')
        filtered3_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('MCAALTTM')
    if 'LNS1C' in filtered3_covariate_sets[cov_set]['MSEADLG_79']:
        filtered3_covariate_sets[cov_set]['MSEADLG_79'].remove('LNS1C')
        filtered3_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('LNS1C')
    if 'MCADATE' in filtered3_covariate_sets[cov_set]['MSEADLG_79']:
        filtered3_covariate_sets[cov_set]['MSEADLG_79'].remove('MCADATE')
        filtered3_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('MCADATE')

In [6]:
# add 'CNTRLSEX' for Psychiatric, hybrid_requiremotor, MSEADLG_79
for cov_set in filtered2_covariate_sets.keys():
    if 'CNTRLSEX' in filtered2_covariate_sets[cov_set]['Psychiatric']:
        filtered2_covariate_sets[cov_set]['Psychiatric'].remove('CNTRLSEX')
        filtered2_covariate_sets_removed_feats[cov_set]['Psychiatric'].add('CNTRLSEX')
    if 'CNTRLSEX' in filtered2_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered2_covariate_sets[cov_set]['hybrid_requiremotor'].remove('CNTRLSEX')
        filtered2_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('CNTRLSEX')
    if 'CNTRLSEX' in filtered2_covariate_sets[cov_set]['MSEADLG_79']:
        filtered2_covariate_sets[cov_set]['MSEADLG_79'].remove('CNTRLSEX')
        filtered2_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('CNTRLSEX')
for cov_set in filtered3_covariate_sets.keys():
    if 'CNTRLSEX' in filtered3_covariate_sets[cov_set]['Psychiatric']:
        filtered3_covariate_sets[cov_set]['Psychiatric'].remove('CNTRLSEX')
        filtered3_covariate_sets_removed_feats[cov_set]['Psychiatric'].add('CNTRLSEX')
    if 'CNTRLSEX' in filtered3_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered3_covariate_sets[cov_set]['hybrid_requiremotor'].remove('CNTRLSEX')
        filtered3_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('CNTRLSEX')
    if 'CNTRLSEX' in filtered3_covariate_sets[cov_set]['MSEADLG_79']:
        filtered3_covariate_sets[cov_set]['MSEADLG_79'].remove('CNTRLSEX')
        filtered3_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('CNTRLSEX')

In [13]:
# add SCAU16 to Autonomic, hybrid_requiremotor, MSEADLG_79
for cov_set in filtered2_covariate_sets.keys():
    if 'SCAU16' in filtered2_covariate_sets[cov_set]['Autonomic']:
        filtered2_covariate_sets[cov_set]['Autonomic'].remove('SCAU16')
        filtered2_covariate_sets_removed_feats[cov_set]['Autonomic'].add('SCAU16')
    if 'SCAU16' in filtered2_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered2_covariate_sets[cov_set]['hybrid_requiremotor'].remove('SCAU16')
        filtered2_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('SCAU16')
    if 'SCAU16' in filtered2_covariate_sets[cov_set]['MSEADLG_79']:
        filtered2_covariate_sets[cov_set]['MSEADLG_79'].remove('SCAU16')
        filtered2_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('SCAU16')
for cov_set in filtered3_covariate_sets.keys():
    if 'SCAU16' in filtered3_covariate_sets[cov_set]['Autonomic']:
        filtered3_covariate_sets[cov_set]['Autonomic'].remove('SCAU16')
        filtered3_covariate_sets_removed_feats[cov_set]['Autonomic'].add('SCAU16')
    if 'SCAU16' in filtered3_covariate_sets[cov_set]['hybrid_requiremotor']:
        filtered3_covariate_sets[cov_set]['hybrid_requiremotor'].remove('SCAU16')
        filtered3_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'].add('SCAU16')
    if 'SCAU16' in filtered3_covariate_sets[cov_set]['MSEADLG_79']:
        filtered3_covariate_sets[cov_set]['MSEADLG_79'].remove('SCAU16')
        filtered3_covariate_sets_removed_feats[cov_set]['MSEADLG_79'].add('SCAU16')

In [14]:
with open('finalized_covariate_sets_filter2.pkl', 'w') as f:
    pickle.dump(filtered2_covariate_sets, f)
with open('finalized_covariate_sets_filter3.pkl', 'w') as f:
    pickle.dump(filtered3_covariate_sets, f)
with open('finalized_removed_feats_filter2.pkl', 'w') as f:
    pickle.dump(filtered2_covariate_sets_removed_feats, f)
with open('finalized_removed_feats_filter3.pkl', 'w') as f:
    pickle.dump(filtered3_covariate_sets_removed_feats, f)

In [None]:
# redo above for frequency less than 5% remove
# if remove from set for 2, also remove from set for 3
# if remove from a set, also automatically remove from hybrid and MSEADLG
# link the motor, cognitive, hybrid pairs

In [2]:
import numpy as np, pandas as pd, pickle
with open('test_patnos_filter2.pkl', 'r') as f:
    filter2_test_patnos = pickle.load(f)
with open('test_patnos_filter3.pkl', 'r') as f:
    filter3_test_patnos = pickle.load(f)
with open('valid_patnos_filter2.pkl', 'r') as f:
    filter2_valid_patnos = pickle.load(f)
with open('valid_patnos_filter3.pkl', 'r') as f:
    filter3_valid_patnos = pickle.load(f)
surv_df_filter2 = pd.read_csv('PD_outcomes_filter2yrs.csv')
surv_df_filter3 = pd.read_csv('PD_outcomes_filter3yrs.csv')
with open('../finalized_outcome_survival_models/final_all_covariate_sets.pkl', 'r') as f:
    all_covariate_sets = pickle.load(f)
baseline_df = pd.read_csv('../finalized_outcome_survival_models/final_survival_baseline_data.csv')

In [5]:
outcomes = ['Motor', 'Cognitive', 'Psychiatric', 'Autonomic', 'Sleep', 'hybrid_requiremotor', 'NUPDRS23_45', \
            'MOCA_25', 'MSEADLG_79']
filtered2_covariate_sets = dict()
filtered3_covariate_sets = dict()
filtered2_covariate_sets_removed_feats = dict()
filtered3_covariate_sets_removed_feats = dict()
for cov_set in all_covariate_sets.keys():
    filtered2_covariate_sets[cov_set] = dict()
    filtered3_covariate_sets[cov_set] = dict()
    filtered2_covariate_sets_removed_feats[cov_set] = dict()
    filtered3_covariate_sets_removed_feats[cov_set] = dict()
    for outcome in outcomes:
        if outcome == 'NUPDRS23_45':
            outcome_cov_set = list(all_covariate_sets[cov_set]['Standard']) \
                + list(all_covariate_sets[cov_set]['Motor'])
        elif outcome == 'MOCA_25':
            outcome_cov_set = list(all_covariate_sets[cov_set]['Standard']) \
                + list(all_covariate_sets[cov_set]['Cognitive'])
        elif outcome == 'MSEADLG_79' or outcome == 'hybrid_requiremotor':
            outcome_cov_set = list(all_covariate_sets[cov_set]['Standard']) \
                + list(all_covariate_sets[cov_set]['Motor']) + list(all_covariate_sets[cov_set]['Cognitive']) \
                + list(all_covariate_sets[cov_set]['Autonomic']) + list(all_covariate_sets[cov_set]['Psychiatric']) \
                + list(all_covariate_sets[cov_set]['Sleep'])
        else:
            outcome_cov_set = list(all_covariate_sets[cov_set]['Standard']) \
                + list(all_covariate_sets[cov_set][outcome])
        outcome2_cov_set = list(set(outcome_cov_set))
        outcome3_cov_set = list(set(outcome_cov_set))
        outcome2_removed_feats = set()
        outcome3_removed_feats = set()
        outcome2_df = surv_df_filter2[['PATNO', outcome + '_E', \
                                       outcome + '_T']].merge(baseline_df[['PATNO']+outcome2_cov_set]).dropna()
        outcome2_df = outcome2_df.loc[outcome2_df[outcome + '_T'] > 0]
        train_valid2_df = outcome2_df.loc[~outcome2_df['PATNO'].isin(filter2_test_patnos[outcome])]
        outcome3_df = surv_df_filter3[['PATNO', outcome + '_E', \
                                       outcome + '_T']].merge(baseline_df[['PATNO']+outcome3_cov_set]).dropna()
        outcome3_df = outcome3_df.loc[outcome3_df[outcome + '_T'] > 0]
        train_valid3_df = outcome3_df.loc[~outcome3_df['PATNO'].isin(filter3_test_patnos[outcome])]
        for feat in outcome_cov_set:
            for fold_idx in range(4):
                train2_df \
                    = train_valid2_df.loc[~train_valid2_df['PATNO'].isin(filter2_valid_patnos[outcome][fold_idx])]
                obs2_df = train2_df.loc[train2_df[outcome + '_E'] == 1]
                cens2_df = train2_df.loc[train2_df[outcome + '_E'] == 0]
                obs2_std = obs2_df[feat].std()
                cens2_std = cens2_df[feat].std()
                if set(train2_df[feat].unique().tolist()).issubset({0,1}):
                    feat_freq2 = train2_df[feat].mean()
                else:
                    feat_freq2 = .5
                if obs2_std == 0 or cens2_std == 0 or feat_freq2 < .05 or feat_freq2 > .95:
                    outcome2_cov_set.remove(feat)
                    outcome2_removed_feats.add(feat)
                    break
                train3_df \
                    = train_valid3_df.loc[~train_valid3_df['PATNO'].isin(filter3_valid_patnos[outcome][fold_idx])]
                obs3_df = train3_df.loc[train3_df[outcome + '_E'] == 1]
                cens3_df = train3_df.loc[train3_df[outcome + '_E'] == 0]
                obs3_std = obs3_df[feat].std()
                cens3_std = cens3_df[feat].std()
                if set(train3_df[feat].unique().tolist()).issubset({0,1}):
                    feat_freq3 = train3_df[feat].mean()
                else:
                    feat_freq3 = .5
                if obs3_std == 0 or cens3_std == 0 or feat_freq3 < .05 or feat_freq3 > .95:
                    outcome3_cov_set.remove(feat)
                    outcome3_removed_feats.add(feat)
                    break
        filtered2_covariate_sets[cov_set][outcome] = outcome2_cov_set
        filtered3_covariate_sets[cov_set][outcome] = outcome3_cov_set
        filtered2_covariate_sets_removed_feats[cov_set][outcome] = outcome2_removed_feats
        filtered3_covariate_sets_removed_feats[cov_set][outcome] = outcome3_removed_feats
share_remove_cov_sets = ['questions_plus_treatment_imaging_CSF_expanded_genetic', \
                         'questions_plus_treatment_imaging_CSF_expanded', \
                         'standard_plus_treatment_imaging_CSF_expanded']
for outcome in outcomes:
    outcome_shared_removed = set()
    for cov_set in share_remove_cov_sets:
        outcome_shared_removed = outcome_shared_removed.union(filtered2_covariate_sets_removed_feats[cov_set][outcome])
        outcome_shared_removed = outcome_shared_removed.union(filtered3_covariate_sets_removed_feats[cov_set][outcome])
    for cov_set in share_remove_cov_sets:
        filtered2_covariate_sets_removed_feats[cov_set][outcome] = outcome_shared_removed
        filtered3_covariate_sets_removed_feats[cov_set][outcome] = outcome_shared_removed
for cov_set in share_remove_cov_sets:
    cog_related2_removed \
        = filtered2_covariate_sets_removed_feats[cov_set]['Cognitive'].union(filtered2_covariate_sets_removed_feats[cov_set]['MOCA_25'])
    filtered2_covariate_sets_removed_feats[cov_set]['Cognitive'] = cog_related2_removed
    filtered2_covariate_sets_removed_feats[cov_set]['MOCA_25'] = cog_related2_removed
    
    cog_related3_removed \
        = filtered3_covariate_sets_removed_feats[cov_set]['Cognitive'].union(filtered3_covariate_sets_removed_feats[cov_set]['MOCA_25'])
    filtered3_covariate_sets_removed_feats[cov_set]['Cognitive'] = cog_related3_removed
    filtered3_covariate_sets_removed_feats[cov_set]['MOCA_25'] = cog_related3_removed
    
    motor_related2_removed \
        = filtered2_covariate_sets_removed_feats[cov_set]['Motor'].union(filtered2_covariate_sets_removed_feats[cov_set]['NUPDRS23_45'])
    filtered2_covariate_sets_removed_feats[cov_set]['Motor'] =  motor_related2_removed
    filtered2_covariate_sets_removed_feats[cov_set]['NUPDRS23_45'] = motor_related2_removed
    
    motor_related3_removed \
        = filtered3_covariate_sets_removed_feats[cov_set]['Motor'].union(filtered3_covariate_sets_removed_feats[cov_set]['NUPDRS23_45'])
    filtered3_covariate_sets_removed_feats[cov_set]['Motor'] = motor_related3_removed
    filtered3_covariate_sets_removed_feats[cov_set]['NUPDRS23_45'] = motor_related3_removed
    
    hybrid_related2_removed = set()
    hybrid_related3_removed = set()
    for outcome in outcomes:
        hybrid_related2_removed = hybrid_related2_removed.union(filtered2_covariate_sets_removed_feats[cov_set][outcome])
        hybrid_related3_removed = hybrid_related3_removed.union(filtered3_covariate_sets_removed_feats[cov_set][outcome])
    filtered2_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'] = hybrid_related2_removed
    filtered2_covariate_sets_removed_feats[cov_set]['MSEADLG_79'] = hybrid_related2_removed
    filtered3_covariate_sets_removed_feats[cov_set]['hybrid_requiremotor'] = hybrid_related3_removed
    filtered3_covariate_sets_removed_feats[cov_set]['MSEADLG_79'] = hybrid_related3_removed
        
for outcome in outcomes:
    for cov_set in all_covariate_sets.keys():
        filtered2_covariate_sets[cov_set][outcome] \
            = set(filtered2_covariate_sets[cov_set][outcome]).difference(filtered2_covariate_sets_removed_feats[cov_set][outcome])
        filtered3_covariate_sets[cov_set][outcome] \
            = set(filtered3_covariate_sets[cov_set][outcome]).difference(filtered3_covariate_sets_removed_feats[cov_set][outcome])
print(filtered2_covariate_sets_removed_feats)
print(filtered3_covariate_sets_removed_feats)

{'questions_plus_treatment_imaging_CSF_expanded_genetic': {'Autonomic': set(['SCAU7', 'SCAU16']), 'Cognitive': set(['MCAFDS', 'MCACLCKN', 'MCACLCKC', 'BJLOT11', 'MCAVIGIL', 'BJLOT15', 'BJLOT17', 'LNS2C', 'LNS2B', 'MCARHINO', 'LNS2A', 'BJLOT5', 'BJLOT7', 'LNS1C', 'LNS1B', 'MCADATE', 'BJLOT9', 'BJLOT13', 'MCABDS', 'BJLOT3', 'MCAALTTM']), 'Psychiatric': set(['TMTRWD', 'RAWHITE', 'GDSWRTLS', 'GDSSATIS', 'TMSEX', 'PhysExam_Psychiatric', 'ANXIOLYTIC', 'GDSHOPLS', 'CNTRLEAT', 'TMTORACT', 'NP1HALL', 'TMBUY', 'TMTMTACT', 'GDSEMPTY', 'CNTRLSEX', 'TMGAMBLE']), 'MOCA_25': set(['MCAFDS', 'MCACLCKN', 'MCACLCKC', 'BJLOT11', 'MCAVIGIL', 'BJLOT15', 'BJLOT17', 'LNS2C', 'LNS2B', 'MCARHINO', 'LNS2A', 'BJLOT5', 'BJLOT7', 'LNS1C', 'LNS1B', 'MCADATE', 'BJLOT9', 'BJLOT13', 'MCABDS', 'BJLOT3', 'MCAALTTM']), 'Sleep': set(['DRMOBJFL', 'STROKE', 'CNSOTH', 'DRMUMV', 'RLS', 'ESS6', 'SLEEP AID']), 'Motor': set(['PIGD', 'NP3RTALJ', 'NP2FREZ']), 'NUPDRS23_45': set(['PIGD', 'NP3RTALJ', 'NP2FREZ']), 'MSEADLG_79': set(['

In [6]:
with open('finalized_covariate_sets_filter2.pkl', 'w') as f:
    pickle.dump(filtered2_covariate_sets, f)
with open('finalized_covariate_sets_filter3.pkl', 'w') as f:
    pickle.dump(filtered3_covariate_sets, f)
with open('finalized_removed_feats_filter2.pkl', 'w') as f:
    pickle.dump(filtered2_covariate_sets_removed_feats, f)
with open('finalized_removed_feats_filter3.pkl', 'w') as f:
    pickle.dump(filtered3_covariate_sets_removed_feats, f)