In [1]:
import os
import sys
import yaml
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.insert(0, parent_dir)
os.chdir(parent_dir)
from src.utils import *
from src.imputation import MICE
from src.t_star import TStar 
from src.preprocessingPipeline import PreprocessingPipeline
from src.metrics import Metrics
from src.subpopulations import SubpopulationCreator
from src.coxphm import CoxProportionalHazardsModel
from src.subpopulations import SubpopulationCreator
from src.monte_carlo import Monte_Carlo_Cross_Validation

In [3]:
# Read the path configuration from paths.yaml (resolved against the current
# working directory, which the import cell changed to the repository root).
with open('paths.yaml', 'r') as paths_file:
    paths = yaml.safe_load(paths_file)

# Root folders for stored predictions and for imputed data sets.
path_prediction, path_data = paths['PATH_PREDICTION'], paths['PATH_IMPUTED']

In [4]:
# Selects the predictor/column configuration inside run_experiment:
# "set1" and "set2" have dedicated branches; any other value takes the fallback branch.
feature_set = 'set1'

In [5]:
def _mean_prediction_frame(directory, file_names):
    """Return the element-wise mean of the prediction CSVs in ``directory``.

    Every file is read with ``pd.read_csv(..., index_col=0)``; the frames are
    added with ``+`` and the sum is divided by the number of files.

    Args:
        directory: Path prefix (including the trailing separator) that is
            concatenated with each file name.
        file_names: Names of the CSV files to average.

    Raises:
        ValueError: If ``file_names`` is empty, since there is nothing to average.
    """
    if not file_names:
        raise ValueError('No prediction files found in ' + directory)

    total = None
    for file_name in file_names:
        frame = pd.read_csv(directory + file_name, index_col=0)
        total = frame if total is None else total + frame
    return total / len(file_names)


def run_experiment(t_star_date, txpt, ept, txpp, epp, 
                   plausible_values, pmm_value, monte_carlo_repetitions, proportion,
                   date_for_prediction, source_ID=None):
    """Run one full experiment for a single t* configuration and store its metrics.

    Steps, in order: build the ``TStar`` object, pick predictors/columns from the
    module-level ``feature_set``, preprocess, impute train and test data with
    ``MICE``, run the Monte Carlo cross-validation with a Cox proportional
    hazards model, then, for every repetition directory found under
    ``path_prediction + t_star.dir_name``, average the stored prediction files
    and hand them to ``save_metrics`` (once for the test set, once for the
    internal validation part of the training set).

    NOTE(review): this function reads the globals ``feature_set``,
    ``path_prediction`` and ``path_data`` defined in earlier cells, so those
    cells must have been executed first.

    Args:
        t_star_date, txpt, ept, txpp, epp: Forwarded unchanged to ``TStar`` and
            ``Metrics``.
        plausible_values, pmm_value: Forwarded to ``MICE``; also embedded in the
            result name.
        monte_carlo_repetitions: ``repetitions`` argument of
            ``Monte_Carlo_Cross_Validation``.
        proportion: ``internal_testing_proportion`` argument of
            ``Monte_Carlo_Cross_Validation``.
        date_for_prediction: Forwarded to ``save_metrics``.
        source_ID: Forwarded to ``TStar`` as ``indexes_from_ID``.

    Returns:
        None. Results are produced through the side effects of the called helpers.

    Raises:
        ValueError: If a repetition directory contains no prediction files.
    """
    t_star = TStar(t_star_date=t_star_date, txpt=txpt, 
                   ept=ept, txpp=txpp, epp=epp, 
                   load=False, indexes_from_ID=source_ID)

    # "set1" and "set2" use the same predictors; they differ in the extra
    # columns that are loaded and in the predictors dropped after preprocessing.
    shared_predictors = ['AGE_DON', 'AGE', 'CREAT_DON', 'DIAB', 'DIALYSIS_DATE', 'HCV_SEROSTATUS', 'COD_CAD_DON', 
                         'ETHCAT_DON', 'HGT_CM_DON_CALC', 'HIST_HYPERTENS_DON', 'NON_HRT_DON', 'DIABETES_DON', 'HLAMIS', 'ABO_MAT']

    if feature_set == "set1": 

        predictors = list(shared_predictors)

        columns = ['GTIME_KI', 'GSTATUS_KI','COMPOSITE_DEATH_DATE', 'TX_DATE', 'ETHCAT', 'GENDER'] + predictors

        predictors_to_remove = ['DIAB', 'COD_CAD_DON', 'COD_CAD_DON_3', 'COD_CAD_DON_4', 
                            'COD_CAD_DON_999', 'DIAB_2', 'DIAB_3', 'DIAB_4', 'DIAB_5', 
                            'ETHCAT_DON_4', 'ETHCAT_DON_5', 'ETHCAT_DON_6', 'ETHCAT_DON_7',
                            'ETHCAT_DON_9', 'ETHCAT_DON_998', 'HLAMIS_1', 'HLAMIS_2', 'HLAMIS_3', 'HLAMIS_4', 'HLAMIS_5', 
                            'HLAMIS_6', 'ABO_MAT_2', 'ABO_MAT_3', 'dialysis_days'] 

    elif feature_set == "set2": 
        predictors = list(shared_predictors)

        columns = ['GTIME_KI', 'GSTATUS_KI','COMPOSITE_DEATH_DATE', 'TX_DATE'] + predictors

        predictors_to_remove = ['DIAB', 'COD_CAD_DON', 'HLAMIS', 'ABO_MAT'] 

    else: 

        # Fallback configuration: wide column list, short predictor list.
        columns = ['GTIME_KI', 'GSTATUS_KI', 'AGE_DON', 'HCV_SEROSTATUS', 'GENDER_DON', 'AGE', 'DIAB',
                    'BMI_CALC', 'ETHCAT', 'GENDER', 'ABO_MAT', 'AMIS', 'BMIS', 'DRMIS', 'HLAMIS', 
                    'COLD_ISCH_KI', 'SERUM_CREAT', 'COMPOSITE_DEATH_DATE', 'DIABETES_DON',
                    'CREAT_TRR', 'HGT_CM_DON_CALC', 'HGT_CM_CALC', 'WGT_KG_CALC',
                    'COD_CAD_DON', 'WGT_KG_DON_CALC', 'CREAT_DON', 'ETHCAT_DON',
                    'HIST_HYPERTENS_DON', 'NON_HRT_DON', 'DIALYSIS_DATE','TX_DATE', 'PT_CODE', 'ABO', 'ON_DIALYSIS',
                    'DAYSWAIT_CHRON', 
                    'TOT_SERUM_ALBUM', 'PRI_PAYMENT_TCR_KI', 'PRE_TX_TXFUS', 'END_CPRA_DETAIL', 'PERIP_VASC', 
                    'BMI_DON_CALC', 'NPKID',
                    'HIST_CIG_DON', 'CMV_IGG', 'TXKID',
                    'DISTANCE',   'DIAG_KI', 'DIABDUR_DON'
                    ]
        predictors = ['AGE_DON', 'AGE', 'CREAT_DON', 'DIAB', 'DIALYSIS_DATE', 'HCV_SEROSTATUS', 'COD_CAD_DON']
        # Nothing is dropped in the fallback configuration. Without this
        # assignment the name would be unbound further down.
        predictors_to_remove = []

    t_star.set_predictors(predictors, columns)

    t_star.get_data(model_type='cphm')

    pipeline = PreprocessingPipeline(time_in_dialysis_from_date=True)

    df_train, df_test  = pipeline.run(t_star) 

    # Created before predictors are removed from t_star, as in the original order.
    sub_creator = SubpopulationCreator(t_star)

    if predictors_to_remove:
        t_star = pipeline.delete_predictor_from_t_star(t_star, predictors=predictors_to_remove)

        df_train = pipeline.delete_predictor_from_date_set(df_train, predictors=predictors_to_remove)

        df_test = pipeline.delete_predictor_from_date_set(df_test, predictors=predictors_to_remove)     

    # Strip every '.0' occurrence from the column names.
    df_train.columns = [col.replace('.0', '') for col in df_train.columns]
    df_test.columns = [col.replace('.0', '') for col in df_test.columns]

    impute_train = MICE(plausible_values, pmm_value, t_star, df_train, phase='Train')
    impute_train.run(delete_old_files=True)

    impute_test = MICE(plausible_values, pmm_value, t_star, df_test, phase = 'Test')
    impute_test.run()

    cph_python = CoxProportionalHazardsModel(environment="python", penalizer=0.0)

    mccv = Monte_Carlo_Cross_Validation(cph_python, t_star, repetitions=monte_carlo_repetitions, 
                                        combined_models=True, internal_testing_proportion=proportion)

    mccv.run()

    # Name prefix passed to save_metrics; encodes the experiment's hyper-parameters.
    t_star_short = (t_star.dir_name.replace('000000', '')+'_pv_'+
                    str(plausible_values)+'_pmm_'+
                    str(pmm_value)+'_mccv_'+
                    str(monte_carlo_repetitions) + '_proportion_' +
                    str(proportion))

    repetitions = os.listdir(path_prediction+t_star.dir_name+'/')

    for rep in repetitions:

        baseline_prediction_path_test = path_prediction+t_star.dir_name+'/'+rep+'/after_tstar/'

        prediction_test_files = os.listdir(baseline_prediction_path_test)

        baseline_prediction_path_train = path_prediction+t_star.dir_name+'/'+rep+'/'

        files_train = os.listdir(baseline_prediction_path_train)

        # 'after_tstar' is the sub-directory holding the test predictions, not a file to average.
        files_train.remove('after_tstar')

        # NOTE(review): os.listdir returns entries in arbitrary order, so if more
        # than one file matches, the [0] picks below are not deterministic.
        files_with_train = [f for f in os.listdir( path_data+t_star.dir_name) if 'Train' in f]
        files_with_test = [f for f in os.listdir( path_data+t_star.dir_name) if 'Test' in f]

        eth_train, eth_test       = sub_creator.create_eth()
        gender_train, gender_test = sub_creator.create_gender()

        metrics = Metrics(t_star_date=t_star_date, txpt=txpt, ept=ept, txpp=txpp,
                      epp=epp, imputed=False)

        df_test_im = pd.read_csv(path_data+t_star.dir_name+'/'+files_with_test[0])
        df_test_im.set_index('index', inplace=True)

        df_train_repetition = pd.read_csv(path_data+t_star.dir_name+'/'+files_with_train[0])
        df_train_repetition.set_index('index', inplace=True)

        # ---- test set: average all prediction files and store the metrics ----
        subpopulations = [eth_test, gender_test]
        local_path = 'notebooks/experiments/results/test/'
        t_star_short_local = t_star_short + '_rep_'+str(rep)+'_set_test'

        predictions = _mean_prediction_frame(baseline_prediction_path_test, prediction_test_files)

        save_metrics(predictions, local_path, t_star_short_local,
                        subpopulations, df_test_im, date_for_prediction, metrics)

        # ---- internal validation part of the training set ----
        local_path = 'notebooks/experiments/results/train/'
        idx = pd.read_csv('data/'+t_star.dir_name+'/'+str(rep)+'/test_idx.csv', index_col=1)              
        data_validation = df_train_repetition.loc[idx.index] 

        # Restrict the training sub-populations to the validation rows.
        # NOTE(review): x[x.index.drop_duplicates()] uses the unique index labels
        # as keys; this looks like it was meant to drop duplicated rows
        # (e.g. x[~x.index.duplicated()]) - confirm against the type returned
        # by SubpopulationCreator before changing it.
        eth_train = eth_train[eth_train.index.isin(data_validation.index)]
        eth_train = eth_train[eth_train.index.drop_duplicates()]

        gender_train = gender_train[gender_train.index.isin(data_validation.index)]
        gender_train = gender_train[gender_train.index.drop_duplicates()]

        subpopulations = [eth_train, gender_train]    

        t_star_short_local = t_star_short + '_rep_'+str(rep)+'_set_train'                  

        predictions = _mean_prediction_frame(baseline_prediction_path_train, files_train)

        save_metrics(predictions, local_path, t_star_short_local, 
                        subpopulations, data_validation, date_for_prediction,
                        metrics, train_df = df_train_repetition)
              

In [7]:
# Experiment grid: every t* date is combined with every (txpt, ept) pair;
# the remaining settings are the same for all runs.
epp = 6
date_for_prediction = int(365.25*5)  # == 1826; presumably a 5-year horizon in days - confirm

EXP = []
for date in ["31-12-2009", "31-12-2010", "31-12-2011", "31-12-2012", "31-12-2013"]:
    for (x, y) in [(1,6), (5,6), (3,9)]:
        # Tuple layout: (txpt, ept, txpp, epp, t_star_date, date_for_prediction,
        #                plausible_values, pmm_value, monte_carlo_repetitions, proportion)
        EXP.append((x, y, 1, epp, date, date_for_prediction, 10, 10, 40, 0.3))

# Run the experiments one after another, in grid order.
for experiment in EXP:
    (txpt, ept, txpp, epp, t_star_date, date_for_prediction,
     plausible_values, pmm_value, monte_carlo_repetitions, proportion) = experiment

    run_experiment(t_star_date=t_star_date, txpt=txpt, ept=ept, txpp=txpp, epp=epp,
                   plausible_values=plausible_values, pmm_value=pmm_value,
                   monte_carlo_repetitions=monte_carlo_repetitions,
                   proportion=proportion, date_for_prediction=date_for_prediction,
                   source_ID=None)

t_b:  2004-01-01 00:00:00
t_final_training_period:  2004-12-31 00:00:00
t_final_event_period:  2009-12-31 00:00:00
t_final_prediction_period:  2015-12-31 00:00:00
t_final_prediction_tx_period:  2010-12-31 00:00:00


FileNotFoundError: [Errno 2] No such file or directory: 'data/kidpan_filtered.csv'