In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime
from datetime import timedelta

### Load final data

In [None]:
# Read the finalised merged table from disk into a DataFrame.
final_dataset_path = "final\\merged_totalModel_FINAL.csv"
merged_totalModel = pd.read_csv(final_dataset_path)

### 0. For imputation

In [None]:
# Build the imputation input: integer-encode ethnicity, keep only the columns
# used for imputation (in a fixed order), cast whole-number columns to int64
# and write the result to CSV.

# Integer-encode ethnicity through a pandas categorical.
# NOTE(review): `.cat.codes` is -1 for missing values, so a missing ethnicity
# is written as -1 rather than as an empty cell -- confirm that this is what
# the imputation step expects.
ethnicity_categorical = merged_totalModel["gen_ethnicity"].astype('category')
merged_totalModel["gen_ethnicity"] = ethnicity_categorical
merged_totalModel["ethnicity_encoded"] = ethnicity_categorical.cat.codes

# Lookup table: encoded value -> original ethnicity label.
ethnicity_codes_totalModel = dict(enumerate(ethnicity_categorical.cat.categories))

# Keep only the imputation columns; the raw ethnicity label is dropped in
# favour of its encoded version.
imputation_columns = [
    'e_patid', 'baseline_age', 'followup_age', 'followup_time',
    'gender_encoded', 'ethnicity_encoded', 'townsend',
    'height (cm)', 'weight (kg)', 'alcohol_status', 'smoking_status',
    'diabetes_status', 'antiplatelets_use', 'crp value (mg/L)',
    'total_outcomes',
]
merged_totalModel = merged_totalModel[imputation_columns]

# Columns that hold whole numbers but were read as floats: cast to int64.
whole_number_columns = ('gender_encoded', 'townsend', 'alcohol_status',
                        'smoking_status', 'total_outcomes')
for whole_number_column in whole_number_columns:
    merged_totalModel[whole_number_column] = merged_totalModel[whole_number_column].astype(np.int64)

# Save the imputation input (row index is not written).
merged_totalModel.to_csv('final\\0. Imputation\\imputation_totalModel.csv', index=False)

### 1. Create final versions for Aim 1: comparing FG performance in Netherlands vs. UK population

In [None]:
# No extra processing here: the Aim 1 dataset is the output of the imputation step.

### 2. Create final versions for Aim 2: comparing stats to ML

### 2a. FG to compare to ML

##### a. change outcomes to reflect what happens to the patient at the 10 year follow up time point
##### b. alter follow up time and follow up age to reflect the new 10 year time frame being undertaken 
##### c. split data into training and testing (do cases and controls separately and then merge to maintain equal case-control balance across train-test)

In [None]:
### Attach ihd_diagdate, cancerdate and deathdate so that the 10-year outcome
### and the 10-year follow-up age/time can be derived per patient
event_dates = adding_ft_total[['e_patid', 'ihd_diagdate', 'cancerdate', 'deathdate']]
merged_totalModel = merged_totalModel.merge(event_dates, on='e_patid', how='left')

# Map dotted column labels back to the labels used in the rest of this
# notebook; labels that are not present are simply left untouched.
dotted_to_original = {
    "height..cm.": "height (cm)",
    "weight..kg.": "weight (kg)",
    "crp.value..mg.L.": "crp value (mg/L)",
}
merged_totalModel = merged_totalModel.rename(columns=dotted_to_original)

# Put the identifier and the three event dates first, then the model columns.
columns_with_dates = [
    'e_patid', 'ihd_diagdate', 'cancerdate', 'deathdate',
    'baseline_age', 'followup_age', 'followup_time',
    'gender_encoded', 'ethnicity_encoded', 'townsend',
    'height (cm)', 'weight (kg)', 'alcohol_status', 'smoking_status',
    'diabetes_status', 'antiplatelets_use', 'crp value (mg/L)',
    'total_outcomes',
]
merged_totalModel = merged_totalModel[columns_with_dates]



### Change outcomes to reflect the 10-year outcome
# Parse the event-date columns as datetimes (a no-op if they already are).
for date_column in ('ihd_diagdate', 'cancerdate', 'deathdate'):
    merged_totalModel[date_column] = pd.to_datetime(merged_totalModel[date_column])

# Length of the outcome window after ihd_diagdate. A calendar offset is used
# rather than timedelta(days=365 * 10): 3650 days is 2-3 days short of ten
# calendar years (leap days), which misclassifies events in the last days of
# the window and gives censored patients diagnosed in early January a
# follow-up of 9 instead of 10 calendar years.
outcome_duration = pd.DateOffset(years=10)
window_end = merged_totalModel['ihd_diagdate'] + outcome_duration

# Comparisons against a missing date (NaT) evaluate to False, so patients
# without a cancer/death date never match these masks.
cancer_in_window = merged_totalModel['cancerdate'] <= window_end
death_in_window = merged_totalModel['deathdate'] <= window_end

# Outcome coding: 0 = neither event inside the window, 1 = cancer inside the
# window, 2 = death inside the window without a qualifying cancer.
merged_totalModel['total_outcomes'] = 0
merged_totalModel.loc[cancer_in_window, 'total_outcomes'] = 1
merged_totalModel.loc[death_in_window & ~cancer_in_window, 'total_outcomes'] = 2



### Change follow-up time and follow-up age to reflect the new 10-year timeline
# Follow-up ends at the end of the window for outcome 0, at cancerdate for
# outcome 1 and at deathdate for outcome 2.
is_cancer = merged_totalModel['total_outcomes'] == 1
is_death = merged_totalModel['total_outcomes'] == 2
merged_totalModel['followup_enddate'] = window_end
merged_totalModel.loc[is_cancer, 'followup_enddate'] = merged_totalModel.loc[is_cancer, 'cancerdate']
merged_totalModel.loc[is_death, 'followup_enddate'] = merged_totalModel.loc[is_death, 'deathdate']

# Follow-up time is the difference between the calendar years of the end date
# and of the diagnosis date.
# NOTE(review): this is a whole-calendar-year difference, not elapsed time
# (e.g. December -> following January counts as 1, same-year events as 0) --
# confirm this resolution is intended for the downstream models.
merged_totalModel['followup_time'] = (merged_totalModel['followup_enddate'].dt.year
                                      - merged_totalModel['ihd_diagdate'].dt.year)

# Follow-up age is the baseline age plus the follow-up time.
merged_totalModel['followup_age'] = merged_totalModel['baseline_age'] + merged_totalModel['followup_time']

# Keep only the model columns (the date columns and followup_enddate are
# dropped), in the standard order.
merged_totalModel = merged_totalModel[['e_patid', 'baseline_age', 'followup_age', 'followup_time', 'gender_encoded',
                     'ethnicity_encoded', 'townsend', 'height (cm)', 'weight (kg)', 'alcohol_status',
                     'smoking_status', 'diabetes_status', 'antiplatelets_use', 'crp value (mg/L)', 'total_outcomes']]



### Perform a 90-10 train/test split, separately for cases and controls
# Fixed seed so that the saved training/testing files can be regenerated
# exactly; without it every run produces a different split.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9

# Cases have outcome 1; controls are everything coded 0 or 2.
merged_totalModel_cases = merged_totalModel[merged_totalModel["total_outcomes"] == 1]
merged_totalModel_controls = merged_totalModel[merged_totalModel["total_outcomes"].isin([0, 2])]

# Randomly draw 90% of each group for training; the rows that were not drawn
# form the 10% testing set. Dropping by index label relies on the index
# labels being unique.
merged_totalModel_casesTraining = merged_totalModel_cases.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
merged_totalModel_casesTesting = merged_totalModel_cases.drop(merged_totalModel_casesTraining.index)
merged_totalModel_controlsTraining = merged_totalModel_controls.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
merged_totalModel_controlsTesting = merged_totalModel_controls.drop(merged_totalModel_controlsTraining.index)

# Stack cases and controls back together within each split, so both splits
# keep the same case/control proportions.
merged_totalModel_Training = pd.concat([merged_totalModel_casesTraining, merged_totalModel_controlsTraining])
merged_totalModel_Testing = pd.concat([merged_totalModel_casesTesting, merged_totalModel_controlsTesting])


### Save the files!
merged_totalModel_Training.to_csv('final\\2a. Compare FG to ML\\FGTraining_totalModel.csv', index=False)
merged_totalModel_Testing.to_csv('final\\2a. Compare FG to ML\\FGTesting_totalModel.csv', index=False)

### 2b. Cox (to compare competing risks), LR, and xGB 

###### cut data at 10 years, binary outcome, reuse the 90-10 train/test split created in 2a

In [None]:
# TOTAL MODEL
def _relabel_outcome_two_as_zero(frame):
    """Set `total_outcomes` to 0 wherever it equals 2 (in place) and return the frame."""
    frame.loc[frame['total_outcomes'] == 2, 'total_outcomes'] = 0
    return frame


# Start from the 10-year cut-off FG files and collapse outcome 2 into 0, so the
# outcome becomes binary: 0 (no cancer) and 1 (cancer).
CoxMLTraining_totalModel = _relabel_outcome_two_as_zero(
    pd.read_csv('final\\2a. Compare FG to ML\\FGTraining_totalModel.csv')
)
CoxMLTesting_totalModel = _relabel_outcome_two_as_zero(
    pd.read_csv('final\\2a. Compare FG to ML\\FGTesting_totalModel.csv')
)

# Write the binary-outcome training and testing sets (row index not written).
CoxMLTraining_totalModel.to_csv('final\\2b. Compare Cox, ML\\CoxMLTraining_totalModel.csv', index=False)
CoxMLTesting_totalModel.to_csv('final\\2b. Compare Cox, ML\\CoxMLTesting_totalModel.csv', index=False)