In [1]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime
from datetime import timedelta

### Load Data

In [None]:
# Load the IHD, death and cancer tables, restrict deaths to non-cancer patients
# that appear in the IHD table, parse the date columns, and write the trimmed
# cancer table to disk.
#
# NOTE(review): the os.chdir() calls below use relative paths and are applied
# one after another, so each directory is resolved from wherever the previous
# call left the working directory, and re-running this cell starts from where
# the last run ended. Confirm this matches the intended folder layout.

# IHD DATA
DIR = "merged_files"
os.chdir(DIR)

ihd = pd.read_csv('ihd.csv')

# DEATH DATA
DIR = "original_files"
os.chdir(DIR)

death = pd.read_csv('deaths.csv', sep = '\t')
cancers = pd.read_csv('cancers.csv', sep = '\t')

# make sure deaths are only non-cancer
compare = cancers.e_patid
bools = death.e_patid.isin(compare)
death = death[~bools.values] # now you just have non cancer death

# also only keep deaths that appear in IHD dataframe
compare2 = ihd.e_patid
bools2 = death.e_patid.isin(compare2)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original table (avoids SettingWithCopyWarning)
death = death[bools2.values].copy()

# turn the deathdate column into datetime objects; values that do not match
# the format become NaT because of errors = 'coerce'
death['deathdate'] = pd.to_datetime(death['deathdate'], format = '%d%b%Y', errors = 'coerce')

# CANCER DATA
# The table is loaded above as `cancers`; `cancer` was previously used here
# without ever being assigned, which raises NameError on a fresh kernel.
# Work on a copy so the raw `cancers` frame stays as loaded.
cancer = cancers.copy()

# turn cancer diagnosis date into datetime objects (no errors = 'coerce' here,
# so a value that does not match the format raises)
cancer["diagdate"] = pd.to_datetime(cancer["diagdate"], format = "%d%b%Y")

# Create the total cancer dataset
DIR = "cleaned_files"
os.chdir(DIR)
cancer_total = cancer[["e_patid", "diagdate", "cancersite1"]]
cancer_total.to_csv("total_cancer.csv", index = False)

### Merge IHD, cancer, death data

In [None]:
# Join death and cancer information onto the IHD table, assign an outcome code
# to every row, drop rows whose death or diagnosis precedes the IHD observation
# date, and write the result to outcomes.csv.

# merge the data frames (left joins: every IHD row is kept)
outcomes_merged_total = pd.merge(ihd, death, on = 'e_patid', how = 'left')
outcomes_merged_total = pd.merge(outcomes_merged_total, cancer_total, on = 'e_patid', how = 'left')

# Convert obsdate, deathdate and diagdate to datetime objects
outcomes_merged_total['obsdate'] = pd.to_datetime(outcomes_merged_total['obsdate'])
outcomes_merged_total['deathdate'] = pd.to_datetime(outcomes_merged_total['deathdate'], errors = 'coerce')
outcomes_merged_total['diagdate'] = pd.to_datetime(outcomes_merged_total['diagdate'])

# Create a new column that is True when there is no deathdate or when
# deathdate is strictly after obsdate
# NOTE(review): a death on the same day as obsdate is flagged invalid here but
# is NOT dropped by the strict `<` filter further down - confirm which is intended.
outcomes_merged_total['is_valid'] = (outcomes_merged_total['deathdate'].isnull()) | (outcomes_merged_total['deathdate'] > outcomes_merged_total['obsdate'])

# report how many rows are invalid (a bare expression in the middle of a cell
# is not displayed, so print the count explicitly)
n_invalid = int((~outcomes_merged_total['is_valid']).sum())
print("rows failing the is_valid check:", n_invalid)

# create new column 'outcome'
outcomes_merged_total['outcome'] = np.nan

# update 'outcome' column according to the following conditions
# 0 - if no death or cancer info
# 1 - if has cancer info
# 2 - if has death info (and does not have cancer info)
has_death = pd.notna(outcomes_merged_total['deathdate'])
has_cancer = pd.notna(outcomes_merged_total['diagdate'])
outcomes_merged_total.loc[has_death & ~has_cancer, 'outcome'] = 2
outcomes_merged_total.loc[~has_death & ~has_cancer, 'outcome'] = 0
outcomes_merged_total.loc[has_cancer, 'outcome'] = 1

# mark rows for removal where diagdate occurs before obsdate
outcomes_merged_total.loc[outcomes_merged_total['diagdate'] < outcomes_merged_total['obsdate'], 'outcome'] = np.nan
# mark rows for removal where deathdate occurs before obsdate
outcomes_merged_total.loc[outcomes_merged_total['deathdate'] < outcomes_merged_total['obsdate'], 'outcome'] = np.nan

# e_patid values of the rows that are about to be dropped
na_ids_total = outcomes_merged_total[outcomes_merged_total.outcome.isna()].e_patid

# drop rows where outcome is NaN (it is only NaN if one of the filters above applied)
outcomes_merged_total = outcomes_merged_total.dropna(subset = ['outcome'])

DIR = 'cleaned_files'
# The load cell already ends with os.chdir("cleaned_files"); changing into the
# same relative directory a second time would fail (or nest) and makes this
# cell impossible to re-run. Only change directory when not already there.
if os.path.basename(os.getcwd()) != DIR:
    os.chdir(DIR)
outcomes_merged_total.to_csv('outcomes.csv')