In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime
from datetime import timedelta

# Combine Data

### Load data

In [None]:
# Load every cleaned input table and keep only the columns used downstream.
# All file names below are resolved relative to DIR, which becomes the
# working directory for the rest of the notebook.
DIR = "cleaned_files\\Closest"
os.chdir(DIR)

# Outcomes: the generic "outcome" column is renamed so it stays unambiguous
# once it is merged with the predictor tables.
# NOTE: only the all-cancer outcome table is loaded in this notebook.
# Colorectal- and lung-specific outcome tables would have to be read in
# before they could be renamed/subset, so they are not handled here.
total_outcomes = (
    pd.read_csv('outcomes.csv')
    .rename(columns={"outcome": "total_outcomes"})
    [["e_patid", "obsdate", "deathdate", "total_outcomes"]]
)

# Predictors (change file names as needed): each table is read and reduced
# to the patient id plus the predictor column(s) in a single step.
age = pd.read_csv('age_no_duplicates_merged.csv')[["e_patid", "yob", "baseline_age"]]
alcohol = pd.read_csv('alcohol_status_no_duplicates_merged_REDUCED.csv')[["e_patid", "alcohol_status"]]
antiplatelets = pd.read_csv('antiplatelets_no_duplicates_merged.csv')[["e_patid", "antiplatelets_use"]]
crp = pd.read_csv('crp_no_duplicates_merged.csv')[["e_patid", "crp value (mg/L)"]]
diabetes = pd.read_csv('diabetes_no_duplicates_merged.csv')[["e_patid", "diabetes_status"]]
ethnicity = pd.read_csv('ethnicity_no_duplicates_merged.csv')[["e_patid", "gen_ethnicity"]]
height = pd.read_csv('height_no_duplicates_merged.csv')[["e_patid", "height (cm)"]]
sex = pd.read_csv('sex_no_duplicates_merged.csv')[["e_patid", "gender_encoded"]]
smoking = pd.read_csv('smoking_status_no_duplicates_merged_REDUCED.csv')[["e_patid", "smoking_status"]]
townsend = pd.read_csv('townsend_no_duplicates_merged.csv')[["e_patid", "townsend"]]
weight = pd.read_csv('weight_no_duplicates_merged.csv')[["e_patid", "weight (kg)"]]

### Merge data to make the total cancer dataframe

In [None]:
# Build the all-cancer modelling table: inner-join the outcome table with
# every predictor table on the patient id, order the columns, apply the
# cohort exclusions, and save the result before any outlier clipping.
dfs_total = [total_outcomes, age, alcohol, antiplatelets, crp, diabetes, ethnicity, height, sex, smoking, townsend, weight]

# set the common column name
common_col = 'e_patid'

# Seed with the first table and merge the remaining ones in order.
# (Testing the accumulator with `.empty` would silently restart from the
# next table whenever an inner join produced no rows.)
merged_totalModel = dfs_total[0].copy()
for df in dfs_total[1:]:
    merged_totalModel = pd.merge(merged_totalModel, df, on = common_col)

# change the column order (id/date/age info columns first, then the
# predictors, outcome as the last column)
merged_totalModel = merged_totalModel[['e_patid', 'obsdate', 'deathdate', 'yob', 'baseline_age', 'gender_encoded', 'gen_ethnicity', 'townsend', 'height (cm)', 'weight (kg)', 'alcohol_status', 'smoking_status', 'diabetes_status', 'antiplatelets_use', 'crp value (mg/L)', 'total_outcomes']]

# drop rows where age is beyond the range of 45-80 years
# (rows with a missing baseline_age are kept, because both comparisons are
# False for NaN)
AGE_MIN, AGE_MAX = 45, 80
age_out_of_range = (merged_totalModel['baseline_age'] < AGE_MIN) | (merged_totalModel['baseline_age'] > AGE_MAX)
merged_totalModel = merged_totalModel[~age_out_of_range]

# drop rows where sex or Townsend is missing
merged_totalModel = merged_totalModel.dropna(subset=['gender_encoded', 'townsend'])

# save models pre-clipping outliers
# NOTE(review): this changes the working directory, so every relative path
# used after this point is resolved from inside "final".
DIR = "final"
os.chdir(DIR)

merged_totalModel.to_csv('merged_totalModel.csv', index = False)

# Clip Continuous Data

### Graph continuous variables to decide where to clip them

In [None]:
# HEIGHT - histogram of the recorded heights, used to choose clipping limits
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=merged_totalModel, x='height (cm)', ax=ax)
ax.set_title('Height Distribution')
ax.set_xlabel('Height (cm)')
ax.set_ylabel('Frequency')
# x-axis is pinned to the 0-1000 range
ax.set_xlim(0, 1000)

plt.show()

In [None]:
# HEIGHT - box plot of the non-missing height values
import matplotlib.pyplot as plt

# Rows without a height are removed first, so the boxplot only receives
# observed values
merged_totalModel_cleaned = merged_totalModel.dropna(subset=['height (cm)'])

# Tall, narrow canvas for a single vertical box
fig, ax = plt.subplots(figsize=(8, 10))
ax.boxplot(x='height (cm)', data=merged_totalModel_cleaned)

# Label the value axis; the x axis label is left blank
ax.set_xlabel('')
ax.set_ylabel('Height (cm)')
ax.set_title('Boxplot for Height')

plt.show()

In [None]:
# WEIGHT - histogram of the recorded weights, used to choose clipping limits
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=merged_totalModel, x='weight (kg)', ax=ax)
ax.set_title('Weight Distribution')
ax.set_xlabel('Weight (kg)')
ax.set_ylabel('Frequency')

plt.show()

In [None]:
# WEIGHT - box plot of the non-missing weight values
import matplotlib.pyplot as plt

# Rows without a weight are removed first, so the boxplot only receives
# observed values
merged_totalModel_cleaned = merged_totalModel.dropna(subset=['weight (kg)'])

# Tall, narrow canvas for a single vertical box
fig, ax = plt.subplots(figsize=(8, 10))
ax.boxplot(x='weight (kg)', data=merged_totalModel_cleaned)

# Label the value axis; the x axis label is left blank
ax.set_xlabel('')
ax.set_ylabel('Weight (kg)')
ax.set_title('Boxplot for Weight')

plt.show()

In [None]:
# CRP - histogram of the recorded CRP values, used to choose a clipping limit
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=merged_totalModel, x='crp value (mg/L)', ax=ax)
ax.set_title('CRP Distribution')
ax.set_xlabel('CRP Value (mg/L)')
ax.set_ylabel('Frequency')

plt.show()

### Clip continuous data

In [None]:
# Inspect the tails of one continuous variable: work on a copy of the merged
# table and collect the rows lying outside the 1st-99th percentile range.
# Change `var` to look at a different column.
var = "height (cm)"
model = merged_totalModel.copy()

# Percentiles are computed ignoring missing values
q1, q99 = np.nanpercentile(model[var], [1, 99])

# Rows with a missing value are never flagged (both comparisons are False)
below_q1 = model[var].lt(q1)
above_q99 = model[var].gt(q99)
outliers = model.loc[below_q1 | above_q99]

In [None]:
# Cap the continuous predictors at fixed limits and save the clipped table.
# Values beyond a limit are replaced by that limit; missing values are left
# as missing. A bound of None means that side is not clipped.
CLIP_BOUNDS = {
    'weight (kg)': (30, 140),        # clip weight at 30 and 140 kg
    'height (cm)': (120, 200),       # clip height at 120 and 200 cm
    'crp value (mg/L)': (None, 500)  # clip CRP at 500 (upper bound only)
}

for column, (lower, upper) in CLIP_BOUNDS.items():
    merged_totalModel[column] = merged_totalModel[column].clip(lower=lower, upper=upper)

# Write once, after every column has been clipped (the file is the same one
# each time, so intermediate writes would only be overwritten).
merged_totalModel.to_csv("final\\merged_totalModel_FINAL.csv", index = False)

# Add follow-up time and cancer diagnosis date

In [None]:
# Inputs for the follow-up calculation.

# Patient-level information on follow up in CPRD.
# NOTE(review): the path is blank here, so this call cannot succeed as
# written; the real file location has to be filled in.
patinfo = pd.read_table(
    ""
)

# Cancer information for the all-cancer outcome
total_cancerinfo = pd.read_csv(
    "total_cancer.csv"
)

#### cancer diagnosis date

In [None]:
# Attach the CPRD follow-up window (start/end) and the cancer diagnosis date
# to every patient row, parse them as dates, then order and rename the
# columns and save the table.

# final column layout (names before renaming)
column_order = ['e_patid', 'obsdate', 'diagdate', 'deathdate', 'start', 'end', 'yob', 'baseline_age', 'gender_encoded', 'gen_ethnicity', 'townsend', 'height (cm)', 'weight (kg)', 'alcohol_status', 'smoking_status', 'diabetes_status', 'antiplatelets_use', 'crp value (mg/L)', 'total_outcomes']

# NOTE(review): a left merge repeats a patient's row if the right-hand table
# holds more than one record for that e_patid - confirm both lookup tables
# are one row per patient.
merged_totalModel = (
    merged_totalModel
    .merge(patinfo[['e_patid', 'start', 'end']], on='e_patid', how='left')
    .merge(total_cancerinfo[['e_patid', 'diagdate']], on='e_patid', how='left')
    .assign(
        # start/end are parsed as day/month/year
        start=lambda frame: pd.to_datetime(frame['start'], format = "%d/%m/%Y"),
        end=lambda frame: pd.to_datetime(frame['end'], format = "%d/%m/%Y"),
        # diagdate is parsed without an explicit format
        diagdate=lambda frame: pd.to_datetime(frame['diagdate']),
    )
    .reindex(columns=column_order)
    # obsdate is the IHD diagnosis date, diagdate the cancer date,
    # start/end the CPRD entry/exit dates
    .rename(columns = {'obsdate':'ihd_diagdate', 'diagdate' : 'cancerdate', 'start' : 'cprd_enter', 'end' : 'cprd_exit'})
)

# save the model
merged_totalModel.to_csv("final\\merged_totalModel_FINAL.csv", index = False)

#### follow-up date

In [None]:
# Derive the end of follow-up, the follow-up time and the age at the end of
# follow-up, then keep rows with a non-negative follow-up time and save.
#
# followup_enddate depends on the outcome code:
#   0 -> cprd_exit, 1 -> cancerdate, 2 -> deathdate
# Values that cannot be parsed as dates become NaT (errors='coerce').
# followup_time is the difference between the two calendar YEARS (not the
# elapsed time between the two dates).
followup_columns = ['e_patid', 'ihd_diagdate', 'cancerdate', 'deathdate', 'cprd_enter', 'cprd_exit', 'yob',
                    'followup_enddate', 'followup_time', 'baseline_age', 'followup_age', 'gender_encoded', 'gen_ethnicity',
                    'townsend', 'height (cm)', 'weight (kg)', 'alcohol_status', 'smoking_status',
                    'diabetes_status', 'antiplatelets_use', 'crp value (mg/L)', 'total_outcomes']

merged_totalModel = (
    merged_totalModel
    .assign(
        # start from cprd_exit, then override for outcome 1 and outcome 2
        followup_enddate=lambda frame: pd.to_datetime(
            frame['cprd_exit']
            .where(~frame['total_outcomes'].eq(1), frame['cancerdate'])
            .where(~frame['total_outcomes'].eq(2), frame['deathdate']),
            errors = 'coerce',
        ),
        ihd_diagdate=lambda frame: pd.to_datetime(frame['ihd_diagdate'], errors = 'coerce'),
        followup_time=lambda frame: frame['followup_enddate'].dt.year - frame['ihd_diagdate'].dt.year,
        followup_age=lambda frame: frame['baseline_age'] + frame['followup_time'],
    )
    [followup_columns]
    # keep only observations where follow up time is zero or more (for some of
    # the censored data, cprd exit time seems to be before ihd diagnosis... I
    # think because BLOTTED cut off at Dec 31, 2019 but some IHDs are in
    # 2020/21); rows whose follow up time is missing are dropped as well
    .loc[lambda frame: frame['followup_time'] >= 0]
)

merged_totalModel.to_csv("final\\merged_totalModel_FINAL.csv", index = False)