In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime
from datetime import timedelta

### Limit all predictor data to patients who have IHD

In [None]:
# Restrict every raw predictor file to the patients present in the outcomes
# file and write the filtered copies to `outputdir`.
#
# NOTE(review): the DIR values look like placeholders ("change to DIR where ...").
# If they are left as relative paths, each os.chdir is resolved against the
# directory selected by the previous one - confirm the intended layout.

# Load files
DIR = "cleaned_files"
os.chdir(DIR)
outcome = pd.read_csv("outcomes.csv")
# get IHD patients IDs as the comparator (IHD pts in outcomes file)
compare = outcome.e_patid

DIR = "merged_files" # change to DIR where all the predictor files are stored
# FIX: os.chdir() was called without an argument, which raises TypeError.
os.chdir(DIR)
age_gender = pd.read_csv("age_gender.csv")
crp = pd.read_csv("crp.csv")
diabetes = pd.read_csv("diabetes.csv")
height = pd.read_csv("height.csv", sep = "\t")
weight = pd.read_csv("weight.csv", sep = "\t")
alcohol = pd.read_csv("alcohol.csv")

# exist_ok avoids the separate existence check (and the race it implies)
outputdir = "cleaned_files"
os.makedirs(outputdir, exist_ok = True)

# get the files you want to work with (the two lists are parallel)
files = [age_gender, height, weight, crp, diabetes, alcohol]
file_names = ["age_gender", "height", "weight", "crp", "diabetes", "alcohol"]

# loop through each file and keep only the rows belonging to IHD patients
for name, in_data in zip(file_names, files):
    bools = in_data.e_patid.isin(compare) # True for rows whose patient is in the outcomes file
    out_data = in_data[bools.values] # keep only rows with True
    # os.path.join instead of a hard-coded "\\" so the path is valid on every OS
    # (on Windows it yields the same backslash-separated path as before)
    out_data.to_csv(os.path.join(outputdir, name + ".csv"), index = False)

In [None]:
# Smoking, CRP, Antiplatelets done in separate file (see bottom of this file for more detail)

### Now go through predictors and clean the data

In [None]:
# Re-read the IHD-only predictor files written by the filtering cell.
# The files are read in the order listed and bound to the matching names.
DIR = "cleaned_files"
os.chdir(DIR)
age_gender, crp, diabetes, height, weight, alcohol = (
    pd.read_csv(stem + ".csv")
    for stem in ("age_gender", "crp", "diabetes", "height", "weight", "alcohol")
)

#### age

In [None]:
#### AGE ####
# Derive each patient's age at the outcome date (obsdate) from the year of birth.

# columns of age_gender needed for the age calculation (independent copy)
age = age_gender.loc[:, ["e_patid", "yob", "start", "end"]].copy()

age_merged = (
    # one row per outcome row, with the patient's age columns attached
    outcome.merge(age, how = "left", on = "e_patid")
    # drop irrelevant columns
    .loc[:, ["e_patid", "obsdate", "yob", "start", "end"]]
    # parse dates; a bare year parses to 1 January of that year
    .assign(
        obsdate = lambda frame: pd.to_datetime(frame["obsdate"]),
        yob = lambda frame: pd.to_datetime(frame["yob"], format = "%Y"),
    )
    # age at baseline = calendar-year difference (month/day are not considered)
    .assign(baseline_age = lambda frame: frame["obsdate"].dt.year - frame["yob"].dt.year)
)

# save the file (the DataFrame index is written as the first column)
DIR = "cleaned_files" # specify directory as desired
os.chdir(DIR)
age_merged.to_csv("age.csv")

#### gender

In [None]:
#### GENDER ####
# Encode gender numerically: "M" -> 0, "F" -> 1, "I" and anything else -> NaN.

# only the patient ID and gender are needed here (age is handled in its own cell)
gender = age_gender.loc[:, ["e_patid", "gender"]].copy()

# one boolean mask per recognised code, in the same order as `values`
conditions = [gender["gender"].eq(code) for code in ("M", "F", "I")]
values = [0, 1, np.nan]

# the first matching condition wins; unmatched rows receive the default (NaN)
gender["gender_encoded"] = np.select(conditions, values, default = np.nan)

# attach the encoding to every outcome row and keep only the gender columns
gender_merged = outcome.merge(gender, how = "left", on = "e_patid")
gender_merged = gender_merged.loc[:, ["e_patid", "gender", "gender_encoded"]]

# save the merged gender data frame (the DataFrame index is written as well)
DIR = "cleaned_files" # change directory as needed
os.chdir(DIR)
gender_merged.to_csv("gender.csv")

#### diabetes

In [None]:
#### DIABETES ####
# Flag, for every outcome row, whether the joined diabetes record is dated on
# or before the outcome date (obsdate).
# The left merge yields one row per (outcome row, diabetes record) pair, so a
# patient with several diabetes records appears on several rows here.

# the diabetes file's own date column is renamed so it does not clash with
# outcome's obsdate in the merge (non-inplace, so re-running the cell is safe)
diabetes = diabetes.rename(columns = {"obsdate": "measuredate"})

# every row of the diabetes file is a diabetes record
diabetes['diabetes_status'] = 1

# left merge: outcome rows without a diabetes record get NaN, which becomes 0
merged_diabetes = pd.merge(outcome, diabetes, on = "e_patid", how = "left")
merged_diabetes.diabetes_status = merged_diabetes.diabetes_status.replace(np.nan, 0)

# drop unnecessary columns (.copy() so the assignments below write to an
# independent frame instead of a slice of the merge result)
merged_diabetes = merged_diabetes[["e_patid", "obsdate", "measuredate", "diabetes_status"]].copy()

# parse both dates; unparseable/missing diabetes dates become NaT
merged_diabetes['obsdate'] = pd.to_datetime(merged_diabetes['obsdate'])
merged_diabetes['measuredate'] = pd.to_datetime(merged_diabetes['measuredate'], errors = 'coerce')

# only mark 1 for records dated on or before the study index date.
# Comparisons with NaT are False, so rows without a valid date end up 0.
merged_diabetes['diabetes_status'] = ((merged_diabetes['diabetes_status'] == 1) &
                                      (merged_diabetes['measuredate'] <= merged_diabetes['obsdate'])).astype(int)

# FIX: a follow-up statement used to reset diabetes_status to 0 wherever
# `measuredate <= measuredate` failed - a typo for `obsdate`. Because the
# assignment above already yields exactly 0/1 with the intended rule, that
# statement could never change a value and has been removed.

DIR = 'cleaned_files'
os.chdir(DIR)
merged_diabetes.to_csv('diabetes.csv')

#### weight

In [None]:
#### WEIGHT ####
# Convert every weight measurement to kilograms; measurements recorded in a
# unit that cannot be converted are set to NaN.

# check what kind of invalid date formats there are --> looks like it is just nans
# (kept for manual inspection; not used further in this cell)
not_datetime_weight = pd.to_datetime(weight["measuredate"], errors = 'coerce', format = '%d%b%Y').isna()
not_datetime_weight = weight.measuredate[not_datetime_weight].unique()

weight["measuredate"] = pd.to_datetime(weight["measuredate"], errors = 'coerce', format = '%d%b%Y')
weight = weight.sort_values(["e_patid", "measuredate"])

# get units table from dataset (text file with measurement units of the predictor)
units_id = pd.read_table("NumUnit.txt")

# then link weight and units_id based on the numunitid
weight_units_merged = pd.merge(weight, units_id, on = 'numunitid', how = 'left')

# next, add a column to weight with the numunitid description.
# FIX: `weight` was sorted above, so its index is a permutation of the original
# row labels, while the merge result carries a fresh 0..n-1 index in sorted
# order. Assigning the Series directly aligned on those labels and attached
# descriptions to the wrong rows. Assign by position instead; pandas raises if
# the merge changed the row count (e.g. duplicate numunitid in NumUnit.txt).
weight['unitdescription'] = weight_units_merged['Description'].values

# frequency of each unit description; labels 0..k follow the sorted order of
# the descriptions and are what the hard-coded drop list below refers to
unit_counts_weight = weight.groupby('unitdescription').size().reset_index(name = 'counts')
unit_counts_weight = unit_counts_weight.sort_values(by = 'counts', ascending = False)

# "drop" values whose units cannot convert (will set them to NaN later)
# NOTE(review): these labels are specific to the dataset this was developed
# on; re-check them whenever the input data or NumUnit.txt changes.
most_freq_weight_units = unit_counts_weight.drop(index = [1, 2, 28, 15, 13, 0, 5, 9, 6, 4, 7, 18, 3, 8, 10, 26, 27, 22, 25])
# take note of the units being dropped (aka set to NaN)
#1 (Unknown)
#2 /kg(body wt)
#28 mmol/L 0 %
#15 Unk UoM
#13 O/E - weight NOS
#5 100
#7 160
#6 114
#4 0639
#8 255
#9 7
#18 cm
#3 /min
#27 mmHg
#10 8CAL
#26 mm/Hg
#25 metres
#22 kg/m2

# specify the conversion rates (multiplier to kg) for the units that can be converted
conversion_rates_weight = {'kg': 1,
                    'Kgs': 1,
                    'kilograms': 1,
                    'decimal stones': 6.35029,
                    'Kilos': 1,
                    'stone': 6.35029,
                    'st': 6.35029,
                    'Weight in Kg': 1,
                    'kg.': 1,
                    # FIX: was 5, which multiplied values already in kilos by five
                    'WEIGHT IN KILOS': 1,
                    'lb': 0.453592,
                    'Stones': 6.35029}

# create a boolean mask for the units that were kept
common_units_weight = weight['unitdescription'].isin(most_freq_weight_units['unitdescription'])

# convert the kept units to kg; a kept unit that has no entry in the
# conversion table maps to NaN and therefore yields a NaN value
weight.loc[common_units_weight, 'value'] *= weight.loc[common_units_weight, 'unitdescription'].map(conversion_rates_weight)
weight.loc[common_units_weight, 'unitdescription'] = 'kg'

# mark missing values for other units
weight.loc[~common_units_weight, 'value'] = np.nan

# create new column for weight (kg)
weight["weight (kg)"] = weight["value"]

# save file
DIR = "cleaned_files"
os.chdir(DIR)
weight.to_csv('weight.csv', index = False)

# NOW NEED TO DROP DUPLICATES --> new file (see Remove Duplicates folder)

#### height

In [None]:
#### HEIGHT ####
# Convert every height measurement to centimetres; measurements recorded in a
# unit that cannot be converted are set to NaN.

# get units table (text file with measurement units of the predictor)
units_id = pd.read_table("NumUnit.txt")

# link height and units_id based on the numunitid
height_units_merged = pd.merge(height, units_id, on = 'numunitid', how = 'left')

# next, add a column to height data with the numunitid description
# NOTE: this assignment aligns on index labels. It is done BEFORE the sort
# below, so `height` still carries the index it was loaded with.
# Assumes that index is the default 0..n-1 and that each numunitid occurs once
# in NumUnit.txt (so the merge keeps the row count) - TODO confirm.
height['unitdescription'] = height_units_merged['Description']

# check what kind of invalid date formats there are --> looks like it is just nans
# (kept for manual inspection; not used further in this cell)
not_datetime_height = pd.to_datetime(height["measuredate"], errors = 'coerce', format = '%d%b%Y').isna()
not_datetime_height = height.measuredate[not_datetime_height].unique()

# change height dates into datetime object (unparseable dates become NaT)
height["measuredate"] = pd.to_datetime(height["measuredate"], errors = 'coerce', format = '%d%b%Y')
height = height.sort_values(["e_patid", "measuredate"])

# look at the unique height units (inspection only)
see_unique = height.unitdescription.unique()

# look at most frequently used units
# after reset_index the labels 0..k follow the sorted order of the unit
# descriptions; the hard-coded drop list below refers to these labels
unit_counts_height = height.groupby('unitdescription').size().reset_index(name = 'counts')
unit_counts_height = unit_counts_height.sort_values(by = 'counts', ascending = False)

# "drop" units that cannot be converted (set them to NaN later on)
# NOTE(review): the labels are specific to the dataset this was developed on;
# re-check them whenever the input data or NumUnit.txt changes.
most_freq_height_units = unit_counts_height.drop(index = [0, 17, 7, 6, 1, 5, 12, 3, 2, 16, 11, 4])
# note the units that are "dropped"
#0 /min 
#17 mmHg 
#7 O/E-height 10-20% over average 
#6 O/E -height within 10% average 
#1 100 
#5 O/E - loss of height 
#12 kg/m2 
#3 160 
#2 114 
#16 mm/Hg 
#11 kg
#4 Kgs 

# create conversion rate table for height (multiplier to centimetres)
conversion_rates_height = {'cm': 1,
                    'm': 100,
                    'cms': 1,
                    'metres': 100,
                    'mm': 0.1,
                    'ft': 30.48} 

# create a boolean mask for the units that were kept
common_units_height = height['unitdescription'].isin(most_freq_height_units['unitdescription'])

# convert the kept units to cm; a kept unit with no entry in the conversion
# table maps to NaN and therefore yields a NaN value
height.loc[common_units_height, 'value'] *= height.loc[common_units_height, 'unitdescription'].map(conversion_rates_height)
height.loc[common_units_height, 'unitdescription'] = 'cm'

# mark missing values for other units
height.loc[~common_units_height, 'value'] = np.nan

# create new column for height (cm)
height["height (cm)"] = height["value"]

# save file
DIR = "cleaned_files"
os.chdir(DIR)
height.to_csv('height.csv', index = False)

# NOW NEED TO DROP DUPLICATES --> new file (see Remove Duplicates folder)

#### alcohol

In [None]:
#### ALCOHOL ####
# Parse dates, attach the unit description to every alcohol record and build
# the list of unit descriptions that can be converted to units per week.

# first check what the invalid dates are (e.g., nan, year is something ridiculous, etc.)
not_datetime = pd.to_datetime(alcohol["measuredate"], format="%d%b%Y", errors='coerce').isna() # get invalid dates
# stored (rather than left as a bare expression, which displays nothing in the
# middle of a cell) so the invalid raw values can be inspected afterwards
invalid_alcohol_dates = alcohol.measuredate[not_datetime].unique()

# turn the date column into a datetime object (unparseable dates become NaT)
alcohol["measuredate"] = pd.to_datetime(alcohol["measuredate"], format = "%d%b%Y", errors = "coerce")
alcohol = alcohol.sort_values(["e_patid", "measuredate"]) # sort by patient ID and measurement date

# get units table (text file with measurement units of the predictor)
units_id = pd.read_table("NumUnit.txt")

# then link alcohol and units_id based on the numunitid
alcohol_units_merged = pd.merge(alcohol, units_id, on = 'numunitid', how = 'left')

# next, add a column to alcohol with the numunitid description.
# FIX: `alcohol` was sorted above, so its index is a permutation of the
# original row labels, while the merge result carries a fresh 0..n-1 index in
# sorted order. Assigning the Series directly aligned on those labels and
# attached descriptions to the wrong rows. Assign by position instead; pandas
# raises if the merge changed the row count (e.g. duplicate numunitid values).
alcohol['unitdescription'] = alcohol_units_merged['Description'].values

# convert value to be in units per week
# classify each patient as "0" (0 units per week), "1" (between 0 and 10), "2" (greater than 10)

# look at the different units; labels 0..k follow the sorted order of the
# descriptions and are what the hard-coded drop list below refers to
unit_counts = alcohol.groupby('unitdescription').size().reset_index(name='counts')
unit_counts = unit_counts.sort_values(by='counts', ascending = False)
# NOTE: this is an alias, not a copy - the in-place drop below also shrinks unit_counts
most_frequent_units = unit_counts
# after manually looking through the data, I know I have to deal with units, Unit, Occasional, and 0 separately

# for now, ignore the rows that I cannot convert like units, Unit, 0, Occasional
# NOTE(review): the labels are specific to the dataset this was developed on
# (a few appear twice, which is harmless); re-check them when the data changes.
most_frequent_units.drop(index = [
    1, 292, 286, 125, 285, 116, 235, 290, 281, 282, 289, 288, 287, 213, 10, 20,
    48, 25, 253, 53, 31, 95, 33, 105, 100, 66, 4, 76, 343, 63, 72, 36, 128, 262,
    38, 122, 88, 59, 117, 118, 110, 156, 132, 42, 147, 331, 218, 254, 93, 101,
    60, 121, 123, 16, 57, 62, 61, 130, 247, 248, 249, 250, 251, 252, 258, 74,
    260, 73, 291, 295, 293, 37, 32, 20, 28, 27, 23, 22, 15, 336, 337, 340, 5, 3,
    39, 40, 242, 296, 298, 299, 55, 303, 304, 52, 51, 46, 44, 43, 41, 243, 241,
    166, 103, 102, 107, 108, 126, 140, 124, 114, 113, 112, 97, 240, 219, 220,
    221, 22, 223, 225, 226, 83, 229, 216, 81, 237, 239, 215, 87, 91, 91, 90,
    201, 203, 204, 205, 206, 209, 86, 0], inplace = True)
most_common_units = most_frequent_units.unitdescription # descriptions for the most frequent units that I can convert
most_common_units = most_common_units.to_frame()

# Some records describe drinking frequency in words instead of giving a unit.
# Each such description is replaced by a fixed units-per-week value (5 for
# "occasional"-style wording, 1 for rarer drinking, as chosen by the original
# author) and the row is relabelled as 'units per week'.
#
# FIX: seven of the original assignments used `==` instead of `=` (and four of
# those also omitted the 'value' column), so they only evaluated a comparison
# and never stored the value: '4 times a year.', 'U/occasionally', 'Rarely',
# 'every 3 weeks social', 'on special occas', 'on occasions' and
# 'occasionally only'. Driving every description from one table removes that
# class of typo.
qualitative_units_per_week = {
    'Occasional': 5,
    '4 times a year.': 1,
    'occasionally': 5,
    'U/occasionally': 5,
    'Rarely': 1,
    'every 3 weeks social': 1,
    'on special occas': 1,
    'on occasions': 1,
    'occasionally only': 5,
    'on xmas day': 1,
    'once a year': 1,
    'once in a while': 1,
    'only on special oc': 1,
    'less than monthly': 5,
    'less then monthly': 5,
    'not often': 5,
    'occa': 5,
    'occasionallly': 5,
    'occasional drinker': 5,
    ' rare occassions': 5,
    'socially not weekly': 5,
    'sundays': 5,
    'U/ occasionally': 5,
    'occ': 5,
    'U/occasional': 5,
    'special occasions': 5,
    '0cc': 5,
}

for description, units_per_week in qualitative_units_per_week.items():
    # compute the mask once, before the description itself is overwritten
    has_description = alcohol['unitdescription'] == description
    alcohol.loc[has_description, 'value'] = units_per_week
    alcohol.loc[has_description, 'unitdescription'] = 'units per week'


# create a conversion rates table to convert the most frequent units to 
# units/week
# a lot of the most frequent units are a variation on units/week so no
# point converting those (e.g., U/week, units/wk, etc.)
#
# Each key is a raw unit description; each value is the factor the recorded
# value is multiplied by to express it in units per week.
# Conventions visible in the table: per-day units use 7, per-month 1/4,
# per-fortnight 1/2, per-year 1/52.
# Keys that contain a number (e.g. '14U/week', 'U/week56') are mapped to 1, so
# the number embedded in the description is not used - only the recorded value.
# NOTE(review): the drink-based factors (pints, glasses, cans, bottles)
# presumably encode an assumed number of units per drink times the frequency -
# confirm against the study protocol.
# NOTE(review): 'U/bi-monthly' is mapped to 1/2 (i.e. treated as fortnightly)
# while 'x 2 monthly' / 'U/2monthly' use 1/8 - confirm which is intended.

conversion_rates = {'units/week' :1,  
'U/week':1,
'units':1, 
'units/wk':1, 
'units per week':1, 
'units}/wk':1,
'/week':1,
'units / week':1,
' /wk':1,
'Unit':1,
'units/day': 7,
'U/wk': 1,
'Unt/Wk': 1,
'upw': 1,
'unit/wk': 1,
'pw': 1, 
'U/month': 1/4,
'/day':7,
'units per wk': 1,
'units/weeks': 1,
'per week': 1,
'U/monthly':1/4,
'u/day':7,
'month':1/4, 
'un/pw':1,
'U/year':1/52, 
'monthly':1/4, 
'units/w':1, 
'Units/ week':1, 
'units /week':1, 
'u/pw':1, 
'week':1, 
'U/fortnight':1/2,
'yearly':1/52, 
'day':7, 
'0U/week':1, 
'U/DAILY':7, 
'Alc Units/wk':1, 
'U/yearly':1/52, 
'units p.d.':7,
'2 U/week':1, 
'1 U/week':1,
'Alcohol Units Per Week':1, 
'fortnightly':1/2, 
'U/fortnightly':1/2,
'U/months':1/4, 
'U/monthy':1/4, 
'0 U/week':1, 
'year':1/52, 
'14U/week':1, 
'x 2 monthly':1/8, 
'iu/wk':1, 
'per month':1/4, 
'U/6 months':1/24, 
'U/mth':1/4, 
'0/week':1, 
'units/weekly':1, 
'weeks':1,
'8U/week':1, 
'u/weekly':1,
'units pw':1, 
'1U/week':1, 
'10 U/week':1,
'fortnight':1/2, 
'units p/wk':1, 
'umonth':1/4, 
'unit/week':1, 
'Units//week':1, 
'U/6 monthly':1/24, 
'U/2week':1/2, 
'weekly':1, 
'x 2 month':1/8, 
'U/6months':1/24, 
'amonth': 1/4, 
'U/night':7, 
'U/week6':1, 
'U/week0':1, 
'every 3 weeks':1/3, 
'pints beer/week': 2, 
'mth':1/4, 
'ou/week':1, 
'per day':7, 
'p/w':1, 
'U/peryear':1/52, 
'1/week':1,
'5 U/week':1, 
'/month':1/4, 
'3 U/week':1, 
'2U':1, 
'28U/week':1, 
'21U/week':1, 
'3 units':1, 
'pints cider week':2,
'3 month':1/12, 
'3 weeks':1/3,
'4/week':1,
'4 units':1, 
'4 U/week':1, 
'35U/week':1, 
'mounth':1/4, 
'3/week':1, 
'units last week':1, 
'14 U/week':1, 
'10U/week':1, 
'units/per week':1, 
'0 units':1, 
'0 u/wk':1, 
'0 /week':1, 
'-3 cans beer/day':16.8,
'units in last week':1, 
'20U/week':1, 
# duplicate of the 'U/week' key near the top of the table; a repeated key in a
# dict literal keeps the last value, which is the same (1) here
'U/week':1, 
'u/3months':1/12, 
'20 U/week':1, 
'2 units':1, 
'1u':1, 
'unit/pw':1, 
'glasses wine/wkend':7.35, 
'6units':1, 
'8 U/week':1, 
'U/mthly':1/4, 
'7U/week':1, 
'U/per day 1':7,
'U/per month':1/4, 
'U/w10eek':1, 
'U/week 1':1, 
'U/week 60':1, 
'U/week-5':1,
'U/week.5':1, 
'6U/week':1, 
'U/week10':1, 
'U/week12':1, 
'U/week14':1, 
'U/week16':1, 
'U/week4':1, 
'U/week56':1, 
'U/moth':1/4, 
'U/week7':1, 
'U wkly':1, 
'U/ 3 months':1/12, 
'U/2WEEKS' : 1/2,
'U/2monthly' : 1/8,
'U/5 weeks' :1/5, 
'U/6monthy' : 1/24,
'U/a month' :1/4,
'9 U/week' :1,
'U/annually' :1/52,
'U/bi-monthly' :1/2,
'U/every 2-3 months': 1/8, 
'U/forenight':1/2, 
'U/fornightly' :1/2,
'9U/week' :1,
'U/month. ' :1/4,
'6 U/week' :1,
'42U/week' :1,
'42 U/week' :1,
'40 U/week' :1,
'glass brandy/night':9.8, 
'glass wine/day' :14.7,
"glass's wine/week" :2.1,
'glasses whiskey/Nt' : 9.8, 
'U 6 monthly' : 1/24, 
'cans larger/day' : 14, 
'5U/week':1, 
'U/yr':1/52, 
'U0/week' :1,
'U16/week' :1,
'Unit per week':1,
'a year' :1/52,
'a/day' :7,
'bottles/week (wine) ' :10,
'bottle spirits a day' : 280,
'bottle wine/week' : 10,
'bottles whiskey/day' : 280,
'bottles wine a week': 10,
'bottles wine/day' : 70
}

# rows whose unit description is one of the convertible descriptions
common_units = alcohol['unitdescription'].isin(most_common_units['unitdescription'])

# rescale those rows to units per week and relabel them; a description that is
# missing from conversion_rates maps to NaN and therefore yields a NaN value
weekly_factor = alcohol.loc[common_units, 'unitdescription'].map(conversion_rates)
alcohol.loc[common_units, 'value'] = alcohol.loc[common_units, 'value'] * weekly_factor
alcohol.loc[common_units, 'unitdescription'] = 'units per week'

# every other row has no usable value
alcohol.loc[~common_units, 'value'] = np.nan

# alcohol status, taken from the first matching condition:
#   0 -> exactly 0 units per week
#   1 -> above 0 and up to 10 units per week (the 0 case is claimed by the first rule)
#   2 -> more than 10 units per week
# rows matching none of these (NaN or negative values) get NaN
conditions = [
    alcohol['value'].eq(0),
    alcohol['value'].ge(0) & alcohol['value'].le(10),
    alcohol['value'].gt(10),
]
values = [0, 1, 2]

alcohol['alcohol_status'] = np.select(conditions, values, default = np.nan)

# save cleaned alcohol data
DIR = "cleaned_files"
os.chdir(DIR)
alcohol.to_csv('alcohol.csv', index = False)

# NOW NEED TO DROP DUPLICATES --> new file (see Remove Duplicates folder)

#### CRP, Antiplatelets, Smoking are done in other files

In [None]:
#### CRP #### --> fully dealt with in a new file (see Remove Duplicates folder)
#### ANTIPLATELETS #### --> fully dealt with in a new file (see Remove Duplicates folder)
#### SMOKING #### --> fully dealt with in a new file (see Remove Duplicates folder)