### Extract files

In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Extract the IHD-coded rows from every raw .txt file in DIR and write one
# trimmed .csv per input file into `outputdir`.
# If your data is split into multiple sections, then run this code for every section of data (changing file names as needed)
DIR = "original_files" # directory where files are stored 
os.chdir(DIR)

# make output folder if it does not already exist
# (exist_ok avoids the separate exists() check; the folder is created inside DIR
# because of the chdir above)
outputdir = "merged_files" # change directory, if needed
os.makedirs(outputdir, exist_ok=True)

# get comparator (IHD medical codes, e.g. SNOMED codes)
codes_file = "ihd-codes.txt"
compare = pd.read_table(codes_file)

# get the txt files you want to work with
# The code list lives in the same directory and also matches '*.txt'; it has no
# `medcodeid` column, so it must be excluded or the loop below fails on it.
files = [f for f in glob.glob('*.txt') if os.path.basename(f) != codes_file]

# columns written to the per-file output
keep_columns = ["e_patid", "consid", "obsid", "obsdate", "parentobsid", "medcodeid", "probobsid"]

# loop through each file and manipulate as desired
for file in files:
    print(file) # keep track of what loop you're on
    # strip only the final extension so dotted names (e.g. "a.part1.txt") stay
    # distinct instead of all collapsing to "a"
    name = os.path.splitext(os.path.basename(file))[0]
    in_data = pd.read_table(file) # read file into dataframe
    # True for rows whose medcodeid appears in the IHD code list
    # NOTE(review): isin() matches nothing if the two columns are parsed as
    # different dtypes (e.g. str vs int) -- confirm both are read the same way.
    bools = in_data["medcodeid"].isin(compare["MedCodeId"])
    # keep only matching rows and only the relevant columns
    out_data = in_data.loc[bools.values, keep_columns]
    # os.path.join instead of a hard-coded "\\" so the file lands inside
    # `outputdir` on every operating system
    out_data.to_csv(os.path.join(outputdir, name + ".csv"), index = False)

### Merge all the individual files that were just extracted

In [None]:
# Stack every per-file .csv in DIR into a single dataframe (`df_merged`) and
# save it as "ihd.csv".
DIR = "merged_files" # change directory, if needed
os.chdir(DIR)

merged_name = "ihd.csv"

# The merged output is written into this same directory, so skip it here;
# otherwise re-running this cell would merge the previous result back in.
# sorted() makes the row order reproducible (glob order is arbitrary).
files = sorted(f for f in glob.glob('*.csv') if os.path.basename(f) != merged_name)

# collect the frames and concatenate once; concatenating inside the loop
# re-copies the accumulated data on every iteration
frames = []
for file in files:
    print(file)
    data = pd.read_csv(file)
    frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty dataframe
if frames:
    df_merged = pd.concat(frames, axis = 0, ignore_index = True)
else:
    df_merged = pd.DataFrame()

# save copy before removing duplicates    
df_merged.to_csv(merged_name, index = False)

### Drop invalid dates

In [None]:
# Find rows of `df_merged` whose obsdate is missing or not a valid dd/mm/YYYY
# date, report them, and remove them from `df_merged`.

# check which rows have an invalid date format
# (errors='coerce' turns anything unparseable into NaT, so isnull() flags both
# missing values and malformed dates)
invalid_date = pd.to_datetime(df_merged['obsdate'], errors='coerce', format='%d/%m/%Y').isnull()
badones = df_merged.loc[invalid_date]
len_badones = len(badones)

# get boolean array for which rows are nan (True) and which are something other than nan (False)
bools_badones = badones["obsdate"].isnull()

# check how many are NaN and compare length of this to length of badones to know if any are something other than nan
badones_nan = badones[bools_badones]
len_diff = len_badones - len(badones_nan)
print(len_badones, "rows with an invalid obsdate;", len_diff, "of them are not NaN")

# look at the bad ones that are something other than nan to decide how to deal with them 
# (printed explicitly: a bare expression in the middle of a cell shows nothing)
print(badones[~bools_badones])

# remove invalid dates because in my case they are all either NaN or completely invalid years, which we cannot impute 
# Filter with the mask rather than concat + drop_duplicates(keep=False): that
# approach also deletes every valid row that has an exact duplicate elsewhere
# in df_merged. .copy() gives an independent frame for later column assignment.
df_merged = df_merged.loc[~invalid_date].copy()

### For duplicate cases of IHD, keep earliest one

In [None]:
# keep the earliest instance of IHD diagnosis
# Reduce `df_merged` to one row per patient (the earliest obsdate) and save it.

# turn obsdate into datetime object
# The format is given explicitly (same dd/mm/YYYY used when validating the
# dates): without it pandas has to guess, and ambiguous values such as
# 03/04/2020 can be read month-first, which would corrupt the ordering below.
df_merged = df_merged.assign(
    obsdate=pd.to_datetime(df_merged["obsdate"], errors = "coerce", format = "%d/%m/%Y")
)
df_merged = df_merged.sort_values(["e_patid", "obsdate"]) # sort values based on patient ID and obsdate
# after sorting, the first row of each patient is the earliest observation
df_merged = df_merged.drop_duplicates(subset=['e_patid'], keep='first')
# NOTE: this overwrites any existing "ihd.csv" in the current working directory
df_merged.to_csv("ihd.csv", index = False)