## This file is used on a per-predictor basis (file names/directories need to be changed before extracting each subsequent predictor)

### Use for all predictors except age, gender, ethnicity, and Townsend deprivation score, which were obtained directly from linked data

### Extract individual files

In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Extract, from every raw .txt file in DIR, the rows whose medcodeid appears in
# the predictor's code list, and write one filtered CSV per input file to outputdir.
DIR = "original_files" # change directory, if needed (to where files are stored)
os.chdir(DIR)

# make output folder if it does not already exist (created inside DIR, since we just changed into it)
outputdir = "merged_files" # change directory, if needed (to name of predictor)
os.makedirs(outputdir, exist_ok=True)

# get comparator (e.g., diabetes SNOMED or medical codes); must contain a "medcodeid" column
compare = pd.read_table("") # specify code for relevant predictor (e.g., diabetes-codes.txt)

# get the txt files you want to work with
# NOTE(review): if the code list above is a .txt file stored in DIR, this glob picks it up as well -- confirm it lives elsewhere
files = glob.glob('*.txt')

# loop through each file and manipulate as desired
for file in files:
    print(file) # keep track of what loop you're on
    # strip only the final extension so names containing extra dots stay unique
    name = os.path.splitext(os.path.basename(file))[0]
    in_data = pd.read_table(file) # read file into dataframe
    # NOTE(review): medcodeid is compared with whatever dtype pandas inferred in each table -- confirm both sides are read with the same dtype
    bools = in_data["medcodeid"].isin(compare["medcodeid"]) # True for rows whose code is in the predictor's code list
    # keep only matching rows and only the relevant columns
    out_data = in_data.loc[bools, ["e_patid", "consid", "obsdate", "medcodeid"]]
    # os.path.join instead of a hard-coded "\\" so the file lands inside outputdir on every OS
    out_data.to_csv(os.path.join(outputdir, name + ".csv"), index=False)

### Merge all the individual files that were just extracted

In [None]:
# Stack every per-file CSV produced by the extraction step into one dataframe
# (df_merged) and save a copy of it before any duplicates are removed.
DIR = "merged_files" # change directory, if needed
os.chdir(DIR)

merged_name = "predictor_name" # change as needed to save specific predictor file (e.g., diabetes.csv)

# The merged file is saved into this same folder, so leave it out of the inputs:
# otherwise re-running this cell would merge the previous output back in.
# NOTE(review): any other .csv saved into this folder (e.g. a de-duplicated copy) is still matched by the glob.
# Sorted so the row order of the merged file does not depend on directory listing order.
files = sorted(f for f in glob.glob('*.csv') if f != merged_name)

# read everything first and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration
frames = []
for file in files:
    print(file)
    frames.append(pd.read_csv(file))

# pd.concat raises on an empty list, so fall back to an empty dataframe when no files were found
df_merged = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()

# save copy before removing duplicates
df_merged.to_csv(merged_name, index=False)

### For diabetes predictor only!

In [None]:
# keep earliest diabetes diagnosis (bc it is about 'history diabetes - yes/no'):
# one row per patient, the one with the earliest parseable obsdate
# turn obsdate into datetime; values that cannot be parsed become NaT (errors="coerce")
# NOTE(review): no format/dayfirst is given -- if obsdate is written day-first (dd/mm/yyyy), pass dayfirst=True or an explicit format; confirm against the raw files
df_merged["obsdate"] = pd.to_datetime(df_merged["obsdate"], errors="coerce")
# sort by patient ID, then date; NaT sorts last by default, so a valid date is preferred over an unparseable one
df_merged = df_merged.sort_values(["e_patid", "obsdate"])
# first row per patient after sorting is the earliest record; a single drop_duplicates is enough,
# a second duplicated() pass on the result would always be a no-op
df_merged = df_merged.drop_duplicates(subset=["e_patid"], keep="first")

# NOTE: use a name different from the pre-deduplication copy, otherwise that copy is overwritten
df_merged.to_csv("predictor_name", index=False) # change as needed to save specific predictor file (e.g., merged_diabetes_files.csv)