In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime
from datetime import timedelta

### Load in the data

In [None]:
# Load the outcome cohort, then restrict the antiplatelet records to cohort patients
# and save the filtered copy.
#
# NOTE(review): every os.chdir below is given a *relative* path, so each one resolves
# against the directory left by the previous call. Starting from the launch directory
# the cell walks into "cleaned_files", then "cleaned_files/merged_files", then
# "cleaned_files/merged_files/cleaned_files". Re-running the cell without resetting
# the working directory will look for still deeper folders. Confirm the intended
# layout; building file paths from one fixed base directory would make this cell
# safely re-runnable.
DIR = "cleaned_files"
os.chdir(DIR)
outcome = pd.read_csv("outcomes.csv")
compare = outcome.e_patid  # patient IDs present in the outcome table

DIR = "merged_files"
os.chdir(DIR)
antiplatelets = pd.read_csv("antiplatelets.csv")  # comma is already the default separator

DIR = "cleaned_files"
os.chdir(DIR)
# Keep only records whose patient appears in the outcome table. The explicit .copy()
# makes the result an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
antiplatelets = antiplatelets[antiplatelets['e_patid'].isin(outcome['e_patid'])].copy()
antiplatelets.to_csv("antiplatelets.csv", index = False)

### Organize the antiplatelet data and flag antiplatelet use (yes/no) before the IHD diagnosis date

In [None]:
# Build one row per patient with a 0/1 flag 'antiplatelets_use' that is 1 only when
# the patient's earliest antiplatelet record is dated on or before 'obsdate'
# (presumably the IHD diagnosis date, per the section header - confirm).

# Every row in `antiplatelets` is a medication record, so mark them all as users.
antiplatelets['antiplatelets_use'] = 1

# Left-merge onto the outcome table so patients without any antiplatelet record are
# kept; their flag comes back as NaN and is filled with 0.
merged_antiplatelets = pd.merge(outcome, antiplatelets, on = "e_patid", how = "left")
merged_antiplatelets['antiplatelets_use'] = merged_antiplatelets['antiplatelets_use'].fillna(0)

# Drop unnecessary columns. .copy() gives an independent frame so the column
# assignments below do not raise SettingWithCopyWarning.
merged_antiplatelets = merged_antiplatelets[["e_patid", "obsdate", "medicationdate", "antiplatelets_use"]].copy()

# Convert to datetime. Medication dates that do not match "%d%b%Y" (e.g. 01Jan2010)
# are coerced to NaT; NaT never satisfies the <= comparison below, so such rows end
# up flagged 0.
merged_antiplatelets['medicationdate'] = pd.to_datetime(merged_antiplatelets['medicationdate'], format = "%d%b%Y", errors = 'coerce')
merged_antiplatelets['obsdate'] = pd.to_datetime(merged_antiplatelets['obsdate'])

# Keep the earliest medication record per patient: sort by patient ID and medication
# date (NaT sorts last), then keep the first row for each patient. The earliest date
# is enough because the flag only asks whether *any* use occurred by 'obsdate'.
merged_antiplatelets = (
    merged_antiplatelets
    .sort_values(["e_patid", "medicationdate"])
    .drop_duplicates(subset=['e_patid'], keep='first')
)

# Final flag: 1 only for users whose earliest medication date is on or before obsdate.
# This single vectorised expression already yields 0 for every other row, so no
# follow-up "set to 0" pass is needed.
merged_antiplatelets['antiplatelets_use'] = (
    (merged_antiplatelets['antiplatelets_use'] == 1)
    & (merged_antiplatelets['medicationdate'] <= merged_antiplatelets['obsdate'])
).astype(int)

# Save. os.path.join yields 'cleaned_files\\Closest' on Windows (same as before) and
# a valid path on other platforms.
# NOTE(review): the path is relative to the current working directory, and the
# DataFrame index is written as the first CSV column (to_csv default) - confirm
# downstream readers expect that before changing it.
DIR = os.path.join('cleaned_files', 'Closest')
os.chdir(DIR)
merged_antiplatelets.to_csv('antiplatelets_no_duplicates_merged.csv')