In [None]:
import pickle
import pandas as pd
import networkx as nx
import datetime
from tqdm import tqdm

# Heart failure (HF) patients

In [None]:
# Load the HF patient table; later cells read its `client_idcode` and `start_date` columns.
hf = pd.read_csv(
    '/projects/RALES TRIAL/1A/notspiro dataextraction/notspiro data/hf_patients.csv'
)
hf.shape

In [None]:
# Distinct patient identifiers in the HF table.
hf_patients = set(hf.loc[:, 'client_idcode'])
len(hf_patients)

# Patients on eplerenone (structured data)

In [None]:
# Read the eplerenone order records and discard columns that are not needed downstream.
df_structured_data = pd.read_csv('/projects/data/GS/HF/eplerenone_orders.csv').drop(
    columns=['Unnamed: 0', 'order_isprn', 'order_subsequencenum']
)

In [None]:
# Number of orders with no arrival timestamp.
df_structured_data['order_arrivaldtm'].isnull().sum()

In [None]:
# Reduce the order timestamp (parsed as UTC) to a calendar date and keep only
# the patient identifier and that date.
df_structured_data = (
    df_structured_data
    .assign(
        order_arrivaldtm=lambda d: pd.to_datetime(d['order_arrivaldtm'], utc=True).dt.date
    )
    .loc[:, ['client_idcode', 'order_arrivaldtm']]
)
df_structured_data.shape

In [None]:
# Inner join (pandas default): keep only orders that belong to HF patients.
df = df_structured_data.merge(hf, on='client_idcode')
df.shape

In [None]:
# Window bounds: an order counts when it falls between 0 and 183 days (inclusive)
# before the patient's start_date.
min_time = datetime.timedelta(days=0)
max_time = datetime.timedelta(days=183)

df['order_arrivaldtm'] = pd.to_datetime(df['order_arrivaldtm'], utc=True).dt.date
# Positive difference means the order arrived before start_date.
df['difference'] = pd.to_datetime(df['start_date']) - pd.to_datetime(df['order_arrivaldtm'])

# Series.between is inclusive on both ends by default; missing differences give False.
in_window = df['difference'].between(min_time, max_time)
df['in_window'] = in_window

In [None]:
# Keep only the orders flagged as inside the window (boolean mask).
df = df[df['in_window']]
df.shape

In [None]:
# Count of missing differences among the retained rows, then a preview.
print(df['difference'].isnull().sum())
df.head()

In [None]:
# Distinct HF patients with at least one in-window eplerenone order.
hf_patients_on_eplerenone_structured = set(df.loc[:, 'client_idcode'])
len(hf_patients_on_eplerenone_structured)

# Patients on eplerenone (unstructured data)

In [None]:
def graph_from_onto(onto):
    """Build a directed graph from a mapping of code -> iterable of related codes.

    One edge ``source -> target`` is added for every (key, value-item) pair of
    ``onto``. Every occurrence of the substring ``'S-'`` is removed from both
    codes before the edge is created.

    Parameters
    ----------
    onto : mapping of str -> iterable of str

    Returns
    -------
    networkx.DiGraph
    """
    edges = [
        (source.replace('S-', ''), target.replace('S-', ''))
        for source, targets in onto.items()
        for target in targets
    ]
    graph = nx.DiGraph()
    graph.add_edges_from(edges)
    return graph

def load_onto(o):
    """Unpickle the ontology file named ``o`` under /projects/data/GS and return it as a graph.

    Parameters
    ----------
    o : str
        File name (relative to ``/projects/data/GS/``) of the pickled mapping.

    Returns
    -------
    The graph produced by ``graph_from_onto`` for the unpickled object.
    """
    path = f'/projects/data/GS/{o}'
    # NOTE: pickle.load can execute arbitrary code; only use with trusted files.
    with open(path, 'rb') as handle:
        relations = pickle.load(handle)
    return graph_from_onto(relations)

def expand_codes(onto, codes):
    """Expand each group of seed codes with their graph ancestors.

    For every ``name -> seed codes`` entry in ``codes`` the result holds a set
    containing each seed code plus everything returned by
    ``nx.ancestors(onto, code)`` (all nodes with a path *to* that code).
    Seed codes missing from the graph are kept in the set and reported with a
    "NOT FOUND" message. A summary line (name, number of seeds, expanded size)
    is printed per group.

    Parameters
    ----------
    onto : networkx.DiGraph
    codes : mapping of str -> iterable of str

    Returns
    -------
    dict of str -> set of str
    """
    result = {}
    for label, seeds in codes.items():
        bucket = set()
        result[label] = bucket
        for code in seeds:
            bucket.add(code)
            if code not in onto:
                print("NOT FOUND:", code, label)
                continue
            bucket.update(nx.ancestors(onto, code))
        print(label, len(seeds), len(bucket))
    return result

In [None]:
# Ontology graph used to expand the drug codes below.
# NOTE(review): the file name suggests child-to-parent "is a" relations - confirm.
onto = load_onto(o='isa_rela_ch2pt_202009.pickle')

In [None]:
# SNOMED codes for eplerenone, used as seeds for the ontology expansion

eplerenone = dict(eplerenone=['407010008', '398699008'])
eplerenone_codes = expand_codes(onto, eplerenone)

In [None]:
# Flatten the expanded {name: set_of_codes} mapping into a single set of codes.
# The previous version iterated over `eplerenone` (the two seed codes), which
# silently threw away the ontology expansion. The expansion is recomputed here
# rather than read from `eplerenone_codes`, so the cell gives the same result
# when re-run after `eplerenone_codes` has already been rebound to a set.
eplerenone_codes = set().union(*expand_codes(onto, eplerenone).values())

In [None]:
%%time 
# Patient -> {code -> dates} lookup used by the mention search below.
# NOTE: pickle.load can execute arbitrary code; only use with trusted files.
with open('/projects/data/GS/pt2cui_pos_dates.pickle', mode='rb') as handle:
    pt2cui_pos_dates = pickle.load(handle)

In [None]:
# function to identify HF patients with 2 mentions of eplerenone 


def dates_ex(pt_data, codes, min_mentions=2):
    """Collect the mention dates of the given codes for one patient.

    A code contributes only when the patient has at least ``min_mentions``
    entries recorded for that code; the threshold is applied per code, not
    across codes.

    Parameters
    ----------
    pt_data : mapping
        ``code -> collection of dates`` for a single patient. The collections
        must support ``len`` and ``.union`` (e.g. sets).
    codes : iterable
        Codes to look up in ``pt_data``.
    min_mentions : int, default 2
        Minimum number of entries a code needs in order to be counted.

    Returns
    -------
    The union of the dates of all qualifying codes, or ``None`` when no code
    qualifies.
    """
    dates = None
    for code in codes:
        if code in pt_data and len(pt_data[code]) >= min_mentions:
            # Identity test: `== None` would invoke the container's __eq__.
            if dates is None:
                dates = pt_data[code]
            else:
                dates = dates.union(pt_data[code])
    return dates

# For every HF patient, the eplerenone mention dates (None when there are not
# enough mentions); patients missing from the lookup are treated as having no data.
dates_eplerenone = {
    pt: dates_ex(pt2cui_pos_dates.get(pt, {}), eplerenone_codes)
    for pt in hf_patients
}

In [None]:
# filter out the None values (aka non initiators)

print(len(dates_eplerenone))
# Drop patients for whom no dates were found (value is None).
filtered_eplerenone_dictionary = dict(
    (pt, pt_dates) for pt, pt_dates in dates_eplerenone.items() if pt_dates is not None
)
len(filtered_eplerenone_dictionary)

In [None]:
# One row per patient; `eplerenone_date` holds that patient's whole collection of dates.
df_unstructured_data = pd.DataFrame(
    list(filtered_eplerenone_dictionary.items()),
    columns=['client_idcode', 'eplerenone_date'],
)
df_unstructured_data.shape[0]

In [None]:
# expanding the dataset; 1 row for each date

# Expand to one row per (patient, date). DataFrame.explode repeats the other
# columns (and the index label) for every element of `eplerenone_date`,
# replacing the earlier join over a generator of exploded Series.
df_unstructured_data = df_unstructured_data.explode('eplerenone_date')

In [None]:
# Reduce each mention timestamp (parsed as UTC) to a calendar date.
df_unstructured_data = df_unstructured_data.assign(
    eplerenone_date=lambda d: pd.to_datetime(d['eplerenone_date'], utc=True).dt.date
)
df_unstructured_data.head()

In [None]:
# Inner join (pandas default): attach HF table columns to every mention date.
df = hf.merge(df_unstructured_data, on='client_idcode')
df.shape

In [None]:
# Preview of the merged mention table.
df.head(n=5)

In [None]:
# Window bounds: a mention counts when it falls between 0 and 183 days (inclusive)
# before the patient's start_date.
min_time = datetime.timedelta(days=0)
max_time = datetime.timedelta(days=183)

# Positive difference means the mention was recorded before start_date.
df['difference'] = pd.to_datetime(df['start_date']) - pd.to_datetime(df['eplerenone_date'])

# Series.between is inclusive on both ends by default; missing differences give False.
in_window = df['difference'].between(min_time, max_time)
df['in_window'] = in_window

In [None]:
# Keep only the mentions flagged as inside the window (boolean mask).
df = df[df['in_window']]
df.shape

In [None]:
# Count of missing differences among the retained rows, then a preview.
print(df['difference'].isnull().sum())
df.head()

In [None]:
# Distinct HF patients with at least one in-window eplerenone mention.
hf_patients_on_eplerenone_unstructured = set(df.loc[:, 'client_idcode'])
len(hf_patients_on_eplerenone_unstructured)

# Combining structured and unstructured data

In [None]:
# Patients identified by either source. A new set is built with `|` so that
# neither input set is mutated: the earlier `.update()` call added the
# structured patients to the "unstructured" set in place, which made that
# variable misleading for any later inspection.
hf_patients_on_eplerenone = (
    hf_patients_on_eplerenone_unstructured | hf_patients_on_eplerenone_structured
)
print(len(hf_patients_on_eplerenone))
len(hf_patients_on_eplerenone)

In [None]:
# Persist the combined patient set for use by other notebooks.
with open('/projects/RALES TRIAL/1A/notspiro dataextraction/notspiro data/hf_patients_on_eplerenone.pickle', mode='wb') as out_file:
    pickle.dump(hf_patients_on_eplerenone, out_file)