In [None]:
import pickle
import networkx as nx
import pandas as pd 
import datetime
import pandas

# HF patients  

In [None]:
# Load the HF patient table. Later cells read its `client_idcode` column and
# (after merging) its `start_date` column.
hf = pd.read_csv('/projects/RALES TRIAL/1A/notspiro dataextraction/notspiro data/hf_patients.csv')
hf.shape  # (rows, columns) of the loaded table

In [None]:
# Distinct patient identifiers present in the HF table.
hf_patients = {patient_id for patient_id in hf['client_idcode']}
len(hf_patients)

# Identifying HF patients on spironolactone, to find those not on it (structured data)

In [None]:
# Load the orders table, parsing only the columns used downstream instead of
# reading every column and discarding most of them afterwards.
ORDER_COLUMNS = ['client_idcode', 'order_name', 'order_summaryline', 'order_entered', 'order_arrivaldtm']
# `usecols` returns columns in file order; re-select to keep the column order fixed.
df = pd.read_csv('/projects/data/GS/HF/gs_all_orders.csv', usecols=ORDER_COLUMNS)[ORDER_COLUMNS]
# Upper-cased order name, so drug matching below is case-insensitive.
df['drug_name'] = df['order_name'].str.upper()

In [None]:
%%time
drug_names = ['SPIRONOLACTONE']
keep = []
for index, row in df.iterrows():
    keep.append(any([x in row['drug_name'] for x  in drug_names]))

In [None]:
# Keep only the order rows flagged in `keep` (orders whose name matched a target drug).
# NOTE: `df` is overwritten here, so `keep` no longer lines up with `df` if this
# cell is re-run without re-running the cells above.
df = df.loc[keep]
df.shape

In [None]:
# Inner-join the matching orders with the HF patient table on the patient id.
# NOTE: `df` is overwritten, so running this cell twice merges `hf` in twice.
df = df.merge(hf, on='client_idcode')
df.shape

In [None]:
# Look-back window: an order counts when it arrived on `start_date` or up to
# 183 days before it.
min_time = datetime.timedelta(days=0)
max_time = datetime.timedelta(days=183)

# Reduce the order arrival timestamp to its UTC calendar date.
df['order_arrivaldtm'] = pd.to_datetime(df['order_arrivaldtm'], utc=True).dt.date

# Time from the order to `start_date`; positive when the order came first.
start_day = pd.to_datetime(df['start_date'])
order_day = pd.to_datetime(df['order_arrivaldtm'])
df['difference'] = start_day - order_day

# Inclusive on both ends: min_time <= difference <= max_time.
in_window = df['difference'].between(min_time, max_time)
df['in_window'] = in_window

In [None]:
# Preview the merged orders, including the `difference` and `in_window` columns.
df.head()

In [None]:
# Retain only the orders that fall inside the look-back window.
df = df.loc[df['in_window']]
df.shape

In [None]:
# Sanity check: number of remaining rows whose `difference` is missing (NaT).
# A missing difference cannot satisfy the window comparisons, so after the
# in_window filter this is expected to print 0.
print(df['difference'].isna().sum())
df.head()

In [None]:
# Patients with at least one matching order inside the window.
hf_patients_on_spiro_structured = {patient_id for patient_id in df['client_idcode']}
len(hf_patients_on_spiro_structured)

# Identifying HF patients on spironolactone, to find those not on it (unstructured data)

In [None]:
def graph_from_onto(onto):
    """Build a directed graph from a mapping of node -> iterable of nodes.

    An edge is added from each key of `onto` to every item of its value.
    Every occurrence of the substring 'S-' is removed from node names first.

    Returns the resulting `nx.DiGraph`.
    """
    graph = nx.DiGraph()
    graph.add_edges_from(
        (source.replace('S-', ''), target.replace('S-', ''))
        for source, targets in onto.items()
        for target in targets
    )
    return graph

def load_onto(o):
    """Load a pickled ontology mapping and return it as a directed graph.

    `o` is a file name under /projects/data/GS/. The unpickled object is
    passed to `graph_from_onto`.

    NOTE: `pickle.load` can execute arbitrary code from the file; only use
    this with files from a trusted source.
    """
    onto_path = f'/projects/data/GS/{o}'
    with open(onto_path, 'rb') as handle:
        relations = pickle.load(handle)
    return graph_from_onto(relations)

def expand_codes(onto, codes):
    """Expand each named list of codes with its graph ancestors.

    Parameters
    ----------
    onto : graph accepted by `nx.ancestors` (supports `code in onto`).
    codes : dict mapping a group name to an iterable of top-level codes.

    Returns
    -------
    dict mapping each group name to a set containing every top-level code plus,
    for codes present in `onto`, all nodes returned by `nx.ancestors(onto, code)`
    (nodes with a directed path to the code).

    Codes missing from `onto` are kept in the set and reported with a
    "NOT FOUND:" line. One summary line per group is printed as well.
    """
    expanded = {}
    for name, top_codes in codes.items():
        members = set()
        for code in top_codes:
            members.add(code)
            if code not in onto:
                print("NOT FOUND:", code, name)
                continue
            members |= nx.ancestors(onto, code)
        expanded[name] = members
        print(name, len(top_codes), len(members))
    return expanded

In [None]:
# Build the ontology graph from the pickled mapping file (see load_onto).
onto = load_onto('isa_rela_ch2pt_202009.pickle')

In [None]:
# Top-level spironolactone concept codes, expanded with their ancestors in the
# ontology graph. `spiro` maps the group name to the expanded set of codes.
spiro = expand_codes(
    onto,
    {
        'Spironolactone': [
            '387078006',
            '13929005',
            '777603002',
            '318056008',
            '1301111000001106',
            '1261411000001109',
        ],
    },
)

In [None]:
# Flatten all expanded groups into a single set of codes.
spiro_codes = set().union(*spiro.values())

In [None]:
%%time 
with open('/projects/data/GS/pt2cui_pos_dates.pickle', 'rb') as f:
    pt2cui_pos_dates = pickle.load(f)

In [None]:
# function to identify HF patients with 2 mentions of spironolactone 


def dates_ex(pt_data, codes):
    """Collect the dates of every code that has at least two recorded dates.

    Parameters
    ----------
    pt_data : mapping of code -> collection of dates for one patient.
    codes : iterable of codes to look for.

    Returns
    -------
    A new set holding the dates of all codes in `codes` that are present in
    `pt_data` with two or more dates, or None when no code qualifies.

    Note that the ">= 2" threshold is applied to each code separately: two
    codes with a single date each do not qualify.
    """
    collected = None
    for code in codes:
        if code not in pt_data:
            continue
        code_dates = pt_data[code]
        if len(code_dates) < 2:
            continue
        if collected is None:
            # Copy so the result never aliases (and cannot mutate) pt_data.
            collected = set(code_dates)
        else:
            collected.update(code_dates)
    return collected

# For every HF patient, gather the dates of qualifying spironolactone mentions.
# The value is None when the patient has no entry in pt2cui_pos_dates or no
# code that qualifies in dates_ex.
dates_spiro = {
    pt: dates_ex(pt2cui_pos_dates.get(pt, {}), spiro_codes)
    for pt in hf_patients
}

In [None]:
# One entry per HF patient (the value is None when no code qualified).
len(dates_spiro)

In [None]:
# True only if every patient ended up with a set of dates (no None values).
all(dates is not None for dates in dates_spiro.values())

In [None]:
# Drop patients without qualifying mentions, then tabulate one row per patient
# with the patient's full set of dates in `spiro_date`.
filtered_dates_spiro = {
    pt: dates for pt, dates in dates_spiro.items() if dates is not None
}
print(len(filtered_dates_spiro))
patient_date_pairs = list(filtered_dates_spiro.items())
spiro_dates = pd.DataFrame(patient_date_pairs, columns=['client_idcode', 'spiro_date'])
spiro_dates.shape

In [None]:
# Expand the dataset to one row per (patient, date): each element of the
# `spiro_date` collection gets its own row and `client_idcode` is repeated.
# DataFrame.explode leaves already-scalar values unchanged, so re-running this
# cell does not multiply rows.
spiro_dates = spiro_dates.explode('spiro_date')

In [None]:
# Reduce each mention timestamp to its UTC calendar date.
spiro_timestamps = pd.to_datetime(spiro_dates['spiro_date'], utc=True)
spiro_dates['spiro_date'] = spiro_timestamps.dt.date

In [None]:
# Inner-join the per-date mentions with the HF patient table on the patient id.
# `df` is rebound here and no longer refers to the orders data.
df = spiro_dates.merge(hf, on='client_idcode')
df.shape

In [None]:
# Preview the mention dates joined with the HF patient columns.
df.head()

In [None]:
# Look-back window: a mention counts when it is dated on `start_date` or up to
# 183 days before it.
min_time = datetime.timedelta(days=0)
max_time = datetime.timedelta(days=183)

# Time from the mention to `start_date`; positive when the mention came first.
start_day = pd.to_datetime(df['start_date'])
mention_day = pd.to_datetime(df['spiro_date'])
df['difference'] = start_day - mention_day

# Inclusive on both ends: min_time <= difference <= max_time.
in_window = df['difference'].between(min_time, max_time)
df['in_window'] = in_window

In [None]:
# Retain only the mentions that fall inside the look-back window.
df = df.loc[df['in_window']]
df.shape

In [None]:
# Sanity check: number of remaining rows whose `difference` is missing (NaT).
# A missing difference cannot satisfy the window comparisons, so after the
# in_window filter this is expected to print 0.
print(df['difference'].isna().sum())
df.head()

In [None]:
# Patients with at least one qualifying mention inside the window.
hf_patients_on_spiro_unstructured = {patient_id for patient_id in df['client_idcode']}
len(hf_patients_on_spiro_unstructured)

# Combining structured and unstructured data

In [None]:
# Patients found on spironolactone by either source (orders or notes).
# Built as a new set so the structured and unstructured sets keep their own
# contents and this cell can be re-run safely.
hf_patients_on_spiro = hf_patients_on_spiro_unstructured.union(hf_patients_on_spiro_structured)
print(len(hf_patients_on_spiro))
len(hf_patients_on_spiro)

In [None]:
# Persist the combined set of patient ids for use by other notebooks.
output_path = '/projects/RALES TRIAL/1A/notspiro dataextraction/notspiro data/hf_patients_on_spiro.pickle'
with open(output_path, 'wb') as out_file:
    pickle.dump(hf_patients_on_spiro, out_file)