In [None]:
import pickle
import json
import networkx as nx
import datetime
import pandas
import pandas as pd 
from tqdm import tqdm
import numpy as np

 # Identifying HF patients 

In [None]:
def graph_from_onto(onto):
    """Build a directed graph from an ontology mapping.

    :param onto: mapping of a source code to an iterable of target codes
    :return: ``nx.DiGraph`` holding one (source, target) edge per pair, with
             every 'S-' occurrence stripped from both code strings
    """
    graph = nx.DiGraph()
    graph.add_edges_from(
        [
            (source.replace('S-', ''), target.replace('S-', ''))
            for source, targets in onto.items()
            for target in targets
        ]
    )
    return graph

def load_onto(o):
    """Load a pickled ontology mapping and convert it to a directed graph.

    :param o: file name of the pickle located under /projects/data/GS/
    :return: graph produced by ``graph_from_onto`` from the unpickled object
    """
    # NOTE(review): pickle.load can execute arbitrary code - only point this at trusted files.
    path = f'/projects/data/GS/{o}'
    with open(path, 'rb') as handle:
        return graph_from_onto(pickle.load(handle))

def expand_codes(onto, codes):
    """Expand each group of top-level codes with their graph ancestors.

    For every code found in ``onto`` the nodes returned by ``nx.ancestors``
    are added to the group; codes missing from the graph are kept as-is and
    reported on stdout. A summary line (name, number of input codes, number
    of expanded codes) is printed per group.

    :param onto: graph to look the codes up in
    :param codes: mapping of group name -> list of top-level codes
    :return: mapping of group name -> set of expanded codes
    """
    # NOTE(review): presumably edges point child -> parent, so graph
    # "ancestors" are the more specific concepts - confirm against the pickle.
    result = {}
    for label, seeds in codes.items():
        members = set()
        for seed in seeds:
            members.add(seed)
            if seed not in onto:
                print("NOT FOUND:", seed, label)
                continue
            members.update(nx.ancestors(onto, seed))
        result[label] = members
        print(label, len(seeds), len(members))
    return result

In [None]:
onto = load_onto('isa_rela_ch2pt_202009.pickle')

In [None]:
hf = { 'HF' : ['84114007']}
hf = expand_codes(onto,hf)

In [None]:
# flatten every expanded HF group into a single set of codes
hf_codes = set().union(*hf.values())

In [None]:
%%time 
with open('/projects/data/GS/pt2cui_pos_dates.pickle', 'rb') as f:
    pt2cui_pos_dates = pickle.load(f)

In [None]:
# identifying patients with at least 1 mention of HF 

def build_set(codes_list, pt_data=None):
    '''
    Return the patients that have at least one of the given codes.

    :param codes_list: iterable of codes to look for
    :param pt_data: mapping of patient id -> container of that patient's codes
                    (iterating it must yield the codes, e.g. a dict keyed by
                    code); defaults to the notebook-level ``pt2cui_pos_dates``
    :return data: set of patient ids having at least one code in ``codes_list``
    '''
    if pt_data is None:
        # fall back to the global loaded earlier in the notebook
        pt_data = pt2cui_pos_dates
    wanted = set(codes_list)
    # isdisjoint stops at the first shared code instead of scanning every code
    return {pt for pt, pt_codes in pt_data.items() if not wanted.isdisjoint(pt_codes)}

In [None]:
hf_patients = build_set(hf_codes)

In [None]:
len(hf_patients)

In [None]:
# preview a handful of ids only: displaying the whole set bloats the saved
# notebook and leaves every patient identifier in the stored output
list(hf_patients)[:5]

In [None]:
pt2cui_pos_dates = {k: pt2cui_pos_dates[k] for k in hf_patients} 

In [None]:
# export the HF patient ids as a one-column csv
hf_ids = pandas.DataFrame({"hf_ids": list(hf_patients)})
hf_ids.to_csv(
    '/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/hf_ids.csv',
    sep=',',
    index=False,
)

#  HF patients on spironolactone (structured data)

In [None]:
df = pd.read_csv('/projects/data/GS/HF/gs_all_orders.csv') # load the orders table
df = df[['client_idcode', 'order_name', 'order_summaryline', 'order_entered', 'order_arrivaldtm']]
# Use a dedicated name for the mask: storing it in `hf` would overwrite the
# expanded HF code dictionary built earlier and break re-running those cells.
is_hf_patient = df['client_idcode'].isin(hf_patients) # identify HF patients
# .copy() gives an independent frame, so the column assignments in later cells
# do not operate on a slice of the full orders table
df = df[is_hf_patient].copy() # only keep HF patients

In [None]:
df['drug_name'] = df['order_name'].str.upper()

In [None]:
%%time
drug_names = ['SPIRONOLACTONE']
# Vectorised literal substring search (same semantics as `name in drug_name`).
# Orders with a missing drug name are treated as non-matching instead of
# raising a TypeError as the row-by-row loop did.
keep = pd.Series(False, index=df.index)
for name in drug_names:
    keep |= df['drug_name'].str.contains(name, regex=False, na=False)
keep = keep.tolist()  # plain list of booleans, one per row of df

In [None]:
df = df.loc[keep] # only keeping patients on spironolactone

In [None]:
# only keeping the first spironolactone prescription order line

# reduce the arrival timestamp to a calendar date (parsed as UTC)
df['order_arrivaldtm'] = pd.to_datetime(df['order_arrivaldtm'], utc=True).dt.date
# earliest arrival date per patient, joined back as 'order_arrivaldtm_min'
min_date = df.groupby('client_idcode')['order_arrivaldtm'].min()
df = df.merge(min_date, on='client_idcode', suffixes=('', '_min'))
# keep every order line placed on the patient's earliest date, then drop the helper column
is_first_order = df['order_arrivaldtm'] == df['order_arrivaldtm_min']
df = df[is_first_order].drop(columns='order_arrivaldtm_min')

In [None]:
df['order_summaryline_b'] = df['order_summaryline'].str.upper()

In [None]:
# removing patients with a mention of those words below

matches = ['TWICE','HELD', 'ON HOLD', 'STOPPED', 'STOP', 'WITHHELD', 'WITHHOLD', 'INCREASED', 'RESTARTED', 'INCREASE',
           'DECREASED', 'DECREASE','BD', 'B.D.', 'TDS', 'T.D.S.', 'DISCONTINUED', 'DISCONTINUE']

# Vectorised literal substring search over the upper-cased summary line.
# regex=False keeps 'B.D.' / 'T.D.S.' literal; rows with a missing summary
# count as "no match" instead of raising a TypeError.
# NOTE(review): this is plain substring matching, so short terms such as 'BD'
# also hit inside longer words - confirm that is intended.
summary = df['order_summaryline_b']
flagged = pd.Series(False, index=df.index)
for term in matches:
    flagged |= summary.str.contains(term, regex=False, na=False)

ids_to_exclude = set(df.loc[flagged, 'client_idcode'])

In [None]:
# code interruption: double checking individuals in icu 

matches = ['(PRESCRIBED IN ICU)']

# Vectorised literal search (regex=False so the parentheses are matched as
# written); rows with a missing summary line count as "no match".
summary = df['order_summaryline_b']
icu_rows = pd.Series(False, index=df.index)
for term in matches:
    icu_rows |= summary.str.contains(term, regex=False, na=False)

icu_double_check = set(df.loc[icu_rows, 'client_idcode'])
len(icu_double_check)

In [None]:
with open ('/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/icu_double_check.pickle', 'wb') as f:
    pickle.dump(icu_double_check,f)

In [None]:
len(ids_to_exclude)

In [None]:
df = df[~df.client_idcode.isin(ids_to_exclude)]

In [None]:
# extracting the dose of the prescription order

# Extract the dose as text (converted to a number in a later cell).
#
# Standard lines: the number at the very start of the summary, directly
#   followed by 'MG' (optional whitespace in between).
# ICU lines: the first number right after the '(PRESCRIBED IN ICU)' marker.
#
# The full number is captured: the previous fixed-width slicing turned an ICU
# dose of '100' into '10' and '6.25' into '6.', and cut standard doses to four
# characters. Lines that do not fit the expected layout yield NaN.
summary = df['order_summaryline_b']
is_icu = summary.str.contains('(PRESCRIBED IN ICU)', regex=False, na=False)

standard_dose = summary.str.extract(r'^\s*(\d*\.?\d+)\s*MG', expand=False)
icu_dose = summary.str.extract(r'\(PRESCRIBED IN ICU\)\s*(\d*\.?\d+)', expand=False)

# take the ICU value on ICU lines, the standard value everywhere else
df = df.assign(dose=standard_dose.where(~is_icu, icu_dose))

In [None]:
# regex=False: match the parentheses literally (as a regex they form a capture
# group, which pandas warns about); na=False keeps the mask strictly boolean
df.loc[df['order_summaryline_b'].str.contains('(PRESCRIBED IN ICU)', case=False, regex=False, na=False)]

In [None]:
print(df['dose'].isna().sum())
df['dose'] = pd.to_numeric(df['dose'], downcast='float')
df['dose'].unique()

In [None]:
mask = df.client_idcode.duplicated(keep=False)
df[mask]

In [None]:
df.loc[df['dose'] == 300]

In [None]:
result = df.groupby('client_idcode')['dose'].apply(lambda x: x.nunique() == 1).reset_index(name='same_dose_value')
false_rows = result.loc[result['same_dose_value'] == False]
print(false_rows)

In [None]:
# orders seem to be entered twice for some patients (see cell above); keeping only one row per patient (the row with the lowest dose)

# index label of the lowest-dose row for each patient
lowest_dose_rows = df.groupby('client_idcode')['dose'].idxmin()
df = df.loc[lowest_dose_rows].reset_index(drop=True)

In [None]:
print(df.shape)
duplicates = df['client_idcode'].duplicated()
duplicates.value_counts()

#  HF patients on spironolactone (unstructured data)

In [None]:
# entering the SNOMED codes for spironolactone

spiro = {
    'Spironolactone' : ['387078006','13929005','777603002', '318056008', '1301111000001106', '1261411000001109']
}

In [None]:
spiro = expand_codes(onto,spiro)

In [None]:
# flatten every expanded spironolactone group into a single set of codes
spiro_codes = set().union(*spiro.values())

In [None]:
# function to identify HF patients with 2 mentions of spironolactone

def first_date_for_code_list(pt_data, codes, min_mentions=2):
    """Return the earliest date among the codes that have enough mentions.

    :param pt_data: mapping of code -> collection of dates for one patient
    :param codes: iterable of codes to consider
    :param min_mentions: minimum number of dates a single code must have to
                         be taken into account (default 2, the previous
                         hard-coded value)
    :return: earliest date over all qualifying codes, or None when no code
             qualifies
    """
    # NOTE(review): the threshold applies per code, not to the total number of
    # mentions across all codes - confirm this is the intended definition.
    # NOTE(review): a later cell defines a function with the same name but no
    # threshold; re-running this cell's loop after that one changes the result.
    first = None
    for code in codes:
        if code not in pt_data:
            continue
        dates = pt_data[code]
        # max(1, ...) guards min() against an empty collection
        if len(dates) < max(1, min_mentions):
            continue
        earliest = min(dates)
        if first is None or earliest < first:
            first = earliest
    return first

# first qualifying spironolactone mention per HF patient (None when there is none)
first_mention_spiro = {
    pt: first_date_for_code_list(pt2cui_pos_dates.get(pt, {}), spiro_codes)
    for pt in hf_patients
}

In [None]:
# filter out the None values (aka non initiators)
    
filtered_first_mention_spiro = {k: v for k, v in first_mention_spiro.items() if v is not None}

In [None]:
filtered_first_mention_spiro = {k: v for k,v in filtered_first_mention_spiro.items() if k not in ids_to_exclude}

In [None]:
# one row per patient with the date of the first spironolactone mention in text
df_unstructured_data = pd.DataFrame(filtered_first_mention_spiro.keys(), columns=['client_idcode'])
first_mention_dates = df_unstructured_data['client_idcode'].map(filtered_first_mention_spiro)
# reduce to a calendar date (parsed as UTC)
df_unstructured_data['spiro_prescription'] = pd.to_datetime(first_mention_dates, utc=True).dt.date

In [None]:
df_unstructured_data.head()

In [None]:
df_unstructured_data.shape

# Comparing the structured and unstructured data

In [None]:
# keeping patients with no major difference between structured and unstructured spironolactone prescription dates: patients are removed if spironolactone was mentioned in clinical text 91 days or more before the order was placed


def comparison(df1, df2, max_days=91):
    '''
    Merge the structured orders with the unstructured first mentions and
    remove patients whose text mention precedes the order by too long.

    :param df1: frame with 'client_idcode' and 'order_arrivaldtm' columns
    :param df2: frame with 'client_idcode' and 'spiro_prescription' columns
    :param max_days: rows are removed when the order arrived this many days
                     or more after the text mention (default 91)
    :return df: outer merge of both frames on 'client_idcode' with an added
                'difference' column (order date minus mention date, in days),
                without the removed rows and with a fresh 0..n-1 index.
                Rows whose difference is missing (patient present in only one
                frame, or a missing date) are kept.
    '''
    merged = pd.merge(df1, df2, on='client_idcode', how='outer')
    order_date = pd.to_datetime(merged['order_arrivaldtm'])
    mention_date = pd.to_datetime(merged['spiro_prescription'])
    merged['difference'] = (order_date - mention_date).dt.days
    # NaN >= max_days is False, so rows without a difference are never dropped
    too_early = merged['difference'] >= max_days
    return merged[~too_early].reset_index(drop=True)

In [None]:
df = comparison(df,df_unstructured_data)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# removing patients for which we don't know the spironolactone dose & dosage

df = df.dropna(subset=['order_summaryline_b'])
df.shape

In [None]:
df = df[['client_idcode', 'order_arrivaldtm', 'dose']]

In [None]:
df.head()

In [None]:
# patient id -> arrival date of the retained spironolactone order
# NOTE(review): this reuses the name `first_mention_spiro`, which earlier held
# the first mention found in clinical text; from here on it holds order dates.
first_mention_spiro = dict(zip(df.client_idcode, df.order_arrivaldtm)) 
len(first_mention_spiro)

In [None]:
spiro_dose = dict(zip(df.client_idcode, df.dose)) 
len(spiro_dose)

###  First mention spiro date and dose dictionaries

In [None]:
with open ('/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/first_mention_spiro.pickle', 'wb') as f:
    pickle.dump(first_mention_spiro,f)

In [None]:
with open ('/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/spiro_dose.pickle', 'wb') as f:
    pickle.dump(spiro_dose,f)

# Checking if the HF diagnosis is before the spironolactone prescription

In [None]:
# identifying patients with at least 1 mention of HF and getting the date of the first HF mention

def first_date_for_code_list(pt_data, codes):
    """Return the earliest date recorded for any of the given codes.

    Unlike the earlier definition of the same name in this notebook, a single
    mention of a code is enough here.

    :param pt_data: mapping of code -> collection of dates for one patient
    :param codes: iterable of codes to consider
    :return: earliest date over all listed codes present in ``pt_data``, or
             None when none of them has a date
    """
    first = None
    for code in codes:
        if code not in pt_data:
            continue
        dates = pt_data[code]
        if len(dates) == 0:
            # a code with no dates would make min() raise; treat it as absent
            continue
        earliest = min(dates)
        if first is None or earliest < first:
            first = earliest
    return first

# first HF mention per HF patient
first_mention_hf = {
    pt: first_date_for_code_list(pt2cui_pos_dates.get(pt, {}), hf_codes)
    for pt in hf_patients
}

In [None]:
len(first_mention_hf)

In [None]:
# from HF dictionary to HF dataframe

# one row per patient: id and first HF mention
# NOTE(review): `hf` is re-bound to a DataFrame here; earlier in the notebook
# the same name holds the expanded HF code dictionary.
hf = pd.DataFrame(first_mention_hf.items(), columns = ['client_idcode', 'first_mention_hf'] )
# reduce the first mention to a calendar date (parsed as UTC)
hf['first_mention_hf'] = pd.to_datetime(hf['first_mention_hf'], utc=True).dt.date

In [None]:
# merging the spironolactone and hf dataframes

df = pd.merge(df, hf, on='client_idcode', how='left') 

In [None]:
df['difference'] =  df['order_arrivaldtm'] - df['first_mention_hf']

In [None]:
# identifying patients on spironolactone with a mention of HF within 6 months prior to the arrival of the spironolactone prescription order

# Window bounds for 'difference' (order arrival date minus first HF mention):
# at least 1 day and at most 183 days, both bounds inclusive.
min_time = datetime.timedelta(days=1) 
max_time = datetime.timedelta(days=183)
t1 = df['difference'] >= min_time  # HF mentioned at least one day before the order
t2 = df['difference'] <= max_time  # ... and no more than 183 days before it
in_window = t1 & t2 
# stored as a column so the flag can be inspected before filtering
df['in_window'] = in_window

In [None]:
df.head()

In [None]:
df = df[df.in_window]
len(df)

In [None]:
df.head()

In [None]:
hf_diagnosis_match = set(df['client_idcode'])
len(hf_diagnosis_match)

In [None]:
with open ('/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/hf_diagnosis_match.pickle', 'wb') as f:
    pickle.dump(hf_diagnosis_match,f)