In [None]:
import pickle
import networkx as nx
import pandas as pd
import datetime 
import numpy as np
from tqdm import tqdm

# HF patients on spironolactone

In [None]:
# Load the pickled lookup whose keys become the cohort ids and whose values
# are stored later as each patient's 'first_mention_spiro'.
spiro_pickle_path = '/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/first_mention_spiro.pickle'
with open(spiro_pickle_path, 'rb') as fh:
    first_mention_spiro = pickle.load(fh)
len(first_mention_spiro)

In [None]:
# Cohort ids: the keys of the first-mention lookup, in their stored order.
spiro = [patient_id for patient_id in first_mention_spiro.keys()]
len(spiro)

# Comorbidities

In [None]:
def graph_from_onto(onto):
    """Build a directed graph from an ontology adjacency mapping.

    Args:
        onto: Mapping from a source code (str) to an iterable of target
            codes (str).

    Returns:
        An ``nx.DiGraph`` with one edge ``source -> target`` for every pair
        in ``onto``. Every occurrence of the substring 'S-' is removed from
        both codes before the edge is added.
    """
    def strip_marker(code):
        return code.replace('S-', '')

    graph = nx.DiGraph()
    graph.add_edges_from(
        (strip_marker(source), strip_marker(target))
        for source, targets in onto.items()
        for target in targets
    )
    return graph

def load_onto(o, base_dir='/projects/data/GS'):
    """Load a pickled ontology mapping and turn it into a directed graph.

    Args:
        o: File name of the pickle inside ``base_dir``.
        base_dir: Directory that holds the ontology pickles. Defaults to the
            location this function previously had hard-coded, so existing
            ``load_onto(name)`` calls behave as before.

    Returns:
        The graph produced by ``graph_from_onto`` for the unpickled mapping.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only point this at trusted, project-controlled pickles.
    with open(f'{base_dir}/{o}', 'rb') as f:
        onto = pickle.load(f)
    return graph_from_onto(onto)

def expand_codes(onto, codes):
    """Expand each named list of seed codes through the ontology graph.

    For every seed code the result contains the seed itself plus every node
    that ``nx.ancestors`` reports for it, i.e. every node with a path to the
    seed in ``onto``. Seeds missing from ``onto`` are reported on stdout and
    kept unexpanded. A summary line (name, number of seeds, expanded size)
    is printed per name.

    Args:
        onto: Graph (or any container supporting ``in``) to expand through.
        codes: Mapping from a name to a sized iterable of seed codes.

    Returns:
        Dict mapping each name to the set of expanded codes.
    """
    result = {}
    for label, seeds in codes.items():
        members = set()
        result[label] = members
        for seed in seeds:
            members.add(seed)
            if seed not in onto:
                print("NOT FOUND:", seed, label)
                continue
            members.update(nx.ancestors(onto, seed))
        print(label, len(seeds), len(members))
    return result

In [None]:
# Seed concept codes per exclusion criterion; each list is expanded through
# the ontology graph before use.
# NOTE(review): the ids look like SNOMED CT concept identifiers — confirm.
exclusion_criteria = dict(
    valvular_heart_disease=['426611007'],
    congenital_heart_disease=['13213009'],
    unstable_angina=['4557003'],
    cancer=['86049000', '363346000'],
    acute_hepatic_failure=['197270009'],
)

In [None]:
# Build the ontology graph, then expand every exclusion criterion's seed
# codes through it (expand_codes prints a per-criterion summary).
onto = load_onto('isa_rela_ch2pt_202009.pickle')
ec = expand_codes(onto=onto, codes=exclusion_criteria)

In [None]:
# Load the per-patient lookup that is read below as
# pt2cui_pos_dates.get(patient, {})[code] -> collection of dates.
# NOTE(review): presumably patient -> concept code -> set of positive-mention
# dates; verify against whatever produces this pickle.
pt2cui_path = '/projects/data/GS/pt2cui_pos_dates.pickle'
with open(pt2cui_path, 'rb') as fh:
    pt2cui_pos_dates = pickle.load(fh)

In [None]:
def dates_ex(pt_data, codes):
    """Collect the dates of every listed code that a patient has at least twice.

    Args:
        pt_data: Mapping from code to a set-like collection of dates (it must
            support ``len()`` and ``.union()``).
        codes: Iterable of codes to look up in ``pt_data``.

    Returns:
        The union of the date collections of all codes with two or more
        dates, or ``None`` when no code qualifies. When exactly one code
        qualifies, the returned object is the collection stored in
        ``pt_data`` itself, so callers should treat it as read-only.
    """
    dates = None
    for code in codes:
        if code not in pt_data:
            continue
        code_dates = pt_data[code]
        if len(code_dates) < 2:
            # Codes with a single recorded date are ignored.
            continue
        # Identity check: `== None` would be evaluated element-wise (or via a
        # custom __eq__) if the collection is not a plain set.
        dates = code_dates if dates is None else dates.union(code_dates)
    return dates

# One record per spironolactone patient: the id, the first spironolactone
# mention, and per exclusion criterion either the qualifying dates or NaN.
rows = []
for pt in spiro:
    # Patients without any entry in the lookup get NaN for every criterion.
    pt_data = pt2cui_pos_dates.get(pt, {})
    row = {'client_idcode': pt, 'first_mention_spiro': first_mention_spiro[pt]}
    for concept, codes in ec.items():
        dates = dates_ex(pt_data, codes)
        # `is None` rather than `== None`: equality on a non-plain container
        # may be element-wise or overridden, which is not a safe None test.
        row[f"{concept}_date"] = np.nan if dates is None else dates
    rows.append(row)

In [None]:
# Assemble the per-patient records into a frame (one row per patient here).
df = pd.DataFrame(rows)
df.shape

In [None]:
# Preview: the *_date columns still hold whole date collections (or NaN).
df.head()

In [None]:
%%time
def _explode_dates_aligned(frame):
    """Explode each criterion's date collection into rows, aligned per patient.

    Joining several exploded columns on their shared, now non-unique index
    (what this cell did before) merges them pairwise, so a patient ends up
    with every combination of dates across criteria. That multiplies the row
    count and inflates any later per-patient row counts. Here the n-th date
    of each criterion is placed on the patient's n-th row instead: a patient
    occupies as many rows as their longest date collection, and shorter
    collections are padded with NaN.

    Args:
        frame: Frame with 'client_idcode', 'first_mention_spiro' and one
            column per criterion holding either NaN or a collection of dates.

    Returns:
        New frame with the same columns in the same order; 'client_idcode'
        and 'first_mention_spiro' are repeated on every row of a patient.
    """
    id_columns = ['client_idcode', 'first_mention_spiro']
    # explode() is kept on 'first_mention_spiro' to mirror the previous
    # behaviour of this cell; it leaves scalar values untouched.
    base = frame[['client_idcode']].join(frame['first_mention_spiro'].explode())

    pieces = []
    for col in (c for c in frame.columns if c not in id_columns):
        exploded = frame[col].explode()
        # 0, 1, 2, ... within each original row: the alignment key.
        position = exploded.groupby(level=0).cumcount().to_numpy()
        exploded.index = pd.MultiIndex.from_arrays([exploded.index, position])
        pieces.append(exploded)
    if not pieces:
        return base

    # Outer-align on (original row, position), then drop the position level
    # so the result can be joined back onto the id columns by row label.
    aligned = pd.concat(pieces, axis=1).droplevel(-1)
    return base.join(aligned)


df = _explode_dates_aligned(df)
df.shape

In [None]:
%%time
# Reduce every criterion's date column to its date part (no time of day);
# missing values stay missing.
for k in ec:
    date_col = f'{k}_date'
    parsed = pd.to_datetime(df[date_col])
    df[date_col] = parsed.dt.date

In [None]:
%%time
# Time from each criterion date to the first spironolactone mention; the
# value is positive when the criterion date comes before the first mention.
# The first-mention column is parsed once instead of once per criterion.
first_mention = pd.to_datetime(df['first_mention_spiro'])
for k in ec.keys():
    df[f'{k}_delta'] = first_mention - pd.to_datetime(df[f'{k}_date'])

In [None]:
# Preview the frame after the *_delta columns have been added.
df.head()

In [None]:
# Flag criterion dates that fall 1 to 183 days (both bounds inclusive) before
# the first spironolactone mention. Missing deltas are flagged False.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)
for k in ec.keys():
    df[f'{k}_in_window'] = df[f'{k}_delta'].between(min_time, max_time)

In [None]:
# Per criterion: number of in-window rows each patient has, broadcast to
# every row of that patient.
for k in ec.keys():
    window_flags = df[f'{k}_in_window'].eq(True)
    per_patient = window_flags.groupby(df['client_idcode'])
    df[f'{k}_true_count'] = per_patient.transform('sum')

In [None]:
# Collapse to one row per patient (the first row of each is kept) and
# renumber the index from 0.
df = (
    df.drop_duplicates(subset='client_idcode', keep='first')
      .reset_index(drop=True)
)

In [None]:
# After de-duplication the row count equals the number of distinct patients.
print(df.shape)
df.head()

# Preparing the data for extraction

In [None]:
# Ids of patients with at least one in-window cancer date, saved for the
# extraction step. A boolean-mask selection replaces the row-by-row
# iterrows() loop, which carried an unused `row` variable and did one .loc
# lookup per cell.
cancer = set(df.loc[df['cancer_true_count'] >= 1, 'client_idcode'])
with open('/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/cancer.pickle', 'wb') as f:
    pickle.dump(cancer, f)
len(cancer)

In [None]:
# Quick check: show the rows flagged for cancer. With one row per patient,
# the number of rows shown should equal len(cancer).

df.loc[df['cancer_true_count'] >= 1] 

In [None]:
# Ids of patients with at least one in-window valvular heart disease date.
# A boolean-mask selection replaces the row-by-row iterrows() loop.
# NOTE(review): unlike the other exclusion sets in this notebook, this one is
# never pickled — confirm whether a valvular_heart_disease.pickle is expected.
valvular_heart_disease = set(
    df.loc[df['valvular_heart_disease_true_count'] >= 1, 'client_idcode']
)
len(valvular_heart_disease)

In [None]:
# Ids of patients with at least one in-window congenital heart disease date,
# saved for the extraction step. A boolean-mask selection replaces the
# row-by-row iterrows() loop.
congenital_heart_disease = set(
    df.loc[df['congenital_heart_disease_true_count'] >= 1, 'client_idcode']
)
with open('/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/congenital_heart_disease.pickle', 'wb') as f:
    pickle.dump(congenital_heart_disease, f)
len(congenital_heart_disease)

In [None]:
# Ids of patients with at least one in-window unstable angina date, saved
# for the extraction step. A boolean-mask selection replaces the row-by-row
# iterrows() loop.
unstable_angina = set(
    df.loc[df['unstable_angina_true_count'] >= 1, 'client_idcode']
)
with open('/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/unstable_angina.pickle', 'wb') as f:
    pickle.dump(unstable_angina, f)
len(unstable_angina)

In [None]:
# Ids of patients with at least one in-window acute hepatic failure date,
# saved for the extraction step. A boolean-mask selection replaces the
# row-by-row iterrows() loop.
acute_hepatic_failure = set(
    df.loc[df['acute_hepatic_failure_true_count'] >= 1, 'client_idcode']
)
with open('/projects/RALES TRIAL/1A/spiro dataextraction/spiro data/acute_hepatic_failure.pickle', 'wb') as f:
    pickle.dump(acute_hepatic_failure, f)
len(acute_hepatic_failure)