In [None]:
import pickle
import networkx as nx
import pandas as pd 
import datetime
import pandas
import glob
import os

# Identifying heart failure (HF) patients

In [None]:
def graph_from_onto(onto):
    """Turn an ontology mapping into a directed graph.

    ``onto`` maps a source code to an iterable of target codes. Every
    occurrence of the substring ``'S-'`` is removed from both endpoints,
    and one directed edge source -> target is added per pair.

    Returns the resulting ``nx.DiGraph``. Sources whose target collection
    is empty contribute no edge and therefore do not appear as nodes.
    """
    graph = nx.DiGraph()
    graph.add_edges_from(
        (src.replace('S-', ''), dst.replace('S-', ''))
        for src, dsts in onto.items()
        for dst in dsts
    )
    return graph

def load_onto(o, data_dir='/projects/data/GS'):
    """Load a pickled ontology mapping and convert it to a directed graph.

    Parameters
    ----------
    o : str
        File name of the pickle, relative to ``data_dir``.
    data_dir : str, optional
        Directory holding the pickle. Defaults to the location that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    The graph produced by ``graph_from_onto`` for the unpickled mapping.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'{data_dir}/{o}', 'rb') as f:
        onto = pickle.load(f)
    return graph_from_onto(onto)

def expand_codes(onto, codes):
    """Expand each named list of codes with their graph ancestors.

    ``codes`` maps a name to a list of top-level codes. For every code the
    result set contains the code itself plus ``nx.ancestors(onto, code)``,
    i.e. every node that has a directed path to it in ``onto``. Codes that
    are not nodes of ``onto`` are kept but reported with a "NOT FOUND" line.

    A summary line (name, number of input codes, size of the expanded set)
    is printed per name. Returns a dict name -> set of codes.
    """
    expanded = {}
    for name, top_codes in codes.items():
        members = set()
        for code in top_codes:
            members.add(code)
            if code not in onto:
                print("NOT FOUND:", code, name)
                continue
            members |= nx.ancestors(onto, code)
        expanded[name] = members
        print(name, len(top_codes), len(members))
    return expanded

In [None]:
# Load the ontology graph, then expand the single seed code for the 'HF'
# group with every node that has a path to it in that graph.
# NOTE(review): 84114007 is presumably the SNOMED CT concept for heart
# failure - confirm against the terminology release in use.
onto = load_onto('isa_rela_ch2pt_202009.pickle')
hf = expand_codes(onto, {'HF': ['84114007']})

In [None]:
# Flatten the per-group sets into one set of all HF-related codes.
hf_codes = set().union(*hf.values())

In [None]:
%%time 
# Load the pickled patient data. Later cells treat it as a mapping
# patient id -> {code -> collection of dates}.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/projects/data/GS/pt2cui_pos_dates.pickle', 'rb') as f:
    pt2cui_pos_dates = pickle.load(f)

In [None]:
def first_date_for_code_list(pt_data, codes, min_mentions=2):
    """Return the earliest date on which any qualifying code was recorded.

    Parameters
    ----------
    pt_data : mapping
        Maps a code to a collection of comparable date values.
    codes : iterable
        Codes to look for in ``pt_data``.
    min_mentions : int, optional
        A code only qualifies when it has at least this many recorded
        dates. Defaults to 2, the threshold that was previously hard-coded.

    Returns
    -------
    The minimum date over all qualifying codes, or ``None`` when no code
    in ``codes`` qualifies.
    """
    # Never call min() on an empty collection, even if min_mentions <= 0.
    required = max(min_mentions, 1)
    earliest_per_code = [
        min(pt_data[code])
        for code in codes
        if code in pt_data and len(pt_data[code]) >= required
    ]
    return min(earliest_per_code) if earliest_per_code else None

# For every patient, the earliest date of any HF code that meets the
# mention threshold of first_date_for_code_list (None when there is none).
first_mention_hf = {
    pt: first_date_for_code_list(pt_data, hf_codes)
    for pt, pt_data in pt2cui_pos_dates.items()
}

In [None]:
# Number of patients before filtering.
print(len(first_mention_hf))
# Keep only patients that actually have a first HF mention date. The result
# is bound to `filtered_first_mention_hf`, the name the following cells read;
# previously it was assigned back to `first_mention_hf`, so the next line
# raised NameError on a fresh kernel.
filtered_first_mention_hf = {k: v for k, v in first_mention_hf.items() if v is not None}
len(filtered_first_mention_hf)

In [None]:
# Convert the {patient id: first mention} dict into a two-column frame.
# NOTE: this rebinds the name from a dict to a DataFrame, so the cell is
# not safe to run twice without re-running the cell that builds the dict.
filtered_first_mention_hf = pd.DataFrame(
    list(filtered_first_mention_hf.items()),
    columns=['client_idcode', 'first_mention_hf'],
)
# Parse as UTC timestamps, then keep only the calendar date.
filtered_first_mention_hf['first_mention_hf'] = (
    pd.to_datetime(filtered_first_mention_hf['first_mention_hf'], utc=True).dt.date
)
filtered_first_mention_hf.head()

In [None]:
# Set of patient ids that have a first HF mention date; display its size.
hf_patients = set(filtered_first_mention_hf['client_idcode'])
len(hf_patients)

# LVEF (left ventricular ejection fraction)

In [None]:
# Read the echo-report table and drop the 'Unnamed: 0' column (a KeyError
# is raised if that column is absent, as before).
df = pd.read_csv(
    '/projects/data/GS/HF/from_jack/FULL_TEXT_cardiac_echo_report_v2_with_vars.csv'
).drop(columns=['Unnamed: 0'])

In [None]:
# Size of the echo-report table right after loading.
df.shape

In [None]:
# Keep only reports whose ef_value is at most 35; rows with a missing
# ef_value fail the comparison and are dropped as well.
# Original note here read "automatisation" - presumably a reminder to
# parameterise this threshold; confirm with the author.
df = df[df['ef_value'].le(35)]
df.shape

# Extracting LVEF data for each HF patient  

In [None]:
# Attach each HF patient's first-mention date to their echo reports. The
# right join keeps every HF patient; dropping rows without ef_value then
# removes the patients that had no matching report.
df = (
    pd.merge(df, filtered_first_mention_hf, on='client_idcode', how='right')
    .dropna(subset=['ef_value'])
)
df.shape

In [None]:
# Number of rows whose document_datecreated is missing.
len(df.loc[df['document_datecreated'].isna()])

In [None]:
# Keep only the columns used by the following cells.
df = df[['client_idcode', 'first_mention_hf', 'document_datecreated', 'ef_value']]

In [None]:
# Reduce the report timestamp to a calendar date (parsed as UTC).
df['document_datecreated'] = pd.to_datetime(df['document_datecreated'], utc=True).dt.date
# difference = first HF mention - report date, so it is negative when the
# report was created after the first mention.
df['difference'] = pd.to_datetime(df['first_mention_hf']) - pd.to_datetime(df['document_datecreated'])
# A report is in the window when it is dated on the day of the first
# mention or up to 182 days after it (both bounds inclusive).
min_time, max_time = datetime.timedelta(days=-182), datetime.timedelta(days=0)
t1 = df['difference'].ge(min_time)
t2 = df['difference'].le(max_time)
in_window = t1 & t2
df['in_window'] = in_window

In [None]:
# Preview the frame with the new 'difference' and 'in_window' columns.
df.head()

In [None]:
# Keep only the reports flagged as inside the time window.
df = df[df['in_window']]
df.shape

In [None]:
# True marks a repeated client_idcode, i.e. a patient with more than one
# in-window report; show how many rows are repeats.
duplicates = df['client_idcode'].duplicated()
duplicates.value_counts()

In [None]:
# Per patient, keep only the rows dated on that patient's earliest
# in-window report date (several rows can share that date).
earliest_doc = df.groupby('client_idcode')['document_datecreated'].transform('min')
df = df[df['document_datecreated'] == earliest_doc]
df.shape

In [None]:
# Preview the rows kept for each patient's earliest report date.
df.head()

In [None]:
# Re-check for patients that still have several rows (same earliest date).
duplicates = df['client_idcode'].duplicated()
duplicates.value_counts()

In [None]:
# Collapse to one row per patient: 'result' is the mean ef_value over the
# rows remaining for that patient, then only the first row is kept.
# Written as a chain that returns a new frame instead of assigning into /
# mutating a boolean-filtered frame in place, which avoids
# SettingWithCopyWarning and keeps the cell free of in-place side effects.
df = (
    df.assign(result=df.groupby('client_idcode')['ef_value'].transform('mean'))
    .drop_duplicates(subset='client_idcode', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Size after collapsing to one row per patient.
df.shape

In [None]:
# Patients that have a retained echo result, compared with the number of
# patients that have a first HF mention date.
ids = set(df['client_idcode'])
print(len(ids))
print(len(hf_patients))

In [None]:
# HF patients for whom no echo result was retained above.
no_lvef_ids = hf_patients - ids
len(no_lvef_ids)

In [None]:
# Preview the per-patient frame including the 'result' column.
df.head()

In [None]:
# First LVEF source: id, mean result and report date. The date column is
# renamed to 'studystartdatetime', the name used by the second source.
df1 = (
    df[['client_idcode', 'result', 'document_datecreated']]
    .rename(columns={'document_datecreated': 'studystartdatetime'})
)

In [None]:
# Size of the first LVEF source table.
df1.shape

# Load extra LVEF data 

In [None]:
# Read every CSV in the extra-LVEF folder into one frame.
# glob.glob returns files in arbitrary, filesystem-dependent order; sorting
# makes the row order of the concatenated frame reproducible, which matters
# because a later cell keeps the first row per patient.
path = '/projects/code/RALES Trial (ERROR)/LVEF/LVEF_not_spiro'
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [None]:
# Size of the concatenated extra-LVEF table.
df.shape

In [None]:
# Keep the needed columns, rename the patient id to match the HF table,
# and inner-join so that only patients with a first HF mention remain.
df = (
    df[['medicalrecordnumber', 'measname', 'measurevalue', 'studystartdatetime']]
    .rename(columns={'medicalrecordnumber': 'client_idcode'})
    .merge(filtered_first_mention_hf, on='client_idcode')
)
df.shape

In [None]:
# Keep only measurements whose measname is exactly 'EF(MOD-BP)'.
df = df[df['measname'].eq('EF(MOD-BP)')]

In [None]:
# Number of rows without a first HF mention date.
print(df['first_mention_hf'].isna().sum())
# Reduce the study timestamp to a calendar date (parsed as UTC).
df['studystartdatetime'] = pd.to_datetime(df['studystartdatetime'], utc=True).dt.date

In [None]:
%%time
# difference = first HF mention - study date, so it is negative when the
# study took place after the first mention.
df['difference'] = pd.to_datetime(df['first_mention_hf']) - pd.to_datetime(df['studystartdatetime'])
# A study is in the window when it is dated on the day of the first
# mention or up to 182 days after it (both bounds inclusive).
min_time, max_time = datetime.timedelta(days=-182), datetime.timedelta(days=0)
t1 = df['difference'].ge(min_time)
t2 = df['difference'].le(max_time)
in_window = t1 & t2
df['in_window'] = in_window

In [None]:
# Preview the frame with the new 'difference' and 'in_window' columns.
df.head()

In [None]:
# Keep only the studies flagged as inside the time window.
df = df.loc[df['in_window']]
df.shape

In [None]:
# Keep only measurements of at most 35; rows with a missing measurevalue
# fail the comparison and are dropped as well.
# Original note here read "automatisation" - presumably a reminder to
# parameterise this threshold; confirm with the author.
df = df[df['measurevalue'].le(35)]

In [None]:
# True marks a repeated client_idcode, i.e. a patient with more than one
# remaining measurement; show how many rows are repeats.
duplicates = df['client_idcode'].duplicated()
duplicates.value_counts()

In [None]:
# Per patient, keep only the rows dated on that patient's earliest
# remaining study date (several rows can share that date).
earliest_study = df.groupby('client_idcode')['studystartdatetime'].transform('min')
df = df[df['studystartdatetime'] == earliest_study]

In [None]:
# Re-check for patients that still have several rows (same earliest date).
duplicates = df['client_idcode'].duplicated()
duplicates.value_counts()

In [None]:
# Collapse to one row per patient: 'result' is the mean measurevalue over
# the rows remaining for that patient, then only the first row is kept.
# Written as a chain that returns a new frame instead of assigning into /
# mutating a boolean-filtered frame in place, which avoids
# SettingWithCopyWarning and keeps the cell free of in-place side effects.
df = (
    df.assign(result=df.groupby('client_idcode')['measurevalue'].transform('mean'))
    .drop_duplicates(subset='client_idcode', keep='first')
    .reset_index(drop=True)
)
df.shape

In [None]:
# 'extra' is True when the patient already has a result from the first
# source (is in `ids`); keep only the patients that do not.
df['extra'] = df['client_idcode'].isin(ids)
df = df.loc[~df['extra']]
df.shape

In [None]:
# Preview the patients contributed only by the second source.
df.head()

In [None]:
# Second LVEF source: id, study date and mean result per patient.
df2 = df[['client_idcode', 'studystartdatetime', 'result']]
df2.shape

# Final LVEF dataset

In [None]:
# Stack both LVEF sources into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; columns are aligned by name as before.
df = pd.concat([df1, df2], ignore_index=True)
df.shape

In [None]:
# Check that no patient appears in both sources after stacking.
duplicates = df['client_idcode'].duplicated()
duplicates.value_counts()

In [None]:
# Preview the combined LVEF table.
df.head()

In [None]:
# start_date is the day after the LVEF measurement date.
df['start_date'] = df['studystartdatetime'] + pd.DateOffset(days=1)

In [None]:
# Preview the table with the new 'start_date' column.
df.head()

In [None]:
# Lookup tables keyed by patient id: start date and LVEF result.
start_date = dict(zip(df['client_idcode'], df['start_date']))
lvef = dict(zip(df['client_idcode'], df['result']))

In [None]:
# Write patient id and start date (only these two columns, no index) to CSV.
df.to_csv('/projects/RALES TRIAL/1A/notspiro dataextraction/notspiro data/hf_patients.csv', columns=['client_idcode', 'start_date'], index=False)

In [None]:
# Persist the {patient id: start date} dict as a pickle.
with open ('/projects/RALES TRIAL/1A/notspiro dataextraction/notspiro data/hf_patients.pickle', 'wb') as f:
    pickle.dump(start_date,f)

In [None]:
# Persist the {patient id: LVEF result} dict as a pickle.
with open ('/projects/RALES TRIAL/1A/notspiro dataextraction/notspiro data/lvef.pickle', 'wb') as f:
    pickle.dump(lvef,f)

In [None]:
# Print the rows whose result is at most 35.
# NOTE(review): both sources were already filtered to values <= 35 before
# averaging, so this is expected to select every row - confirm. It also
# prints patient-level rows in plain text; review the output before sharing.
print(df.loc[df['result'] <= 35])