In [None]:
import pandas as pd 
import pickle
import datetime
import re
import networkx as nx
import numpy as np
from tqdm import tqdm

# Atrial fibrillation (AF) patients on warfarin

In [None]:
# Load the pickled dictionary keyed by patient id; its values are treated as the
# date of each patient's first warfarin mention in the cells below.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files here.
with open('/projects/ROCKET AF/1A/warfarin dataextraction/data/first_mention_warfarin.pickle', 'rb') as warfarin_file:
    warfarin = pickle.load(warfarin_file)

In [None]:
len(warfarin)

In [None]:
# Convert the warfarin dictionary into a two-column DataFrame: each key goes to
# 'client_idcode' and each value goes to 'first_mention_warfarin'.
df = pd.DataFrame(list(warfarin.items()), columns=['client_idcode', 'first_mention_warfarin'])
df.shape

In [None]:
df.head()

In [None]:
ids = set(df['client_idcode'])
len(ids)

# AF patients' blood pressure

In [None]:
blood_pressure = pd.read_csv('/projects/ROCKET AF/data/blood_pressure.csv')

In [None]:
blood_pressure.shape

In [None]:
blood_pressure.head()

In [None]:
# Drop the 'Unnamed: 0' column read from the CSV.
# errors='ignore' keeps the cell idempotent: re-running it after the column has
# already been removed no longer raises a KeyError (as `del` did).
blood_pressure = blood_pressure.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
blood_pressure.head()

In [None]:
blood_pressure['obscatalogmasteritem_displayname'].unique()

In [None]:
blood_pressure['observationdocument_createdwhen'].isna().sum()

In [None]:
blood_pressure['obscatalogmasteritem_unitofmeasure'].unique()

# Systolic blood pressure

In [None]:
systolic_blood_pressure = blood_pressure.loc[blood_pressure['obscatalogmasteritem_displayname'] == 'NEWS_Systolic_BP']
systolic_blood_pressure.shape

In [None]:
systolic_blood_pressure.head()

In [None]:
# Keep only the columns needed downstream. The explicit .copy() makes this an
# independent DataFrame rather than a slice of `blood_pressure`, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
systolic_blood_pressure = systolic_blood_pressure[['client_idcode', 'observation_valuetext_analysed', 'observationdocument_createdwhen']].copy()

In [None]:
systolic_blood_pressure['first_warfarin_prescription'] = systolic_blood_pressure['client_idcode'].map(warfarin)

In [None]:
systolic_blood_pressure.head()

In [None]:
# Parse the observation timestamp as timezone-aware UTC, then keep only the
# calendar date (the time of day is dropped).
systolic_blood_pressure['observationdocument_createdwhen'] = pd.to_datetime(systolic_blood_pressure['observationdocument_createdwhen'], utc=True).dt.date 

In [None]:
systolic_blood_pressure.head()

In [None]:
# Time from the blood-pressure observation to the first warfarin mention.
# Positive values mean the observation was recorded before the warfarin date.
systolic_blood_pressure['difference'] = pd.to_datetime(systolic_blood_pressure['first_warfarin_prescription']) - pd.to_datetime(systolic_blood_pressure['observationdocument_createdwhen'])

In [None]:
systolic_blood_pressure.head()

In [None]:
# Baseline window: observations taken between 1 and 183 days (both inclusive)
# before the first warfarin mention.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)
t1 = systolic_blood_pressure['difference'].ge(min_time)  # at least one day before
t2 = systolic_blood_pressure['difference'].le(max_time)  # no more than 183 days before
in_window = t1 & t2
systolic_blood_pressure['in_window'] = in_window

In [None]:
# Keep only the observations flagged as inside the baseline window.
# 'in_window' is already a boolean mask (built with `&` from two comparisons),
# so it is used directly instead of being compared with `== True`.
systolic_blood_pressure = systolic_blood_pressure.loc[systolic_blood_pressure['in_window']]

In [None]:
systolic_blood_pressure.shape

In [None]:
systolic_blood_pressure.head()

In [None]:
# For each patient, find the smallest 'difference', i.e. the observation date
# closest to (but before) the first warfarin mention.

min_values = systolic_blood_pressure.groupby('client_idcode')['difference'].transform('min')

# Keep every row whose 'difference' equals that per-patient minimum; several
# readings recorded on the same date are all retained.
systolic_blood_pressure = systolic_blood_pressure[systolic_blood_pressure['difference'] == min_values]

In [None]:
systolic_blood_pressure.shape

In [None]:
systolic_blood_pressure.head()

In [None]:
systolic_blood_pressure['client_idcode'].nunique()

In [None]:
systolic_blood_pressure = systolic_blood_pressure.sort_values(by='client_idcode')
systolic_blood_pressure.head()

In [None]:
# Group by 'client_idcode' and average the retained systolic readings
# ('observation_valuetext_analysed') so that each patient has a single value.
# NOTE(review): assumes 'observation_valuetext_analysed' is numeric — confirm its dtype.

systolic_blood_pressure_avg = systolic_blood_pressure.groupby('client_idcode')['observation_valuetext_analysed'].mean().reset_index()
    

In [None]:
systolic_blood_pressure_avg.shape

In [None]:
systolic_blood_pressure_avg.head()

In [None]:
# Keep only patients whose averaged systolic reading is strictly above 140.
# NOTE(review): presumably mmHg and a hypertension cut-off — confirm the unit and threshold.
systolic_blood_pressure_avg = systolic_blood_pressure_avg[systolic_blood_pressure_avg['observation_valuetext_analysed'] > 140]

In [None]:
systolic_blood_pressure_avg.shape

In [None]:
systolic_blood_pressure_avg.head()

In [None]:
# Map each remaining patient id to their averaged systolic value.
sbp = dict(zip(systolic_blood_pressure_avg['client_idcode'], systolic_blood_pressure_avg['observation_valuetext_analysed']))

In [None]:
# Write the patient id -> averaged systolic value dictionary to disk.
with open('/projects/ROCKET AF/1A/warfarin dataextraction/data/sbp.pickle', 'wb') as sbp_file:
    pickle.dump(sbp, sbp_file)

# Diastolic blood pressure

In [None]:
diastolic_blood_pressure = blood_pressure.loc[blood_pressure['obscatalogmasteritem_displayname'] == 'NEWS_Diastolic_BP']
diastolic_blood_pressure.shape

In [None]:
diastolic_blood_pressure.head()

In [None]:
# Keep only the columns needed downstream. The explicit .copy() makes this an
# independent DataFrame rather than a slice of `blood_pressure`, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
diastolic_blood_pressure = diastolic_blood_pressure[['client_idcode', 'observation_valuetext_analysed', 'observationdocument_createdwhen']].copy()

In [None]:
diastolic_blood_pressure['first_warfarin_prescription'] = diastolic_blood_pressure['client_idcode'].map(warfarin)

In [None]:
diastolic_blood_pressure.head()

In [None]:
# Parse the observation timestamp as timezone-aware UTC, then keep only the
# calendar date (the time of day is dropped).
diastolic_blood_pressure['observationdocument_createdwhen'] = pd.to_datetime(diastolic_blood_pressure['observationdocument_createdwhen'], utc=True).dt.date 

In [None]:
diastolic_blood_pressure.head()

In [None]:
# Time from the blood-pressure observation to the first warfarin mention.
# Positive values mean the observation was recorded before the warfarin date.
diastolic_blood_pressure['difference'] = pd.to_datetime(diastolic_blood_pressure['first_warfarin_prescription']) - pd.to_datetime(diastolic_blood_pressure['observationdocument_createdwhen'])

In [None]:
diastolic_blood_pressure.head()

In [None]:
# Baseline window: observations taken between 1 and 183 days (both inclusive)
# before the first warfarin mention.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)
t1 = diastolic_blood_pressure['difference'].ge(min_time)  # at least one day before
t2 = diastolic_blood_pressure['difference'].le(max_time)  # no more than 183 days before
in_window = t1 & t2
diastolic_blood_pressure['in_window'] = in_window

In [None]:
# Keep only the observations flagged as inside the baseline window.
# 'in_window' is already a boolean mask (built with `&` from two comparisons),
# so it is used directly instead of being compared with `== True`.
diastolic_blood_pressure = diastolic_blood_pressure.loc[diastolic_blood_pressure['in_window']]

In [None]:
diastolic_blood_pressure.shape

In [None]:
diastolic_blood_pressure.head()

In [None]:
# For each patient, find the smallest 'difference', i.e. the observation date
# closest to (but before) the first warfarin mention.

min_values = diastolic_blood_pressure.groupby('client_idcode')['difference'].transform('min')

# Keep every row whose 'difference' equals that per-patient minimum; several
# readings recorded on the same date are all retained.
diastolic_blood_pressure = diastolic_blood_pressure[diastolic_blood_pressure['difference'] == min_values]

In [None]:
diastolic_blood_pressure.shape

In [None]:
diastolic_blood_pressure.head()

In [None]:
diastolic_blood_pressure['client_idcode'].nunique()

In [None]:
diastolic_blood_pressure = diastolic_blood_pressure.sort_values(by='client_idcode')
diastolic_blood_pressure.head()

In [None]:
# Group by 'client_idcode' and average the retained diastolic readings
# ('observation_valuetext_analysed') so that each patient has a single value.
# NOTE(review): assumes 'observation_valuetext_analysed' is numeric — confirm its dtype.

diastolic_blood_pressure_avg = diastolic_blood_pressure.groupby('client_idcode')['observation_valuetext_analysed'].mean().reset_index()
    

In [None]:
diastolic_blood_pressure_avg.shape

In [None]:
diastolic_blood_pressure_avg.head()

In [None]:
# Keep only patients whose averaged diastolic reading is strictly above 90.
# NOTE(review): presumably mmHg and a hypertension cut-off — confirm the unit and threshold.
diastolic_blood_pressure_avg = diastolic_blood_pressure_avg[diastolic_blood_pressure_avg['observation_valuetext_analysed'] > 90]

In [None]:
diastolic_blood_pressure_avg.shape

In [None]:
diastolic_blood_pressure_avg.head()

In [None]:
# Map each remaining patient id to their averaged diastolic value.
dbp = dict(zip(diastolic_blood_pressure_avg['client_idcode'], diastolic_blood_pressure_avg['observation_valuetext_analysed']))

In [None]:
# Write the patient id -> averaged diastolic value dictionary to disk.
with open('/projects/ROCKET AF/1A/warfarin dataextraction/data/dbp.pickle', 'wb') as dbp_file:
    pickle.dump(dbp, dbp_file)

----------

# AF patients with high blood pressure

In [None]:
def graph_from_onto(onto):
    """Build a directed graph from an ontology mapping.

    Parameters
    ----------
    onto : mapping
        Maps a source code (str) to an iterable of target codes (str).

    Returns
    -------
    networkx.DiGraph
        One edge ``source -> target`` per (key, value-item) pair, with every
        occurrence of the substring ``'S-'`` removed from both codes.
    """
    def clean(code):
        # str.replace removes every 'S-' occurrence, not only a leading one.
        return code.replace('S-', '')

    graph = nx.DiGraph()
    graph.add_edges_from(
        (clean(source), clean(target))
        for source, targets in onto.items()
        for target in targets
    )
    return graph

def load_onto(o):
    """Load a pickled ontology mapping and return it as a directed graph.

    Parameters
    ----------
    o : str
        File name of the pickle, looked up under ``/projects/data/GS/``.

    Returns
    -------
    The graph produced by ``graph_from_onto`` for the unpickled mapping.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    onto_path = f'/projects/data/GS/{o}'
    with open(onto_path, 'rb') as onto_file:
        mapping = pickle.load(onto_file)
    return graph_from_onto(mapping)

def expand_codes(onto, codes):
    """Expand each named list of top-level codes with related graph nodes.

    Parameters
    ----------
    onto : graph
        Ontology graph; must support ``code in onto`` and ``nx.ancestors``.
    codes : dict
        Maps a concept name to a list of top-level codes.

    Returns
    -------
    dict
        Maps each concept name to a set holding its top-level codes plus, for
        every code present in the graph, all nodes returned by
        ``nx.ancestors`` (the nodes that have a path to that code).

    Prints a "NOT FOUND" line for every code missing from the graph, and one
    summary line per concept (name, number of top codes, expanded size).
    """
    expanded = {}
    for name, top_codes in codes.items():
        members = set()
        for code in top_codes:
            # The top-level code is always kept, even when absent from the graph.
            members.add(code)
            if code not in onto:
                print("NOT FOUND:", code, name)
                continue
            members.update(nx.ancestors(onto, code))
        expanded[name] = members
        print(name, len(top_codes), len(members))
    return expanded

In [None]:
onto = load_onto('isa_rela_ch2pt_202009.pickle')

In [None]:
# Top-level code(s) per concept; expand_codes then adds the related codes found
# in the ontology graph and rebinds `db` to the expanded sets.
# NOTE(review): '38341003' is presumably the SNOMED CT hypertensive-disorder concept — confirm.
db = {'hypertension' : ['38341003']}
db = expand_codes(onto,db)

In [None]:
%%time 
with open('/projects/data/GS/pt2cui_pos_dates.pickle', 'rb') as f:
    pt2cui_pos_dates = pickle.load(f)

In [None]:
def dates_ex(pt_data, codes):
    """Collect the recorded dates of every qualifying code for one patient.

    A code qualifies when it is present in ``pt_data`` and has at least two
    recorded entries; codes with a single entry are ignored.

    Parameters
    ----------
    pt_data : mapping
        Maps a code to a set-like collection of dates for one patient
        (the collection must support ``len`` and ``union``).
    codes : iterable
        Codes to look up.

    Returns
    -------
    The union of the collections of all qualifying codes, or ``None`` when no
    code qualifies. When exactly one code qualifies, the collection stored in
    ``pt_data`` is returned as-is (not copied).
    """
    dates = None
    for code in codes:
        if code in pt_data and len(pt_data[code]) >= 2:
            # Identity check: `== None` would invoke the collection's own
            # equality operator instead of testing for "nothing found yet".
            if dates is None:
                dates = pt_data[code]
            else:
                dates = dates.union(pt_data[code])
    return dates

# Build one record per warfarin patient: the patient id, the first warfarin
# mention, and for each concept in `db` the collected dates (NaN when the
# patient has no qualifying code for that concept).
rows = []
for pt in ids:
    # Patients without any coded data fall back to an empty mapping.
    pt_data = pt2cui_pos_dates.get(pt, {})
    row = {'client_idcode': pt, 'first_mention_warfarin': warfarin[pt]}
    for concept, codes in db.items():
        dates = dates_ex(pt_data, codes)
        # `is None` tests identity; `== None` would call the date collection's
        # equality operator.
        row[f"{concept}_date"] = np.nan if dates is None else dates
    rows.append(row)

In [None]:
df = pd.DataFrame(rows)
df.shape

In [None]:
df.head()

In [None]:
# Drop patients with no hypertension dates. Reassigning instead of using
# inplace=True keeps the step explicit and avoids mutating a frame that an
# earlier cell has already displayed.
df = df.dropna(subset=['hypertension_date'], how='all')

In [None]:
df.head()

In [None]:
%%time
# Rebuild df as 'client_idcode' joined (on the row index) with every other
# column exploded, so a patient with several hypertension dates gets one row
# per date. Iterating df.iloc[:, 1:] yields the remaining column labels.
df = df[['client_idcode']].join((df[i].explode() for i in df.iloc[:,1:]))
df.shape

In [None]:
df.head()

In [None]:
# Normalise each concept's exploded date column to plain calendar dates.
for k in db.keys():
    df[f'{k}_date'] = pd.to_datetime(df[f'{k}_date']).dt.date

In [None]:
df.head()

In [None]:
# For each concept, time from the coded mention to the first warfarin mention;
# positive values mean the concept was recorded before the warfarin date.
for k in db.keys():
    df[f'{k}_delta'] = pd.to_datetime(df['first_mention_warfarin']) - pd.to_datetime(df[f'{k}_date'])

In [None]:
df.head()

In [None]:
# Baseline window: mentions recorded between 1 and 183 days (both inclusive)
# before the first warfarin mention.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)
for k in db:
    delta = df[f'{k}_delta']
    t1 = delta.ge(min_time)  # at least one day before
    t2 = delta.le(max_time)  # no more than 183 days before
    in_window = t1 & t2
    df[f'{k}_in_window'] = in_window

In [None]:
df.head()

In [None]:
# Keep only the hypertension mentions that fall inside the baseline window.
# The column is already a boolean mask, so it is used directly instead of
# being compared with `== True`.
df = df.loc[df['hypertension_in_window']]

In [None]:
df.head()

In [None]:
# Unique ids of the patients that still have at least one row in df.
hypertension = set(df['client_idcode'])
len(hypertension)

In [None]:
# Write the set of hypertension patient ids to disk.
with open('/projects/ROCKET AF/1A/warfarin dataextraction/data/hypertension.pickle', 'wb') as hypertension_file:
    pickle.dump(hypertension, hypertension_file)