In [1]:
import pandas as pd 
import pickle
import datetime
import re
import networkx as nx
import numpy as np

# Atrial fibrillation (AF) patients on rivaroxaban

In [2]:
# Load the pickled object that is used below as a mapping of
# client_idcode -> first rivaroxaban mention.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/first_mention_rivaroxaban.pickle', 'rb') as fh:
    rivaroxaban = pickle.load(fh)

In [3]:
len(rivaroxaban)

2585

In [4]:
# Turn the `rivaroxaban` dict into a two-column frame:
# one row per (client_idcode, first_mention_rivaroxaban) item.
df = pd.DataFrame(
    list(rivaroxaban.items()),
    columns=['client_idcode', 'first_mention_rivaroxaban'],
)
df.shape

(2585, 2)

In [5]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban
0,0372858,2015-03-02
1,M282816,2018-10-25
2,V282449,2018-08-02
3,V459668,2015-06-25
4,R081687,2015-04-27


In [6]:
# Distinct patient identifiers of the rivaroxaban cohort.
ids = {client_id for client_id in df['client_idcode']}
len(ids)

2585

# AF patients on rivaroxaban with history of ischemic stroke & transient ischemic attack (TIA)

In [7]:
def graph_from_onto(onto):
    """Build a directed graph from an ontology relation mapping.

    Args:
        onto: Mapping of a source code to an iterable of target codes. Codes
            must be strings, since ``str.replace`` is called on each of them.

    Returns:
        A ``networkx.DiGraph`` with one edge ``source -> target`` per listed
        pair. Every occurrence of the substring ``'S-'`` is removed from both
        codes before the edge is added.
    """
    def _clean(code):
        return code.replace('S-', '')

    graph = nx.DiGraph()
    graph.add_edges_from(
        (_clean(source), _clean(target))
        for source, targets in onto.items()
        for target in targets
    )
    return graph

def load_onto(o):
    """Load a pickled ontology mapping and return it as a directed graph.

    Args:
        o: File name of the pickle inside ``/projects/data/GS/``.

    Returns:
        The graph produced by ``graph_from_onto`` for the unpickled mapping.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only load
    trusted files.
    """
    onto_path = f'/projects/data/GS/{o}'
    with open(onto_path, 'rb') as fh:
        relations = pickle.load(fh)
    return graph_from_onto(relations)

def expand_codes(onto, codes):
    """Expand each group of top-level codes with related codes from the graph.

    Args:
        onto: Graph (or any container supporting ``in``) holding the codes.
        codes: Mapping of a group name to an iterable of top-level codes.

    Returns:
        Dict mapping each group name to a set containing its top-level codes
        plus, for every code present in ``onto``, all nodes returned by
        ``nx.ancestors(onto, code)`` (nodes with a directed path to the code).

    Side effects:
        Prints ``NOT FOUND: <code> <name>`` for codes missing from ``onto``,
        and one summary line per group (name, number of top-level codes,
        size of the expanded set).
    """
    expanded = {}
    for name, top_codes in codes.items():
        members = set()
        for code in top_codes:
            # The top-level code is always kept, even if the graph lacks it.
            members.add(code)
            if code not in onto:
                print("NOT FOUND:", code, name)
                continue
            members.update(nx.ancestors(onto, code))
        expanded[name] = members
        print(name, len(top_codes), len(members))
    return expanded

In [8]:
# Build the ontology graph once; expand_codes queries it to widen the code lists.
# NOTE(review): the file name suggests child -> parent "is-a" relations — confirm.
onto = load_onto('isa_rela_ch2pt_202009.pickle')

In [9]:
# Top-level concept codes per inclusion criterion (presumably SNOMED CT
# concept ids — TODO confirm). expand_codes adds the related codes found
# in the ontology graph and prints a summary line per criterion.
inclusion_criteria = {
    'ischemic_stroke': ['422504002'],
    'transient_ischemic_attack': ['266257000'],
}
ic = expand_codes(onto, inclusion_criteria)

ischemic_stroke 1 11
transient_ischemic_attack 1 14


In [11]:
%%time 
# Load the per-patient lookup used below as
# patient id -> {concept code -> collection of mention dates}.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/projects/data/GS/pt2cui_pos_dates.pickle', 'rb') as fh:
    pt2cui_pos_dates = pickle.load(fh)

CPU times: user 2min 7s, sys: 32.6 s, total: 2min 40s
Wall time: 2min 40s


In [22]:
def dates_ex(pt_data, codes):
    """Collect the mention dates a patient has for any of the given codes.

    Only codes with at least two recorded dates contribute; codes that are
    absent from ``pt_data`` or have fewer than two dates are ignored.

    Args:
        pt_data: Mapping of concept code to a collection of dates. Each
            collection must support ``len`` and ``.union`` (e.g. a set).
        codes: Iterable of concept codes to look up.

    Returns:
        ``None`` when no code qualifies. Otherwise the union of the date
        collections of all qualifying codes. When exactly one code
        qualifies, the stored collection itself is returned (not a copy).
    """
    dates = None
    for code in codes:
        if code not in pt_data or len(pt_data[code]) < 2:
            continue
        # Identity check: `== None` would be evaluated element-wise by
        # array-like containers and then fail inside the `if`.
        if dates is None:
            dates = pt_data[code]
        else:
            dates = dates.union(pt_data[code])
    return dates

# Build one record per cohort patient: id, first rivaroxaban mention, and,
# for every inclusion criterion, either the collected mention dates or NaN.
rows = []
for pt in ids:
    # Patients without any recorded concepts fall back to an empty mapping.
    pt_data = pt2cui_pos_dates.get(pt, {})
    row = {'client_idcode': pt, 'first_mention_rivaroxaban': rivaroxaban[pt]}
    for concept, codes in ic.items():
        dates = dates_ex(pt_data, codes)
        # `is None` rather than `== None`: the result is a container, and
        # equality against None is the wrong test for "nothing found".
        row[f"{concept}_date"] = np.nan if dates is None else dates
    rows.append(row)

In [23]:
# Rebind `df` to the per-patient records built in `rows`: client_idcode,
# first_mention_rivaroxaban, and one "<concept>_date" column per criterion.
df = pd.DataFrame(rows)
df.shape

(2585, 4)

In [24]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,ischemic_stroke_date,transient_ischemic_attack_date
0,V209282,2016-08-18,,
1,R158068,2017-08-23,,
2,V692148,2017-03-23,,
3,F115915,2019-11-01,,"{2019-11-01 15:52:00, 2019-10-28 10:34:00, 201..."
4,M270753,2018-12-16,,


In [27]:
# Keep only patients that have dates for at least one of the two concepts
# (how='all' drops a row only when both date columns are missing).
df = df.dropna(
    subset=['ischemic_stroke_date', 'transient_ischemic_attack_date'],
    how='all',
)

In [28]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,ischemic_stroke_date,transient_ischemic_attack_date
3,F115915,2019-11-01,,"{2019-11-01 15:52:00, 2019-10-28 10:34:00, 201..."
8,M381390,2018-09-26,,"{2018-05-16 03:14:25.300000, 2018-05-15 21:02:00}"
9,P252924,2013-08-28,,"{2017-02-03 09:02:45.933000, 2013-05-22 14:23:..."
21,P775471,2018-05-02,"{2018-05-22 12:03:00, 2018-05-24 14:44:41.6370...",
31,P299512,2022-01-17,"{2019-09-24 09:40:02.833000, 2018-07-31 18:30:...","{2019-08-29 23:17:00, 2018-10-29 15:47:06.1100..."


In [29]:
# Spot check: select the rows of one example patient (id hard-coded here).
row = df.loc[df['client_idcode'] == 'F115915']

In [30]:
# Example patient id shown in the spot-check messages; keep it in sync with
# the literal id used to select `row`.
specific_client_idcode = 'F115915'

In [31]:
# Print the raw set stored in 'transient_ischemic_attack_date' for the
# example patient, or a message when the patient is not in the frame.
if row.empty:
    print(f"No matching ID found for {specific_client_idcode}")
else:
    specific_set = row.iloc[0]['transient_ischemic_attack_date']
    print(f"Set for ID {specific_client_idcode}: {specific_set}")

Set for ID F115915: {datetime.datetime(2019, 11, 1, 15, 52), datetime.datetime(2019, 10, 28, 10, 34), datetime.datetime(2019, 10, 21, 16, 20), datetime.datetime(2019, 10, 21, 16, 35)}


In [32]:
df['client_idcode'].nunique()

476

In [33]:
%%time
# Go from one row per patient to one row per date:
#   * iterating `df.iloc[:,1:]` yields the column labels after client_idcode;
#   * `.explode()` gives each element of a set its own row and repeats the
#     original index label (scalars / NaN are kept as a single row);
#   * `.join` then aligns the exploded columns on that index.
# NOTE(review): because the exploded columns share repeated index labels, a
# patient with dates in both concept columns gets one row per combination of
# the two. The per-column "in window" flags computed later are unaffected, but
# row counts are not event counts.
df = df[['client_idcode']].join((df[i].explode() for i in df.iloc[:,1:]))
df.shape

CPU times: user 18.1 ms, sys: 3.52 ms, total: 21.6 ms
Wall time: 19.4 ms


(9039, 4)

In [34]:
df.head(6)

Unnamed: 0,client_idcode,first_mention_rivaroxaban,ischemic_stroke_date,transient_ischemic_attack_date
3,F115915,2019-11-01,NaT,2019-11-01 15:52:00.000
3,F115915,2019-11-01,NaT,2019-10-28 10:34:00.000
3,F115915,2019-11-01,NaT,2019-10-21 16:20:00.000
3,F115915,2019-11-01,NaT,2019-10-21 16:35:00.000
8,M381390,2018-09-26,NaT,2018-05-16 03:14:25.300
8,M381390,2018-09-26,NaT,2018-05-15 21:02:00.000


In [35]:
# Truncate every "<concept>_date" column to calendar dates (time of day dropped).
for k in ic:
    date_col = f'{k}_date'
    df[date_col] = pd.to_datetime(df[date_col]).dt.date

In [36]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,ischemic_stroke_date,transient_ischemic_attack_date
3,F115915,2019-11-01,NaT,2019-11-01
3,F115915,2019-11-01,NaT,2019-10-28
3,F115915,2019-11-01,NaT,2019-10-21
3,F115915,2019-11-01,NaT,2019-10-21
8,M381390,2018-09-26,NaT,2018-05-16


In [38]:
# "<concept>_delta" = first rivaroxaban mention minus concept date, so a
# positive value means the concept was recorded before rivaroxaban.
first_mention = pd.to_datetime(df['first_mention_rivaroxaban'])
for k in ic:
    df[f'{k}_delta'] = first_mention - pd.to_datetime(df[f'{k}_date'])

In [39]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,ischemic_stroke_date,transient_ischemic_attack_date,ischemic_stroke_delta,transient_ischemic_attack_delta
3,F115915,2019-11-01,NaT,2019-11-01,NaT,0 days
3,F115915,2019-11-01,NaT,2019-10-28,NaT,4 days
3,F115915,2019-11-01,NaT,2019-10-21,NaT,11 days
3,F115915,2019-11-01,NaT,2019-10-21,NaT,11 days
8,M381390,2018-09-26,NaT,2018-05-16,NaT,133 days


In [41]:
# Flag concept dates lying 1 to 183 days (both inclusive) before the first
# rivaroxaban mention. Same-day mentions (delta 0), later mentions (negative
# delta) and missing dates (NaT) all evaluate to False.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)
for k in ic:
    delta = df[f'{k}_delta']
    df[f'{k}_in_window'] = (delta >= min_time) & (delta <= max_time)

In [42]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,ischemic_stroke_date,transient_ischemic_attack_date,ischemic_stroke_delta,transient_ischemic_attack_delta,ischemic_stroke_in_window,transient_ischemic_attack_in_window
3,F115915,2019-11-01,NaT,2019-11-01,NaT,0 days,False,False
3,F115915,2019-11-01,NaT,2019-10-28,NaT,4 days,False,True
3,F115915,2019-11-01,NaT,2019-10-21,NaT,11 days,False,True
3,F115915,2019-11-01,NaT,2019-10-21,NaT,11 days,False,True
8,M381390,2018-09-26,NaT,2018-05-16,NaT,133 days,False,True


In [43]:
# Rows whose ischemic-stroke date falls inside the look-back window,
# reduced to the patient id and the flag column.
stroke_mask = df['ischemic_stroke_in_window'] == True
ischemic_stroke = df.loc[stroke_mask, ['client_idcode', 'ischemic_stroke_in_window']]
ischemic_stroke.shape

(824, 2)

In [44]:
ischemic_stroke.head()

Unnamed: 0,client_idcode,ischemic_stroke_in_window
21,P775471,True
69,R081687,True
69,R081687,True
182,0259488,True
182,0259488,True


In [51]:
# Collapse the per-date rows to the distinct patient ids. The result must be
# bound to `ischemic_stroke` (the original bound it to the misspelled
# `ichemic_stroke`), because `ischemic_stroke` is what gets measured here and
# written to ischemic_stroke.pickle afterwards.
# NOTE: this rebinds the name from a DataFrame to a set, so the cell cannot be
# re-run without first re-running the cell that builds the DataFrame.
ischemic_stroke = set(ischemic_stroke['client_idcode'])
len(ischemic_stroke)

38

In [53]:
# Persist the current `ischemic_stroke` object for downstream steps.
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/ischemic_stroke.pickle', 'wb') as fh:
    pickle.dump(ischemic_stroke, fh)

In [49]:
# Rows whose TIA date falls inside the look-back window, reduced to the
# patient id and the flag column.
tia_mask = df['transient_ischemic_attack_in_window'] == True
transient_ischemic_attack = df.loc[tia_mask, ['client_idcode', 'transient_ischemic_attack_in_window']]
transient_ischemic_attack.shape

(1238, 2)

In [50]:
transient_ischemic_attack.head()

Unnamed: 0,client_idcode,transient_ischemic_attack_in_window
3,F115915,True
3,F115915,True
3,F115915,True
8,M381390,True
8,M381390,True


In [54]:
# Collapse the per-date rows to the distinct patient ids.
# NOTE: this rebinds the name from a DataFrame to a set, so the cell cannot be
# re-run without first re-running the cell that builds the DataFrame.
transient_ischemic_attack = {
    client_id for client_id in transient_ischemic_attack['client_idcode']
}
len(transient_ischemic_attack)

179

In [55]:
# Persist the current `transient_ischemic_attack` object for downstream steps.
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/transient_ischemic_attack.pickle', 'wb') as fh:
    pickle.dump(transient_ischemic_attack, fh)