In [64]:
import pandas as pd 
import pickle
import datetime
import re
import networkx as nx
import numpy as np

# Atrial fibrillation (AF) patients on rivaroxaban

In [12]:
# Load the dictionary that the next cell turns into a two-column table
# (patient id -> first rivaroxaban mention).
# NOTE: pickle.load can execute arbitrary code; only use it on trusted project files.
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/first_mention_rivaroxaban.pickle', 'rb') as pickle_file:
    rivaroxaban = pickle.load(pickle_file)

In [13]:
len(rivaroxaban)

2585

In [14]:
# One row per dictionary entry: the key goes to 'client_idcode',
# the value to 'first_mention_rivaroxaban'.
records = list(rivaroxaban.items())
df = pd.DataFrame(records, columns=['client_idcode', 'first_mention_rivaroxaban'])
df.shape

(2585, 2)

In [15]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban
0,0372858,2015-03-02
1,M282816,2018-10-25
2,V282449,2018-08-02
3,V459668,2015-06-25
4,R081687,2015-04-27


In [16]:
# Unique patient identifiers of the rivaroxaban table; the heart-failure
# section further down iterates over this set.
ids = set(df['client_idcode'])
len(ids)

2585

# AF patients on rivaroxaban with a left ventricular ejection fraction (LVEF) ≤35%

In [17]:
# Cardiac echo report extract; the extracted ejection fraction is in 'ef_value'.
lvef = pd.read_csv('/projects/data/GS/HF/from_jack/FULL_TEXT_cardiac_echo_report_v2_with_vars.csv')

In [18]:
lvef.shape

(245484, 21)

In [19]:
lvef.head()

Unnamed: 0.1,Unnamed: 0,document_dateadded,document_description,document_guid,client_idcode,document_datecreated,body_analysed,ef_value,ef_text,lvidd,...,lvpwd,e_e_prime_avg,e_e_prime_lat,e_e_prime_sept,tr_max_vel,pasp_v1,pasp_v2,lavi,gls,tapse
0,0,2014-01-13T16:30:00.903+0000,Cardiac - Echo Reports,999308380,R044506,2014-01-13T16:29:55.167+0000,\nStudy ID: 289702\n\nInterpretation Summary \...,55.0,preserved,5.2,...,1.3,,,,279.0,,,,,2.2
1,1,2016-06-14T09:35:25.560+0100,Cardiac - Echo Reports,1074577738,P517376,2016-06-14T09:35:25.733+0100,\nStudy ID: 402414\n\nCardiology Department \n...,69.4,preserved,4.1,...,1.1,13.0,14.0,12.0,,,,16.3,,1.9
2,2,2016-07-04T10:25:32.120+0100,Cardiac - Echo Reports,1075890234,P250150,2016-07-04T10:25:40.297+0100,\nStudy ID: 404983\n\nCardiology Department \n...,,,,...,,,,,,,,,,
3,3,2014-01-13T19:54:11.850+0000,Cardiac - Echo Reports,999311874,0730904,2014-01-13T19:53:52.203+0000,\nStudy ID: 289701\n\nInterpretation Summary \...,48.0,impaired,5.9,...,1.2,,,10.0,,,,,,2.0
4,4,2017-07-14T09:05:33.157+0100,Cardiac - Echo Reports,1106856686,M874841,2017-07-14T09:05:33.470+0100,\nStudy ID: 455585\n\nCardiology Department \n...,47.0,impaired,4.8,...,0.92,6.0,6.0,7.0,256.0,,,,,2.3


In [20]:
lvef['document_datecreated'].isna().sum()

0

In [23]:
# Keep only the patient id, the report creation timestamp and the EF value;
# the other echo variables are not used below.
lvef = lvef[['client_idcode', 'document_datecreated', 'ef_value']]

In [24]:
lvef.head()

Unnamed: 0,client_idcode,document_datecreated,ef_value
0,R044506,2014-01-13T16:29:55.167+0000,55.0
1,P517376,2016-06-14T09:35:25.733+0100,69.4
2,P250150,2016-07-04T10:25:40.297+0100,
3,0730904,2014-01-13T19:53:52.203+0000,48.0
4,M874841,2017-07-14T09:05:33.470+0100,47.0


In [25]:
lvef.shape

(245484, 3)

In [26]:
# Keep only reports with an extracted EF value.
# Reassigning instead of dropna(..., inplace=True) avoids mutating a
# column-subset of the raw frame in place, which pandas flags with
# SettingWithCopyWarning.
lvef = lvef.dropna(subset=['ef_value'])

In [27]:
lvef.shape

(164628, 3)

In [28]:
# Attach every EF measurement to its patient. The right join keeps all
# rivaroxaban patients; those without an echo get a single row with NaN echo columns.
# validate='many_to_one' requires 'client_idcode' to be unique in the right-hand
# table, so accidentally re-running this cell on the already merged `df`
# raises a MergeError instead of silently multiplying rows.
df = pd.merge(lvef, df, on='client_idcode', how='right', validate='many_to_one')

In [29]:
df.shape

(5084, 4)

In [30]:
df.head()

Unnamed: 0,client_idcode,document_datecreated,ef_value,first_mention_rivaroxaban
0,0372858,,,2015-03-02
1,M282816,2019-02-15T10:50:25.223+0000,39.0,2018-10-25
2,M282816,2018-11-16T13:00:24.540+0000,43.2,2018-10-25
3,M282816,2019-10-09T15:14:58.283+0100,66.0,2018-10-25
4,V282449,,,2018-08-02


In [31]:
df['ef_value'].isna().sum()

1111

In [32]:
# Remove the placeholder rows of patients who have no EF measurement
# (NaN 'ef_value' produced by the right join).
df.dropna(subset=['ef_value'], inplace = True)

In [33]:
df.shape

(3973, 4)

In [35]:
df['client_idcode'].nunique()

1474

In [36]:
# Sanity check: patients with at least one EF value (1474) plus the rows that
# had no EF value (1111) should add up to the number of rivaroxaban patients (2585).
1474 + 1111

2585

In [37]:
# Parse the ISO timestamps (the raw values carry +0000 / +0100 offsets), convert
# them to UTC and keep only the calendar date.
# NOTE(review): the date is taken after the UTC conversion, so a report created
# shortly after midnight at +0100 would be dated the previous day — confirm this
# is acceptable for the day-level window below.
df['document_datecreated'] = pd.to_datetime(df['document_datecreated'], utc=True).dt.date 

In [38]:
df.head()

Unnamed: 0,client_idcode,document_datecreated,ef_value,first_mention_rivaroxaban
1,M282816,2019-02-15,39.0,2018-10-25
2,M282816,2018-11-16,43.2,2018-10-25
3,M282816,2019-10-09,66.0,2018-10-25
5,V459668,2015-06-17,63.0,2015-06-25
6,R081687,2015-04-21,50.0,2015-04-27


In [39]:
%%time 
# Window bounds: the echo must be 1 to 183 days (inclusive) before the first mention.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)

# Time from each echo report to the first rivaroxaban mention
# (positive when the echo predates the mention).
df['difference'] = pd.to_datetime(df['first_mention_rivaroxaban']).sub(
    pd.to_datetime(df['document_datecreated'])
)

t1 = df['difference'].ge(min_time)
t2 = df['difference'].le(max_time)
in_window = t1 & t2
df['in_window'] = in_window

CPU times: user 5 ms, sys: 3.62 ms, total: 8.62 ms
Wall time: 6.93 ms


In [40]:
df.head()

Unnamed: 0,client_idcode,document_datecreated,ef_value,first_mention_rivaroxaban,difference,in_window
1,M282816,2019-02-15,39.0,2018-10-25,-113 days,False
2,M282816,2018-11-16,43.2,2018-10-25,-22 days,False
3,M282816,2019-10-09,66.0,2018-10-25,-349 days,False
5,V459668,2015-06-17,63.0,2015-06-25,8 days,True
6,R081687,2015-04-21,50.0,2015-04-27,6 days,True


In [41]:
# Keep only echo reports flagged as inside the look-back window,
# then count the distinct patients that remain.
df = df[df['in_window']]
df['client_idcode'].nunique()

569

In [43]:
# Keep in-window echo measurements with an EF of 35 or below (the threshold is
# inclusive). A patient is retained if any of their in-window measurements qualifies.
df = df[df['ef_value'] <= 35]

In [44]:
df['client_idcode'].nunique()

107

In [46]:
# Ids of the patients with a qualifying low-EF echo inside the window.
# NOTE: this rebinds `lvef` from the echo DataFrame to a set of ids, so the
# cells above that treat `lvef` as a DataFrame cannot be re-run without first
# re-reading the CSV.
lvef = set(df['client_idcode'])
len(lvef)

107

In [47]:
# Persist the set of qualifying patient ids for later use.
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/lvef.pickle', 'wb') as out_file:
    pickle.dump(lvef, out_file)

# AF patients on rivaroxaban with heart failure (HF)

In [84]:
def graph_from_onto(onto):
    """Build a directed graph from an ontology mapping.

    Every key of ``onto`` is connected to each entry of its value collection by
    an edge pointing from the key to the entry. Every occurrence of the
    substring ``'S-'`` is removed from the node labels before they are added.

    Args:
        onto: Mapping from a node label (str) to an iterable of node labels (str).

    Returns:
        A ``networkx.DiGraph`` with one edge per (key, entry) pair.
    """
    graph = nx.DiGraph()
    graph.add_edges_from(
        (source.replace('S-', ''), target.replace('S-', ''))
        for source, targets in onto.items()
        for target in targets
    )
    return graph

def load_onto(o):
    """Load a pickled ontology mapping and return it as a directed graph.

    Args:
        o: File name of the pickle, relative to ``/projects/data/GS/``.

    Returns:
        The graph produced by ``graph_from_onto`` for the unpickled mapping.
    """
    onto_path = f'/projects/data/GS/{o}'
    with open(onto_path, 'rb') as handle:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        mapping = pickle.load(handle)
    return graph_from_onto(mapping)

def expand_codes(onto, codes):
    """Expand each group of top-level codes with their graph ancestors.

    For every code that is a node of ``onto``, all nodes with a directed path to
    that code (``nx.ancestors``) are added to the group. Codes missing from the
    graph are kept in the group and reported with a "NOT FOUND:" line. After each
    group, one summary line is printed: group name, number of top-level codes,
    and size of the expanded set.

    Args:
        onto: Directed graph (or any container supporting ``in``) of codes.
        codes: Mapping from group name to a sequence of top-level codes.

    Returns:
        dict mapping each group name to the set of expanded codes.
    """
    expanded = {}
    for name, top_codes in codes.items():
        members = set()
        for code in top_codes:
            members.add(code)
            if code not in onto:
                print("NOT FOUND:", code, name)
                continue
            members.update(nx.ancestors(onto, code))
        expanded[name] = members
        print(name, len(top_codes), len(members))
    return expanded

In [85]:
# Load the ontology pickle from /projects/data/GS/ as a directed graph.
# NOTE(review): the file name suggests child-to-parent "is a" relations, in which
# case the graph ancestors of a code are its more specific concepts — confirm.
onto = load_onto('isa_rela_ch2pt_202009.pickle')

In [86]:
# Top-level code(s) per condition; expand_codes then adds every graph ancestor
# of each code, so `hf` ends up mapping 'HF' to the full set of codes.
# presumably 84114007 is the SNOMED CT concept for heart failure — TODO confirm
hf = { 'HF' : ['84114007']}
hf = expand_codes(onto,hf)

HF 1 100


In [87]:
%%time 
# Large pickle (about three minutes to load, see the timing output below).
# It is looked up by patient id further down, and each patient entry is then
# indexed by code to obtain that code's dates.
with open('/projects/data/GS/pt2cui_pos_dates.pickle', 'rb') as f:
    pt2cui_pos_dates = pickle.load(f)

CPU times: user 2min 32s, sys: 35.4 s, total: 3min 7s
Wall time: 3min 7s


In [97]:
def dates_ex(pt_data, codes):
    """Collect the dates recorded for one patient under any of the given codes.

    A code contributes only when it is present in ``pt_data`` and has at least
    two recorded dates; codes with a single date are ignored.

    Args:
        pt_data: Mapping from code to the collection (set) of dates for one patient.
        codes: Iterable of codes to look up.

    Returns:
        A new set holding the union of the dates of all contributing codes, or
        ``None`` when no code contributes. The returned set is never one of the
        objects stored in ``pt_data``, so callers may modify it without
        corrupting the source data.
    """
    dates = None
    for code in codes:
        if code in pt_data and len(pt_data[code]) >= 2:
            if dates is None:
                # Copy instead of aliasing the stored set.
                dates = set(pt_data[code])
            else:
                dates.update(pt_data[code])
    return dates

# Build one record per rivaroxaban patient: the id, the first rivaroxaban
# mention, and for every concept in `hf` the set of dates returned by dates_ex
# (NaN when the patient has no qualifying dates, so dropna can remove them later).
rows = []
for pt in ids:
    # Patients absent from pt2cui_pos_dates are treated as having no codes at all.
    pt_data = pt2cui_pos_dates.get(pt, {})
    row = {'client_idcode': pt, 'first_mention_rivaroxaban': rivaroxaban[pt]}
    for concept, codes in hf.items():
        dates = dates_ex(pt_data, codes)
        # Identity check: `== None` on a container relies on its __eq__ and is
        # not a reliable "missing" test.
        row[f"{concept}_date"] = np.nan if dates is None else dates
    rows.append(row)

In [98]:
# One row per rivaroxaban patient; 'HF_date' holds a set of dates or NaN.
df = pd.DataFrame(rows)
df.shape

(2585, 3)

In [99]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,HF_date
0,P502654,2014-09-12,
1,M211261,2020-09-05,
2,V501863,2016-02-25,
3,M441841,2015-07-20,
4,A510695,2018-02-09,


In [100]:
# Drop patients for whom no qualifying HF dates were found (NaN 'HF_date').
df.dropna(subset=['HF_date'], inplace = True)

In [101]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,HF_date
6,P234624,2014-06-27,"{2014-06-30 09:32:00, 2014-06-25 02:41:00, 201..."
17,D100013,2017-10-17,"{2017-10-30 10:56:58.007000, 2017-12-12 10:59:..."
19,0640313,2019-01-10,"{2019-01-04 21:59:44.213000, 2015-03-30 19:58:..."
21,0976670,2010-09-13,"{2018-12-27 10:40:31.660000, 2012-01-25 15:41:..."
22,P473600,2015-12-02,"{2015-12-09 16:34:14.920000, 2018-01-11 15:16:..."


In [102]:
# Spot-check one patient. The id is defined once here so that the lookup and
# the label printed by the check below cannot drift apart.
specific_client_idcode = 'P234624'
row = df.loc[df['client_idcode'] == specific_client_idcode]

In [106]:
specific_client_idcode = 'P234624'

In [107]:
if row.empty:
    print(f"No matching ID found for {specific_client_idcode}")
else:
    # Show the set of dates stored in the 'HF_date' cell of the first matching row.
    specific_set = row.iloc[0]['HF_date']
    print(f"Set for ID {specific_client_idcode}: {specific_set}")

Set for ID P234624: {datetime.datetime(2014, 6, 30, 9, 32), datetime.datetime(2014, 6, 25, 2, 41), datetime.datetime(2014, 6, 25, 1, 52), datetime.datetime(2014, 6, 26, 12, 12), datetime.datetime(2014, 6, 24, 10, 3)}


In [113]:
df['client_idcode'].nunique()

597

In [114]:
%%time
# Turn each patient's set of HF dates into one row per date; the other columns
# and the original index label are repeated for every date.
# DataFrame.explode replaces the previous join over a generator of exploded
# columns: same result, but re-running the cell is now a no-op instead of a
# many-to-many join on the duplicated index.
df = df.explode('HF_date')
df.shape

CPU times: user 8.67 ms, sys: 3.11 ms, total: 11.8 ms
Wall time: 9.94 ms


(8219, 3)

In [115]:
df.head(6)

Unnamed: 0,client_idcode,first_mention_rivaroxaban,HF_date
6,P234624,2014-06-27,2014-06-30 09:32:00.000
6,P234624,2014-06-27,2014-06-25 02:41:00.000
6,P234624,2014-06-27,2014-06-25 01:52:00.000
6,P234624,2014-06-27,2014-06-26 12:12:00.000
6,P234624,2014-06-27,2014-06-24 10:03:00.000
17,D100013,2017-10-17,2017-10-30 10:56:58.007


In [117]:
# Reduce each HF mention timestamp to its calendar date. The stored values are
# naive datetimes (see the printed set above); utc=True labels them as UTC
# before the date is taken.
df['HF_date'] = pd.to_datetime(df['HF_date'], utc=True).dt.date 

In [118]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,HF_date
6,P234624,2014-06-27,2014-06-30
6,P234624,2014-06-27,2014-06-25
6,P234624,2014-06-27,2014-06-25
6,P234624,2014-06-27,2014-06-26
6,P234624,2014-06-27,2014-06-24


In [119]:
%%time 
# Window bounds: the HF mention must be 1 to 183 days (inclusive) before the
# first rivaroxaban mention.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)

# Time from each HF mention to the first rivaroxaban mention
# (positive when the HF mention comes first).
df['difference'] = pd.to_datetime(df['first_mention_rivaroxaban']).sub(
    pd.to_datetime(df['HF_date'])
)

t1 = df['difference'].ge(min_time)
t2 = df['difference'].le(max_time)
in_window = t1 & t2
df['in_window'] = in_window

CPU times: user 9.12 ms, sys: 298 µs, total: 9.42 ms
Wall time: 7.88 ms


In [120]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,HF_date,difference,in_window
6,P234624,2014-06-27,2014-06-30,-3 days,False
6,P234624,2014-06-27,2014-06-25,2 days,True
6,P234624,2014-06-27,2014-06-25,2 days,True
6,P234624,2014-06-27,2014-06-26,1 days,True
6,P234624,2014-06-27,2014-06-24,3 days,True


In [121]:
# Keep only HF mentions flagged as inside the look-back window,
# then count the distinct patients that remain.
df = df[df['in_window']]
df['client_idcode'].nunique()

237

In [122]:
# Ids of the patients with at least one HF mention inside the window.
# NOTE: this rebinds `hf` from the expanded code dictionary to a set of ids, so
# the row-building cell above (which calls hf.items()) cannot be re-run without
# first re-running the code expansion cell.
hf = set(df['client_idcode'])
len(hf)

237

In [123]:
# Persist the set of qualifying patient ids for later use.
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/hf.pickle', 'wb') as out_file:
    pickle.dump(hf, out_file)