In [7]:
import pandas as pd 
import pickle
import datetime
import re
import networkx as nx
import numpy as np
from tqdm import tqdm

# Atrial fibrillation (AF) patients on rivaroxaban

In [8]:
# Load the dictionary that maps client_idcode -> first rivaroxaban mention
# (it is unpacked into those two columns further down).
# NOTE: pickle.load can execute arbitrary code; only read trusted files.
rivaroxaban_path = '/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/first_mention_rivaroxaban.pickle'
with open(rivaroxaban_path, 'rb') as fh:
    rivaroxaban = pickle.load(fh)

In [9]:
len(rivaroxaban)

2585

In [10]:
# Turn the rivaroxaban dictionary into a two-column DataFrame: each
# (key, value) pair becomes one row, with the key in 'client_idcode' and the
# value in 'first_mention_rivaroxaban'.
df = pd.DataFrame(list(rivaroxaban.items()), columns=['client_idcode', 'first_mention_rivaroxaban'])
df.shape

(2585, 2)

In [11]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban
0,0372858,2015-03-02
1,M282816,2018-10-25
2,V282449,2018-08-02
3,V459668,2015-06-25
4,R081687,2015-04-27


In [12]:
ids = set(df['client_idcode'])
len(ids)

2585

# AF patients on rivaroxaban with diabetes

In [13]:
def graph_from_onto(onto):
    """Build a directed graph from an ontology mapping.

    Each key ``s`` of ``onto`` and each element ``t`` of its value
    contributes one edge ``s -> t``. Every ``'S-'`` substring is removed
    from both endpoint strings before the edge is added.

    Args:
        onto: Mapping whose keys are strings and whose values are iterables
            of strings.

    Returns:
        A ``networkx.DiGraph`` holding those edges.
    """
    graph = nx.DiGraph()
    graph.add_edges_from(
        (source.replace('S-', ''), target.replace('S-', ''))
        for source, targets in onto.items()
        for target in targets
    )
    return graph

def load_onto(o):
    """Read a pickled ontology mapping and return it as a directed graph.

    Args:
        o: File name inside ``/projects/data/GS/``.

    Returns:
        The graph produced by ``graph_from_onto`` for the unpickled object.
    """
    onto_path = f'/projects/data/GS/{o}'
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(onto_path, 'rb') as handle:
        mapping = pickle.load(handle)
    return graph_from_onto(mapping)

def expand_codes(onto, codes):
    """Expand each named list of codes with its ancestors in ``onto``.

    For every code, the code itself is kept and, when it is a node of
    ``onto``, every node returned by ``nx.ancestors(onto, code)`` (nodes
    with a directed path to the code) is added as well. Codes that are not
    in the graph are reported with a ``NOT FOUND:`` line. After each name a
    summary line is printed: the name, the number of top-level codes and the
    size of the expanded set.

    Args:
        onto: Directed graph to look codes up in.
        codes: Mapping of name -> sequence of top-level codes.

    Returns:
        dict mapping each name to the set of expanded codes.
    """
    expanded = {}
    for name, top_codes in codes.items():
        members = set()
        for code in top_codes:
            members.add(code)
            if code not in onto:
                print("NOT FOUND:", code, name)
                continue
            members |= nx.ancestors(onto, code)
        print(name, len(top_codes), len(members))
        expanded[name] = members
    return expanded

In [14]:
onto = load_onto('isa_rela_ch2pt_202009.pickle')

In [15]:
# Top-level concept code per diabetes type. expand_codes replaces each list
# with a set holding the code plus every graph ancestor of it, and prints
# "<name> <number of top-level codes> <size of expanded set>".
# NOTE(review): the codes look like SNOMED CT concept IDs — confirm.
db = {'type_1_diabetes' : ['46635009'],
      'type_2_diabetes' : ['44054006']
     }
db = expand_codes(onto,db)

type_1_diabetes 1 40
type_2_diabetes 1 43


In [16]:
%%time 
# Load the per-patient lookup that is indexed further down as
# pt2cui_pos_dates[client_idcode][code] -> collection of dates.
# NOTE: pickle.load can execute arbitrary code; only read trusted files.
pos_dates_path = '/projects/data/GS/pt2cui_pos_dates.pickle'
with open(pos_dates_path, 'rb') as fh:
    pt2cui_pos_dates = pickle.load(fh)

CPU times: user 2min 10s, sys: 18.7 s, total: 2min 29s
Wall time: 2min 29s


In [18]:
def dates_ex(pt_data, codes):
    """Collect every date a patient has for a group of codes.

    A code only contributes when the patient has at least two dates recorded
    for it; codes with fewer than two dates are ignored.

    Args:
        pt_data: Mapping of code -> collection of dates for one patient.
        codes: Iterable of codes to look up in ``pt_data``.

    Returns:
        A new set with the union of the dates of all qualifying codes, or
        ``None`` when no code in ``codes`` qualifies. The returned set is
        never the same object as a value stored in ``pt_data``, so callers
        can modify it without touching the source data.
    """
    dates = None
    for code in codes:
        if code in pt_data and len(pt_data[code]) >= 2:
            if dates is None:
                # Copy so the result does not alias the patient's stored set.
                dates = set(pt_data[code])
            else:
                dates.update(pt_data[code])
    return dates

# Build one record per rivaroxaban patient: the patient id, the first
# rivaroxaban mention, and for every concept in `db` either the set of
# qualifying mention dates or NaN when dates_ex finds none.
rows = []
for pt in ids:
    # Patients missing from the lookup are treated as having no mentions.
    pt_data = pt2cui_pos_dates.get(pt, {})
    row = {'client_idcode': pt, 'first_mention_rivaroxaban': rivaroxaban[pt]}
    for concept, codes in db.items():
        dates = dates_ex(pt_data, codes)
        # Identity test against None (not `==`), as recommended by PEP 8.
        row[f"{concept}_date"] = np.nan if dates is None else dates
    rows.append(row)

In [19]:
df = pd.DataFrame(rows)
df.shape

(2585, 4)

In [20]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,type_1_diabetes_date,type_2_diabetes_date
0,F324540,2016-08-23,,
1,F412499,2016-01-04,,
2,0976670,2010-09-13,,"{2019-06-28 10:44:53.690000, 2018-12-27 10:40:..."
3,V503477,2015-09-30,,
4,M623481,2022-02-14,,"{2018-07-22 03:09:46.360000, 2015-05-10 03:03:..."


In [21]:
# Keep only patients with dates for at least one diabetes type: a row is
# dropped only when BOTH date columns are missing (how='all').
# Reassigning instead of inplace=True keeps the step explicit and chainable.
df = df.dropna(subset=['type_1_diabetes_date', 'type_2_diabetes_date'], how='all')

In [22]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,type_1_diabetes_date,type_2_diabetes_date
2,0976670,2010-09-13,,"{2019-06-28 10:44:53.690000, 2018-12-27 10:40:..."
4,M623481,2022-02-14,,"{2018-07-22 03:09:46.360000, 2015-05-10 03:03:..."
13,P660551,2016-08-04,"{2016-08-20 03:09:59.377000, 2016-08-18 16:59:...","{2017-02-06 10:25:00, 2017-05-30 11:39:00, 201..."
17,D079404,2013-04-22,,"{2013-04-21 14:08:00, 2018-03-08 10:21:43.0930..."
22,V598622,2016-07-01,,"{2016-07-11 19:09:00, 2016-07-05 04:31:00, 201..."


In [23]:
row = df.loc[df['client_idcode'] == 'V209282']

In [24]:
specific_client_idcode = 'V209282'

In [25]:
if row.empty:
    print(f"No matching ID found for {specific_client_idcode}")
else:
    # Take the 'type_2_diabetes_date' value of the first matching row and print it.
    specific_set = row.iloc[0]['type_2_diabetes_date']
    print(f"Set for ID {specific_client_idcode}: {specific_set}")

Set for ID V209282: {datetime.datetime(2016, 8, 19, 7, 1, 27, 567000), datetime.datetime(2016, 8, 15, 9, 2), datetime.datetime(2018, 1, 5, 8, 42), datetime.datetime(2013, 2, 19, 7, 4), datetime.datetime(2012, 12, 12, 13, 5), datetime.datetime(2012, 11, 28, 11, 59, 30, 257000), datetime.datetime(2018, 3, 15, 12, 28, 25, 797000), datetime.datetime(2013, 2, 20, 16, 22), datetime.datetime(2013, 2, 25, 16, 40, 15, 40000), datetime.datetime(2017, 1, 25, 11, 52, 50, 590000)}


In [26]:
df['client_idcode'].nunique()

564

In [27]:
%%time
# Explode every column after 'client_idcode' so that each element of a date
# set becomes its own row (exploded rows keep the original row label), then
# join the exploded columns back onto 'client_idcode' by row label.
# NOTE(review): when a patient has dates in BOTH the type 1 and type 2
# columns, joining the two exploded Series on a repeated row label presumably
# yields every type 1 x type 2 date combination for that patient; the heavily
# repeated rows shown in later outputs are consistent with this. It does not
# change an "any row in window" result but inflates row counts — confirm.
df = df[['client_idcode']].join((df[i].explode() for i in df.iloc[:,1:]))
df.shape

CPU times: user 12.2 ms, sys: 8.71 ms, total: 20.9 ms
Wall time: 18.8 ms


(54807, 4)

In [28]:
df.head(11)

Unnamed: 0,client_idcode,first_mention_rivaroxaban,type_1_diabetes_date,type_2_diabetes_date
2,0976670,2010-09-13,NaT,2019-06-28 10:44:53.690
2,0976670,2010-09-13,NaT,2018-12-27 10:40:31.660
2,0976670,2010-09-13,NaT,2019-12-18 13:50:36.733
4,M623481,2022-02-14,NaT,2018-07-22 03:09:46.360
4,M623481,2022-02-14,NaT,2015-05-10 03:03:31.473
4,M623481,2022-02-14,NaT,2016-08-29 03:02:24.923
4,M623481,2022-02-14,NaT,2015-05-09 12:57:00.000
4,M623481,2022-02-14,NaT,2018-07-23 07:31:00.577
4,M623481,2022-02-14,NaT,2017-04-14 02:09:00.000
4,M623481,2022-02-14,NaT,2016-08-28 23:25:00.000


In [29]:
# Reduce every diabetes mention timestamp to its calendar date.
for concept in db:
    date_col = f'{concept}_date'
    df[date_col] = pd.to_datetime(df[date_col]).dt.date

In [30]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,type_1_diabetes_date,type_2_diabetes_date
2,0976670,2010-09-13,NaT,2019-06-28
2,0976670,2010-09-13,NaT,2018-12-27
2,0976670,2010-09-13,NaT,2019-12-18
4,M623481,2022-02-14,NaT,2018-07-22
4,M623481,2022-02-14,NaT,2015-05-10


In [31]:
# Delta = first rivaroxaban mention minus diabetes mention date, so a
# positive delta means the diabetes mention came first.
rivaroxaban_start = pd.to_datetime(df['first_mention_rivaroxaban'])
for concept in db:
    df[f'{concept}_delta'] = rivaroxaban_start - pd.to_datetime(df[f'{concept}_date'])

In [32]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,type_1_diabetes_date,type_2_diabetes_date,type_1_diabetes_delta,type_2_diabetes_delta
2,0976670,2010-09-13,NaT,2019-06-28,NaT,-3210 days
2,0976670,2010-09-13,NaT,2018-12-27,NaT,-3027 days
2,0976670,2010-09-13,NaT,2019-12-18,NaT,-3383 days
4,M623481,2022-02-14,NaT,2018-07-22,NaT,1303 days
4,M623481,2022-02-14,NaT,2015-05-10,NaT,2472 days


In [33]:
# Flag mentions whose delta lies between 1 and 183 days, both bounds
# included; rows with a missing delta are flagged False.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)
for concept in db:
    delta = df[f'{concept}_delta']
    df[f'{concept}_in_window'] = delta.between(min_time, max_time)

In [34]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,type_1_diabetes_date,type_2_diabetes_date,type_1_diabetes_delta,type_2_diabetes_delta,type_1_diabetes_in_window,type_2_diabetes_in_window
2,0976670,2010-09-13,NaT,2019-06-28,NaT,-3210 days,False,False
2,0976670,2010-09-13,NaT,2018-12-27,NaT,-3027 days,False,False
2,0976670,2010-09-13,NaT,2019-12-18,NaT,-3383 days,False,False
4,M623481,2022-02-14,NaT,2018-07-22,NaT,1303 days,False,False
4,M623481,2022-02-14,NaT,2015-05-10,NaT,2472 days,False,False


In [35]:
# Rows with a type 1 diabetes mention inside the window, reduced to the
# patient id and the flag column.
type_1_mask = df['type_1_diabetes_in_window']
type_1_diabetes = df.loc[type_1_mask, ['client_idcode', 'type_1_diabetes_in_window']]
type_1_diabetes.shape

(5062, 2)

In [36]:
type_1_diabetes.head()

Unnamed: 0,client_idcode,type_1_diabetes_in_window
44,M250369,True
44,M250369,True
44,M250369,True
44,M250369,True
44,M250369,True


In [37]:
type_1_diabetes = set(type_1_diabetes['client_idcode'])
len(type_1_diabetes)

51

In [38]:
# Persist the set of client_idcodes with an in-window type 1 diabetes mention.
type_1_path = '/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/type_1_diabetes.pickle'
with open(type_1_path, 'wb') as fh:
    pickle.dump(type_1_diabetes, fh)

In [39]:
# Rows with a type 2 diabetes mention inside the window, reduced to the
# patient id and the flag column.
type_2_mask = df['type_2_diabetes_in_window']
type_2_diabetes = df.loc[type_2_mask, ['client_idcode', 'type_2_diabetes_in_window']]
type_2_diabetes.shape

(6027, 2)

In [40]:
type_2_diabetes.head()

Unnamed: 0,client_idcode,type_2_diabetes_in_window
13,P660551,True
13,P660551,True
13,P660551,True
13,P660551,True
13,P660551,True


In [41]:
type_2_diabetes = set(type_2_diabetes['client_idcode'])
len(type_2_diabetes)

300

In [42]:
# Persist the set of client_idcodes with an in-window type 2 diabetes mention.
type_2_path = '/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/type_2_diabetes.pickle'
with open(type_2_path, 'wb') as fh:
    pickle.dump(type_2_diabetes, fh)

# AF patients on antidiabetic drugs

In [43]:
antidiabetic_drugs = pd.read_csv('/projects/ROCKET AF/data/antidiabetic_drugs.csv')

In [44]:
antidiabetic_drugs.shape

(357170, 5)

In [45]:
antidiabetic_drugs.head()

Unnamed: 0.1,Unnamed: 0,client_idcode,order_name,order_arrivaldtm,order_summaryline
0,8,R093460,Pioglitazone Tablet,2015-07-31T14:39:07.423+0100,"30 mg, Oral, every MORNING (0800). **IN DOSETT..."
1,10,M422838,Pioglitazone Tablet,2017-05-20T18:00:02.287+0100,"30 mg, Oral, every MORNING (0800)"
2,17,A847865,Pioglitazone Tablet,2008-05-19T09:56:44.970+0100,"15 mg Oral, every MORNING"
3,19,R091915,Pioglitazone Tablet,2015-07-16T13:37:12.437+0100,"15 mg, Oral, THREE times a day (8, 12 & 18) RE..."
4,30,D009571,Pioglitazone Tablet,2011-10-26T08:52:39.887+0100,"15 mg Oral, every MORNING (0800)\r\nVERIFIED D..."


In [46]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# drop(..., errors='ignore') is used instead of `del` so that re-running this
# cell after the column is already gone does not raise KeyError.
antidiabetic_drugs = antidiabetic_drugs.drop(columns=['Unnamed: 0'], errors='ignore')

In [47]:
# Look up each order's client_idcode in the rivaroxaban dictionary; ids that
# are not keys of `rivaroxaban` get NaN in the new column.
antidiabetic_drugs['first_rivaroxaban_prescription'] = antidiabetic_drugs['client_idcode'].map(rivaroxaban)

In [48]:
# Keep only orders for patients that have a first rivaroxaban date.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning on a filtered slice.
has_rivaroxaban = antidiabetic_drugs['first_rivaroxaban_prescription'].notna()
antidiabetic_drugs = antidiabetic_drugs.loc[has_rivaroxaban].copy()
antidiabetic_drugs.shape

(38365, 5)

In [49]:
antidiabetic_drugs.head()

Unnamed: 0,client_idcode,order_name,order_arrivaldtm,order_summaryline,first_rivaroxaban_prescription
0,R093460,Pioglitazone Tablet,2015-07-31T14:39:07.423+0100,"30 mg, Oral, every MORNING (0800). **IN DOSETT...",2015-07-31
1,M422838,Pioglitazone Tablet,2017-05-20T18:00:02.287+0100,"30 mg, Oral, every MORNING (0800)",2017-05-19
22,M735294,Pioglitazone Tablet,2022-08-03T10:07:20.357+0100,"15 mg, Oral, every MORNING (0800) VERIFIED",2023-02-27
29,P303654,Pioglitazone Tablet,2013-02-06T11:00:11.347+0000,"15 mg Oral, every MORNING (0800)\r\nVERIFIED *...",2015-02-23
38,M735294,Pioglitazone Tablet,2022-08-05T15:59:41.717+0000,"15 mg, Oral, every MORNING (0800). STOPPED AS ...",2023-02-27


In [50]:
antidiabetic_drugs['order_arrivaldtm'].isna().sum()

0

In [51]:
# Parse the offset-aware order timestamps (converted to UTC) and keep only
# the calendar date.
# NOTE(review): because the date is taken after conversion to UTC, an order
# stamped shortly after midnight with a +0100 offset lands on the previous
# day, which can shift it across the 1-day / 183-day window bounds — confirm
# whether the local date in the source string is what should be compared.
antidiabetic_drugs['order_arrivaldtm'] = pd.to_datetime(antidiabetic_drugs['order_arrivaldtm'], utc=True).dt.date

In [52]:
# Difference = first rivaroxaban date minus order date, so a positive value
# means the antidiabetic order came first.
first_rivaroxaban_dt = pd.to_datetime(antidiabetic_drugs['first_rivaroxaban_prescription'])
order_dt = pd.to_datetime(antidiabetic_drugs['order_arrivaldtm'])
antidiabetic_drugs['difference'] = first_rivaroxaban_dt - order_dt

In [53]:
antidiabetic_drugs.head()

Unnamed: 0,client_idcode,order_name,order_arrivaldtm,order_summaryline,first_rivaroxaban_prescription,difference
0,R093460,Pioglitazone Tablet,2015-07-31,"30 mg, Oral, every MORNING (0800). **IN DOSETT...",2015-07-31,0 days
1,M422838,Pioglitazone Tablet,2017-05-20,"30 mg, Oral, every MORNING (0800)",2017-05-19,-1 days
22,M735294,Pioglitazone Tablet,2022-08-03,"15 mg, Oral, every MORNING (0800) VERIFIED",2023-02-27,208 days
29,P303654,Pioglitazone Tablet,2013-02-06,"15 mg Oral, every MORNING (0800)\r\nVERIFIED *...",2015-02-23,747 days
38,M735294,Pioglitazone Tablet,2022-08-05,"15 mg, Oral, every MORNING (0800). STOPPED AS ...",2023-02-27,206 days


In [54]:
# Flag orders whose difference lies between 1 and 183 days, both bounds
# included.
min_time = datetime.timedelta(days=1)
max_time = datetime.timedelta(days=183)
antidiabetic_drugs['in_window'] = antidiabetic_drugs['difference'].between(min_time, max_time)

In [55]:
# Keep only the orders flagged as in-window. The flag column is already
# boolean, so it is used directly as the mask; .copy() makes the result an
# independent frame so that adding columns afterwards does not trigger
# SettingWithCopyWarning.
antidiabetic_drugs = antidiabetic_drugs.loc[antidiabetic_drugs['in_window']].copy()
antidiabetic_drugs.shape

(4425, 7)

In [56]:
antidiabetic_drugs.head()

Unnamed: 0,client_idcode,order_name,order_arrivaldtm,order_summaryline,first_rivaroxaban_prescription,difference,in_window
48,M806667,Pioglitazone Tablet,2019-08-31,"15 mg, Oral, every MORNING (0800)",2019-10-30,60 days,True
92,M822204,Pioglitazone Tablet,2016-02-24,"30 mg, Oral, every MORNING (0800) VERIFIED",2016-03-15,20 days,True
116,M806667,Pioglitazone Tablet,2019-06-22,"15 mg, Oral, every MORNING (0800)",2019-10-30,130 days,True
146,M822204,Pioglitazone Tablet,2016-02-05,"30 mg, Oral, every AFTERNOON (1200),\r\nluncht...",2016-03-15,39 days,True
370,M806667,Pioglitazone Tablet,2019-08-30,"15 mg, Oral, every MORNING (0800). **Please st...",2019-10-30,61 days,True


In [57]:
antidiabetic_drugs['client_idcode'].nunique()

419

In [58]:
# Add upper-cased copies of the order name and the free-text summary line.
# NOTE(review): neither new column is read again in the visible part of the
# notebook — presumably meant for keyword matching against `matches`; confirm.
antidiabetic_drugs['drug_name'] = antidiabetic_drugs['order_name'].str.upper()
antidiabetic_drugs['order_summaryline_upper_case'] = antidiabetic_drugs['order_summaryline'].str.upper()

In [59]:
# Upper-case keywords that suggest an order was paused or ended.
# NOTE(review): this list is not used anywhere in the visible notebook, so
# orders whose summary line contains e.g. STOPPED are still counted in the
# patient set built below — confirm whether a filtering step is missing.
matches = ['HELD', 'ON HOLD', 'STOPPED', 'STOP', 'WITHHELD', 'WITHHOLD', 'DISCONTINUED', 'DISCONTINUE']

In [60]:
on_antidiabetic_drugs = set(antidiabetic_drugs['client_idcode'])
len(on_antidiabetic_drugs)

419

In [61]:
# Persist the set of client_idcodes with an in-window antidiabetic drug order.
on_drugs_path = '/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/on_antidiabetic_drugs.pickle'
with open(on_drugs_path, 'wb') as fh:
    pickle.dump(on_antidiabetic_drugs, fh)

--------------