In [71]:
import pandas as pd 
import pickle
import datetime
import re
import networkx as nx
import numpy as np
from tqdm import tqdm

# Atrial fibrillation (AF) patients on rivaroxaban

In [2]:
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/first_mention_rivaroxaban.pickle', 'rb') as f:
    rivaroxaban = pickle.load(f)

In [3]:
len(rivaroxaban)

2585

In [4]:
df = pd.DataFrame(list(rivaroxaban.items()), columns=['client_idcode', 'first_mention_rivaroxaban']) # This code converts a dictionary (rivaroxaban) into a Pandas DataFrame with two columns named 'client_idcode' and 'first_mention_rivaroxaban'
df.shape

(2585, 2)

In [5]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban
0,0372858,2015-03-02
1,M282816,2018-10-25
2,V282449,2018-08-02
3,V459668,2015-06-25
4,R081687,2015-04-27


In [6]:
ids = set(df['client_idcode'])
len(ids)

2585

# AF patients' blood pressure

In [7]:
blood_pressure = pd.read_csv('/projects/ROCKET AF/data/blood_pressure.csv')

In [8]:
blood_pressure.shape

(3502984, 7)

In [9]:
blood_pressure.head()

Unnamed: 0.1,Unnamed: 0,client_idcode,obscatalogmasteritem_unitofmeasure,obscatalogmasteritem_displayname,observation_valuetext_analysed,observationdocument_createdwhen,client_dob
0,0,Q029932,mmHg,NEWS_Systolic_BP,123.0,2020-07-06T05:43:24.117+0100,1927-02-11T00:00:00.000+0000
1,2,F335077,,NEWS_Systolic_BP,147.0,2019-06-27T14:31:55.980+0100,1967-05-01T00:00:00.000+0100
2,10,P303828,mmHg,NEWS_Systolic_BP,75.0,2018-08-28T10:39:00.833+0100,1942-06-11T00:00:00.000+0200
3,11,P553416,mmHg,NEWS_Systolic_BP,126.0,2018-07-23T15:55:24.370+0100,1945-06-24T00:00:00.000+0200
4,12,D652011,,NEWS_Systolic_BP,138.0,2019-07-11T06:26:15.553+0100,1939-03-21T00:00:00.000+0000


In [10]:
del blood_pressure['Unnamed: 0']

In [11]:
blood_pressure.head()

Unnamed: 0,client_idcode,obscatalogmasteritem_unitofmeasure,obscatalogmasteritem_displayname,observation_valuetext_analysed,observationdocument_createdwhen,client_dob
0,Q029932,mmHg,NEWS_Systolic_BP,123.0,2020-07-06T05:43:24.117+0100,1927-02-11T00:00:00.000+0000
1,F335077,,NEWS_Systolic_BP,147.0,2019-06-27T14:31:55.980+0100,1967-05-01T00:00:00.000+0100
2,P303828,mmHg,NEWS_Systolic_BP,75.0,2018-08-28T10:39:00.833+0100,1942-06-11T00:00:00.000+0200
3,P553416,mmHg,NEWS_Systolic_BP,126.0,2018-07-23T15:55:24.370+0100,1945-06-24T00:00:00.000+0200
4,D652011,,NEWS_Systolic_BP,138.0,2019-07-11T06:26:15.553+0100,1939-03-21T00:00:00.000+0000


In [12]:
blood_pressure['obscatalogmasteritem_displayname'].unique()

array(['NEWS_Systolic_BP', 'NEWS_Diastolic_BP'], dtype=object)

In [13]:
blood_pressure['observationdocument_createdwhen'].isna().sum()

0

In [14]:
blood_pressure['obscatalogmasteritem_unitofmeasure'].unique()

array(['mmHg', nan], dtype=object)

# Systolic blood pressure

In [15]:
systolic_blood_pressure = blood_pressure.loc[blood_pressure['obscatalogmasteritem_displayname'] == 'NEWS_Systolic_BP']
systolic_blood_pressure.shape

(1753357, 6)

In [16]:
systolic_blood_pressure.head()

Unnamed: 0,client_idcode,obscatalogmasteritem_unitofmeasure,obscatalogmasteritem_displayname,observation_valuetext_analysed,observationdocument_createdwhen,client_dob
0,Q029932,mmHg,NEWS_Systolic_BP,123.0,2020-07-06T05:43:24.117+0100,1927-02-11T00:00:00.000+0000
1,F335077,,NEWS_Systolic_BP,147.0,2019-06-27T14:31:55.980+0100,1967-05-01T00:00:00.000+0100
2,P303828,mmHg,NEWS_Systolic_BP,75.0,2018-08-28T10:39:00.833+0100,1942-06-11T00:00:00.000+0200
3,P553416,mmHg,NEWS_Systolic_BP,126.0,2018-07-23T15:55:24.370+0100,1945-06-24T00:00:00.000+0200
4,D652011,,NEWS_Systolic_BP,138.0,2019-07-11T06:26:15.553+0100,1939-03-21T00:00:00.000+0000


In [17]:
systolic_blood_pressure = systolic_blood_pressure[['client_idcode', 'observation_valuetext_analysed', 'observationdocument_createdwhen']]

In [18]:
systolic_blood_pressure['first_rivaroxaban_prescription'] = systolic_blood_pressure['client_idcode'].map(rivaroxaban)

In [19]:
systolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription
0,Q029932,123.0,2020-07-06T05:43:24.117+0100,
1,F335077,147.0,2019-06-27T14:31:55.980+0100,
2,P303828,75.0,2018-08-28T10:39:00.833+0100,
3,P553416,126.0,2018-07-23T15:55:24.370+0100,
4,D652011,138.0,2019-07-11T06:26:15.553+0100,2012-07-04


In [20]:
systolic_blood_pressure['observationdocument_createdwhen'] = pd.to_datetime(systolic_blood_pressure['observationdocument_createdwhen'], utc=True).dt.date 

In [21]:
systolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription
0,Q029932,123.0,2020-07-06,
1,F335077,147.0,2019-06-27,
2,P303828,75.0,2018-08-28,
3,P553416,126.0,2018-07-23,
4,D652011,138.0,2019-07-11,2012-07-04


In [22]:
systolic_blood_pressure['difference'] = pd.to_datetime(systolic_blood_pressure['first_rivaroxaban_prescription']) - pd.to_datetime(systolic_blood_pressure['observationdocument_createdwhen'])

In [23]:
systolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription,difference
0,Q029932,123.0,2020-07-06,,NaT
1,F335077,147.0,2019-06-27,,NaT
2,P303828,75.0,2018-08-28,,NaT
3,P553416,126.0,2018-07-23,,NaT
4,D652011,138.0,2019-07-11,2012-07-04,-2563 days


In [24]:
min_time = datetime.timedelta(days=1) 
max_time = datetime.timedelta(days=183)
t1 = systolic_blood_pressure['difference'] >= min_time 
t2 = systolic_blood_pressure['difference'] <= max_time
in_window = t1 & t2 
systolic_blood_pressure['in_window'] = in_window

In [25]:
systolic_blood_pressure = systolic_blood_pressure.loc[systolic_blood_pressure['in_window'] == True]

In [26]:
systolic_blood_pressure.shape

(27368, 6)

In [27]:
systolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription,difference,in_window
211,M438228,110.0,2018-04-19,2018-04-23,4 days,True
257,P367142,133.0,2018-07-13,2018-07-16,3 days,True
325,V332037,140.0,2018-10-20,2018-10-23,3 days,True
385,D700895,112.0,2019-05-16,2019-06-12,27 days,True
419,P764466,137.0,2018-10-12,2018-10-24,12 days,True


In [28]:
# Group by 'client_idcode' and keep only the rows with the smallest 'difference'

min_values = systolic_blood_pressure.groupby('client_idcode')['difference'].transform('min')

# Filter the DataFrame to keep only rows with the minimum 'Value'
systolic_blood_pressure = systolic_blood_pressure[systolic_blood_pressure['difference'] == min_values]

In [29]:
systolic_blood_pressure.shape

(3096, 6)

In [30]:
systolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription,difference,in_window
463,Q028487,168.0,2018-12-12,2018-12-13,1 days,True
525,M561450,116.0,2018-06-03,2018-06-04,1 days,True
1770,M270753,128.0,2018-12-15,2018-12-16,1 days,True
2844,P429250,102.0,2019-07-05,2019-07-07,2 days,True
3700,M228631,131.0,2019-07-31,2019-08-01,1 days,True


In [31]:
systolic_blood_pressure['client_idcode'].nunique()

658

In [32]:
systolic_blood_pressure = systolic_blood_pressure.sort_values(by='client_idcode')
systolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription,difference,in_window
822604,197309,180.0,2020-08-26,2020-08-27,1 days,True
1108811,197309,108.0,2020-08-26,2020-08-27,1 days,True
1637574,197309,102.0,2020-08-26,2020-08-27,1 days,True
1027712,197309,122.0,2020-08-26,2020-08-27,1 days,True
493701,272062,99.0,2023-03-22,2023-03-23,1 days,True


In [33]:
# Group by 'client_idcode' and calculate the average of 'systolic_blod_pressure value'

systolic_blood_pressure_avg = systolic_blood_pressure.groupby('client_idcode')['observation_valuetext_analysed'].mean().reset_index()
    

In [34]:
systolic_blood_pressure_avg.shape

(658, 2)

In [35]:
systolic_blood_pressure_avg.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed
0,197309,128.0
1,272062,110.142857
2,386909,122.428571
3,402415,96.444444
4,492288,124.5


In [36]:
systolic_blood_pressure_avg = systolic_blood_pressure_avg[systolic_blood_pressure_avg['observation_valuetext_analysed'] > 140]

In [37]:
systolic_blood_pressure_avg.shape

(149, 2)

In [38]:
systolic_blood_pressure_avg.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed
18,A695224,141.0
22,A804409,150.333333
23,A880815,156.0
29,D083276,147.8
32,D091555,145.0


In [40]:
sbp = dict(zip(systolic_blood_pressure_avg['client_idcode'], systolic_blood_pressure_avg['observation_valuetext_analysed']))

In [41]:
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/sbp.pickle', 'wb') as f:
    pickle.dump(sbp,f)

# Diastolic blood pressure

In [42]:
diastolic_blood_pressure = blood_pressure.loc[blood_pressure['obscatalogmasteritem_displayname'] == 'NEWS_Diastolic_BP']
diastolic_blood_pressure.shape

(1749627, 6)

In [43]:
diastolic_blood_pressure.head()

Unnamed: 0,client_idcode,obscatalogmasteritem_unitofmeasure,obscatalogmasteritem_displayname,observation_valuetext_analysed,observationdocument_createdwhen,client_dob
1753357,P751564,mmHg,NEWS_Diastolic_BP,46.0,2019-04-25T07:01:58.787+0100,1952-11-23T00:00:00.000+0000
1753358,M493218,,NEWS_Diastolic_BP,42.0,2019-07-29T16:18:31.693+0100,1930-12-13T00:00:00.000+0000
1753359,F389914,mmHg,NEWS_Diastolic_BP,84.0,2019-03-28T16:56:14.197+0000,1931-01-13T00:00:00.000+0000
1753360,F208420,mmHg,NEWS_Diastolic_BP,50.0,2020-02-01T05:56:17.903+0000,1944-06-18T00:00:00.000+0200
1753361,M513117,mmHg,NEWS_Diastolic_BP,85.0,2019-04-30T12:21:18.870+0100,1959-04-16T00:00:00.000+0000


In [44]:
diastolic_blood_pressure = diastolic_blood_pressure[['client_idcode', 'observation_valuetext_analysed', 'observationdocument_createdwhen']]

In [45]:
diastolic_blood_pressure['first_rivaroxaban_prescription'] = diastolic_blood_pressure['client_idcode'].map(rivaroxaban)

In [46]:
diastolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription
1753357,P751564,46.0,2019-04-25T07:01:58.787+0100,
1753358,M493218,42.0,2019-07-29T16:18:31.693+0100,
1753359,F389914,84.0,2019-03-28T16:56:14.197+0000,2011-02-03
1753360,F208420,50.0,2020-02-01T05:56:17.903+0000,
1753361,M513117,85.0,2019-04-30T12:21:18.870+0100,


In [47]:
diastolic_blood_pressure['observationdocument_createdwhen'] = pd.to_datetime(diastolic_blood_pressure['observationdocument_createdwhen'], utc=True).dt.date 

In [48]:
diastolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription
1753357,P751564,46.0,2019-04-25,
1753358,M493218,42.0,2019-07-29,
1753359,F389914,84.0,2019-03-28,2011-02-03
1753360,F208420,50.0,2020-02-01,
1753361,M513117,85.0,2019-04-30,


In [49]:
diastolic_blood_pressure['difference'] = pd.to_datetime(diastolic_blood_pressure['first_rivaroxaban_prescription']) - pd.to_datetime(diastolic_blood_pressure['observationdocument_createdwhen'])

In [50]:
diastolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription,difference
1753357,P751564,46.0,2019-04-25,,NaT
1753358,M493218,42.0,2019-07-29,,NaT
1753359,F389914,84.0,2019-03-28,2011-02-03,-2975 days
1753360,F208420,50.0,2020-02-01,,NaT
1753361,M513117,85.0,2019-04-30,,NaT


In [51]:
min_time = datetime.timedelta(days=1) 
max_time = datetime.timedelta(days=183)
t1 = diastolic_blood_pressure['difference'] >= min_time 
t2 = diastolic_blood_pressure['difference'] <= max_time
in_window = t1 & t2 
diastolic_blood_pressure['in_window'] = in_window

In [52]:
diastolic_blood_pressure = diastolic_blood_pressure.loc[diastolic_blood_pressure['in_window'] == True]

In [53]:
diastolic_blood_pressure.shape

(27314, 6)

In [54]:
diastolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription,difference,in_window
1753478,V395517,50.0,2019-01-20,2019-01-21,1 days,True
1753485,M670865,81.0,2019-03-03,2019-03-06,3 days,True
1753521,V993447,67.0,2019-04-01,2019-04-02,1 days,True
1753562,P782490,64.0,2018-07-04,2018-07-16,12 days,True
1753578,M234282,51.0,2018-04-29,2018-06-04,36 days,True


In [55]:
# Group by 'client_idcode' and keep only the rows with the smallest 'difference'

min_values = diastolic_blood_pressure.groupby('client_idcode')['difference'].transform('min')

# Filter the DataFrame to keep only rows with the minimum 'Value'
diastolic_blood_pressure = diastolic_blood_pressure[diastolic_blood_pressure['difference'] == min_values]

In [56]:
diastolic_blood_pressure.shape

(3094, 6)

In [57]:
diastolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription,difference,in_window
1753478,V395517,50.0,2019-01-20,2019-01-21,1 days,True
1753521,V993447,67.0,2019-04-01,2019-04-02,1 days,True
1753846,V610321,77.0,2020-07-22,2020-07-23,1 days,True
1755099,M270753,77.0,2018-12-15,2018-12-16,1 days,True
1755316,V481126,76.0,2022-01-11,2022-01-12,1 days,True


In [58]:
diastolic_blood_pressure['client_idcode'].nunique()

658

In [59]:
diastolic_blood_pressure = diastolic_blood_pressure.sort_values(by='client_idcode')
diastolic_blood_pressure.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed,observationdocument_createdwhen,first_rivaroxaban_prescription,difference,in_window
2328596,197309,63.0,2020-08-26,2020-08-27,1 days,True
3065589,197309,99.0,2020-08-26,2020-08-27,1 days,True
2720462,197309,68.0,2020-08-26,2020-08-27,1 days,True
2829905,197309,63.0,2020-08-26,2020-08-27,1 days,True
2245255,272062,82.0,2023-03-22,2023-03-23,1 days,True


In [60]:
# Group by 'client_idcode' and calculate the average of 'systolic_blod_pressure value'

diastolic_blood_pressure_avg = diastolic_blood_pressure.groupby('client_idcode')['observation_valuetext_analysed'].mean().reset_index()
    

In [61]:
diastolic_blood_pressure_avg.shape

(658, 2)

In [62]:
diastolic_blood_pressure_avg.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed
0,197309,73.25
1,272062,73.428571
2,386909,58.428571
3,402415,55.333333
4,492288,69.5


In [63]:
diastolic_blood_pressure_avg = diastolic_blood_pressure_avg[diastolic_blood_pressure_avg['observation_valuetext_analysed'] > 90]

In [64]:
diastolic_blood_pressure_avg.shape

(42, 2)

In [65]:
diastolic_blood_pressure_avg.head()

Unnamed: 0,client_idcode,observation_valuetext_analysed
20,A752193,108.0
23,A880815,94.333333
32,D091555,90.666667
38,D256357,92.0
49,D448903,98.0


In [67]:
dbp = dict(zip(diastolic_blood_pressure_avg['client_idcode'], diastolic_blood_pressure_avg['observation_valuetext_analysed']))

In [68]:
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/dbp.pickle', 'wb') as f:
    pickle.dump(dbp,f)

----------

# AF patients with high blood pressure

In [69]:
def graph_from_onto(onto):
    G = nx.DiGraph()
    cl_edges = []
    for s, ts in onto.items():
        for t in ts:
            cl_edges.append((s.replace('S-', ''), t.replace('S-', '')))
    G.add_edges_from(cl_edges)
    return G

def load_onto(o):
    with open(f'/projects/data/GS/{o}', 'rb') as f:
        onto = pickle.load(f)
    G = graph_from_onto(onto)
    return G

def expand_codes(onto, codes):
    expanded = {}
    for name, top_codes in codes.items():
        expanded[name] = set()
        for c in top_codes:
            expanded[name].add(c)
            if c in onto:
                expanded[name].update(nx.ancestors(onto, c))
            else:
                print("NOT FOUND:", c, name)
        print(name, len(top_codes), len(expanded[name]))
    return expanded

In [72]:
onto = load_onto('isa_rela_ch2pt_202009.pickle')

In [73]:
db = {'hypertension' : ['38341003']}
db = expand_codes(onto,db)

hypertension 1 137


In [74]:
%%time 
with open('/projects/data/GS/pt2cui_pos_dates.pickle', 'rb') as f:
    pt2cui_pos_dates = pickle.load(f)

CPU times: user 1min 54s, sys: 16.6 s, total: 2min 11s
Wall time: 2min 17s


In [75]:
def dates_ex(pt_data, codes):
    dates = None
    for x in codes:
        if x in pt_data and len(pt_data[x])>=2:
            if dates == None:
                dates = pt_data[x]
            else: 
                dates = dates.union(pt_data[x])
    return dates

rows = []
for pt in ids:
    pt_data = pt2cui_pos_dates.get(pt, {})
    row = {'client_idcode': pt, 'first_mention_rivaroxaban': rivaroxaban[pt]} 
    for concept, codes in db.items():
        dates = dates_ex(pt_data, codes)
        if dates == None:
            row[f"{concept}_date"] = np.nan
        else:
            row[f"{concept}_date"] = dates
    rows.append(row)

In [76]:
df = pd.DataFrame(rows)
df.shape

(2585, 3)

In [77]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,hypertension_date
0,M485292,2018-09-14,"{2017-10-21 13:54:00, 2017-09-20 14:20:09.8400..."
1,V354424,2021-12-08,
2,M743291,2021-04-12,"{2018-06-11 15:20:02.573000, 2018-05-09 15:10:..."
3,M941279,2014-08-08,"{2014-08-08 11:55:00, 2017-11-07 12:50:49.7100..."
4,A800979,2010-03-24,"{2014-02-03 15:34:00, 2012-10-08 16:02:00, 201..."


In [79]:
df.dropna(subset=['hypertension_date'], how='all', inplace = True)

In [80]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,hypertension_date
0,M485292,2018-09-14,"{2017-10-21 13:54:00, 2017-09-20 14:20:09.8400..."
2,M743291,2021-04-12,"{2018-06-11 15:20:02.573000, 2018-05-09 15:10:..."
3,M941279,2014-08-08,"{2014-08-08 11:55:00, 2017-11-07 12:50:49.7100..."
4,A800979,2010-03-24,"{2014-02-03 15:34:00, 2012-10-08 16:02:00, 201..."
6,V337542,2014-05-02,"{2014-05-06 09:05:00, 2014-05-03 07:29:00, 201..."


In [81]:
%%time
df = df[['client_idcode']].join((df[i].explode() for i in df.iloc[:,1:]))
df.shape

CPU times: user 20 ms, sys: 8 ms, total: 28 ms
Wall time: 27.2 ms


(34583, 3)

In [82]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,hypertension_date
0,M485292,2018-09-14,2017-10-21 13:54:00.000
0,M485292,2018-09-14,2017-09-20 14:20:09.840
0,M485292,2018-09-14,2017-10-22 03:12:37.690
0,M485292,2018-09-14,2018-02-27 12:49:08.160
0,M485292,2018-09-14,2017-12-21 01:56:08.570


In [83]:
for k in db.keys():
    df[f'{k}_date'] = pd.to_datetime(df[f'{k}_date']).dt.date

In [84]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,hypertension_date
0,M485292,2018-09-14,2017-10-21
0,M485292,2018-09-14,2017-09-20
0,M485292,2018-09-14,2017-10-22
0,M485292,2018-09-14,2018-02-27
0,M485292,2018-09-14,2017-12-21


In [85]:
for k in db.keys():
    df[f'{k}_delta'] = pd.to_datetime(df['first_mention_rivaroxaban']) - pd.to_datetime(df[f'{k}_date'])

In [86]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,hypertension_date,hypertension_delta
0,M485292,2018-09-14,2017-10-21,328 days
0,M485292,2018-09-14,2017-09-20,359 days
0,M485292,2018-09-14,2017-10-22,327 days
0,M485292,2018-09-14,2018-02-27,199 days
0,M485292,2018-09-14,2017-12-21,267 days


In [87]:
min_time = datetime.timedelta(days=1) 
max_time = datetime.timedelta(days=183)
for k in db.keys():
    t1 = df[f'{k}_delta'] >= min_time 
    t2 = df[f'{k}_delta'] <= max_time
    in_window = t1 & t2 
    df[f'{k}_in_window'] = in_window

In [88]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,hypertension_date,hypertension_delta,hypertension_in_window
0,M485292,2018-09-14,2017-10-21,328 days,False
0,M485292,2018-09-14,2017-09-20,359 days,False
0,M485292,2018-09-14,2017-10-22,327 days,False
0,M485292,2018-09-14,2018-02-27,199 days,False
0,M485292,2018-09-14,2017-12-21,267 days,False


In [89]:
df = df.loc[df['hypertension_in_window'] == True]

In [90]:
df.head()

Unnamed: 0,client_idcode,first_mention_rivaroxaban,hypertension_date,hypertension_delta,hypertension_in_window
6,V337542,2014-05-02,2014-04-07,25 days,True
6,V337542,2014-05-02,2014-04-30,2 days,True
7,F387251,2015-04-30,2015-04-29,1 days,True
7,F387251,2015-04-30,2015-02-03,86 days,True
7,F387251,2015-04-30,2015-04-27,3 days,True


In [91]:
hypertension = set(df['client_idcode'])
len(hypertension)

1209

In [92]:
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/hypertension.pickle', 'wb') as f:
    pickle.dump(hypertension,f)