In [None]:
import pandas as pd 
import pickle
import datetime
import re

# Atrial Fibrillation (AF) Patients on Rivaroxaban

In [None]:
# Load the pickled mapping that the cells below turn into the patient table.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/first_mention_rivaroxaban.pickle', 'rb') as f:
    rivaroxaban = pickle.load(f)

In [None]:
# Number of entries in the loaded mapping
len(rivaroxaban)

In [None]:
# One row per (key, value) pair of the mapping: keys go to 'client_idcode',
# values go to 'first_mention_rivaroxaban'.
df = pd.DataFrame(list(rivaroxaban.items()), columns=['client_idcode', 'first_mention_rivaroxaban'])
df.shape

In [None]:
df.head()

In [None]:
# Distinct client_idcode values; used further down to filter the echo table.
ids = set(df['client_idcode'])
len(ids)

In [None]:
# Smallest value in the column (presumably the earliest first-mention date - confirm the dtype)
df['first_mention_rivaroxaban'].min()

# Echo Reports

In [None]:
echos = pd.read_csv('/projects/ROCKET AF/data/af_echos.csv') 

In [None]:
echos.info()

In [None]:
echos.shape

In [None]:
echos.head()

In [None]:
echos['measname'].unique()

In [None]:
echos = echos[echos['medicalrecordnumber'].isin(ids)]
echos.shape

In [None]:
echos['studystartdatetime'].isna().sum()

In [None]:
echos = echos[['medicalrecordnumber', 'studystartdatetime', 'findingcodetext']]

In [None]:
echos.rename(columns =  {'medicalrecordnumber': 'client_idcode'}, inplace = True)

In [None]:
echos.head()

# Identifying non-valvular AF patients

In [None]:
df = pd.merge(echos, df, on='client_idcode', how='right') 

In [None]:
df.head()

In [None]:
df['studystartdatetime'].isna().sum()

In [None]:
df = df[df['studystartdatetime'].notna()]
df['client_idcode'].nunique()

In [None]:
1509+1076

In [None]:
df['studystartdatetime'] = pd.to_datetime(df['studystartdatetime'], utc=True).dt.date 

In [None]:
df.head()

In [None]:
%%time 
df['difference'] = pd.to_datetime(df['first_mention_rivaroxaban']) - pd.to_datetime(df['studystartdatetime'])
min_time = datetime.timedelta(days=1) 
max_time = datetime.timedelta(days=183)
t1 = df['difference'] >= min_time 
t2 = df['difference'] <= max_time
in_window = t1 & t2 
df['in_window'] = in_window

In [None]:
df = df.loc[df['in_window'] == True]
df['client_idcode'].nunique()

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.head()

## Cleaning text data

In [None]:
# Look at the report text of the row labelled 0 (the first row after the index reset) before any cleaning.
df.loc[0,'findingcodetext']

In [None]:
# Column dtypes and non-null counts of the windowed frame
df.info()

In [None]:
def clean_text(text):
    """Normalise a free-text value so it can be searched with plain substrings.

    A string is lower-cased, stripped of every character that is neither a
    word character nor whitespace (the character is removed, not replaced by
    a space), and has each run of whitespace collapsed to a single space with
    leading/trailing whitespace dropped.

    Any non-string value (for example a missing value) is returned unchanged.
    """
    # Guard clause: leave non-text values exactly as they came in.
    if not isinstance(text, str):
        return text

    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(without_punctuation.split())

In [None]:
# Normalise every report text with clean_text; non-string entries are passed through unchanged by that function.
df['findingcodetext'] =  df['findingcodetext'].apply(clean_text)

In [None]:
# Row label 0 again, now showing the cleaned text
df.loc[0,'findingcodetext']

### Severe mitral stenosis

In [None]:
df1 = df[df['findingcodetext'].str.contains('severe mitral stenosis', regex=True, na=False)]
df1.shape

In [None]:
df1['client_idcode'].nunique()

In [None]:
df1.head()

In [None]:
df1.loc[920,'findingcodetext']

In [None]:
ids_1 = set(df1['client_idcode'])
len(ids_1)

### Severe mitral regurgitation

In [None]:
df2 = df[df['findingcodetext'].str.contains('severe mitral regurgitation', regex=True, na=False)]
df2.shape

In [None]:
df2['client_idcode'].nunique()

In [None]:
df2.head()

In [None]:
df2.loc[2888,'findingcodetext']

In [None]:
ids_2 = set(df2['client_idcode'])
len(ids_2)

### Moderate mitral stenosis

In [None]:
df3 = df[df['findingcodetext'].str.contains('there is moderate mitral stenosis', regex=True, na=False)]
df3.shape

In [None]:
df3['client_idcode'].nunique()

In [None]:
df3.head()

In [None]:
df3.loc[35747,'findingcodetext']

In [None]:
ids_3 = set(df3['client_idcode'])
len(ids_3)

### Moderate mitral regurgitation

In [None]:
df4 = df[df['findingcodetext'].str.contains('there is moderate mitral regurgitation', regex=True, na=False)]
df4.shape

In [None]:
df4['client_idcode'].nunique()

In [None]:
df4.head()

In [None]:
df4.loc[1020,'findingcodetext']

In [None]:
ids_4 = set(df4['client_idcode'])
len(ids_4)

-----------------

In [None]:
ids = ids_1 | ids_2 | ids_3 | ids_4 
len(ids)

In [None]:
with open ('/projects/ROCKET AF/1A/rivaroxaban dataextraction/data/valvular_af.pickle', 'wb') as f:
    pickle.dump(ids,f)