In [1]:
import boto3
# import dask
# import dask.dataframe as dd
import numpy as np
import pandas as pd
from pymetamap import MetaMap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import time
from pharmpy.epc import EPCEngine

# Remove the row cap on displayed frames/series for the whole notebook.
# NOTE(review): with the cap off, displaying a full frame renders every row —
# keep using .head() when inspecting the large tables.
pd.set_option('display.max_rows', None)

In [2]:
# Study cohort; the first CSV column is used as the row index.
cohort = pd.read_csv(
    'cohort.csv',
    index_col=0,
)

# Product table; its PRODUCTNDC and PHARM_CLASSES columns are used later to
# build the NDC -> pharmacologic-class lookup.
ndc = pd.read_csv(
    'ndcxls/product.csv',
)

In [3]:
cohort.head()
# cohort.shape

Unnamed: 0,subject_id,target
0,109,1.0
1,188,1.0
2,222,1.0
3,236,1.0
4,305,1.0


In [4]:
# Query result exports read straight from S3 (one CSV per query).
# Clinical notes and prescriptions for the cohort:
all_notes = pd.read_csv(
    's3://athena-output-mimic/cohort_notes/2022/04/14/07d790b5-026b-48d1-8f8f-7216ef2425fd.csv'
)
all_scripts = pd.read_csv(
    's3://athena-output-mimic/cohort_scripts/2022/04/14/7c40a6e6-2058-4e39-95fa-a57ec2d753ef.csv'
)

# ICD-9 code descriptions and the cohort's diagnosis codes:
desc = pd.read_csv(
    's3://athena-output-mimic/diagnosis_descriptions/2022/04/10/539668ed-8120-4085-b072-7ea0168a4ae6.csv'
)
diag = pd.read_csv(
    's3://athena-output-mimic/cohort_diagnoses/2022/04/14/f46af8e4-5247-499c-ba65-0c2a5f851fd0.csv'
)

In [5]:
all_notes.head()

Unnamed: 0,subject_id,text
0,65824,Chief Complaint: LGIB\n I saw and examined t...
1,90369,Chief Complaint:\n HPI:\n 24 Hour Events:\...
2,56317,Chief Complaint: SOB\n I saw and examined th...
3,99231,"Chief Complaint: acute renal failure, hyperten..."
4,67906,Chief Complaint:\n I saw and examined the pa...


In [6]:
# Cast every note to str so the per-subject ' '.join in the grouping step only
# ever receives strings.
# NOTE(review): a missing note presumably becomes the literal text 'nan' here — confirm
# whether such rows should be dropped instead.
all_notes['text'] = all_notes['text'].astype(str)

In [7]:
# One row per subject: all of that subject's notes joined by single spaces.
notes_grpd = (
    all_notes
    .groupby('subject_id')['text']
    .apply(' '.join)
)
# Persist the grouped notes so later runs can inspect them without regrouping.
notes_grpd.to_csv('notes_grpd.csv')

In [8]:
notes_grpd.head()

subject_id
75     Admission Date:  [**2147-4-5**]              D...
109    Chief Complaint: Hypertensive urgency\n   I sa...
188    [**2158-8-11**] 10:43 AM\n CT ABD W&W/O C; CT ...
214    [**2188-10-8**] 4:08 PM\n T-SPINE; L-SPINE (AP...
222    68 yo F with extensive PMH: MI x 3; 70 % occlu...
Name: text, dtype: object

In [9]:
# Maximum number of characters of each subject's combined notes that is kept
# (and therefore sent to concept extraction).
text_limit = 1000
notes_grpd = notes_grpd.str.slice(stop=text_limit)

### Perform concept extraction

In [10]:

# Comprehend Medical client used by get_snomed. No region or credentials are
# passed here, so boto3 resolves them from its default configuration.
cm_client = boto3.client("comprehendmedical")

In [11]:
def get_snomed(x, client=None):
    """Extract sign and symptom mentions from a piece of clinical text.

    Sends ``x`` to the ``infer_snomedct`` operation and keeps the text of every
    ``MEDICAL_CONDITION`` entity that carries a ``SIGN`` or ``SYMPTOM`` trait.
    All traits of an entity are inspected, not only the first one, so an entity
    tagged e.g. ``[NEGATION, SYMPTOM]`` is not silently dropped.

    Args:
        x: Text to analyse.
        client: Object exposing ``infer_snomedct(Text=...)``. Defaults to the
            notebook-level ``cm_client``.

    Returns:
        The matched entity texts in response order, each followed by one space
        (so a non-empty result ends with a space), or ``''`` when nothing
        matches or ``x`` is empty.
    """
    if not x:
        # Nothing to analyse; skip the request rather than send empty text.
        return ''

    if client is None:
        client = cm_client
    response = client.infer_snomedct(Text=x)

    wanted_traits = {'SIGN', 'SYMPTOM'}
    signs = []
    for entity in response['Entities']:
        if entity['Category'] != 'MEDICAL_CONDITION':
            continue
        trait_names = {trait['Name'] for trait in (entity.get('Traits') or [])}
        if trait_names & wanted_traits:
            # Trailing space is part of the output format: downstream cells
            # concatenate this column directly with other text columns.
            signs.append(entity['Text'] + ' ')

    return ''.join(signs)

In [12]:
# Run concept extraction once per subject (one get_snomed call per row), then
# save the result so the extraction output can be inspected without re-running it.
sign_symp = notes_grpd.map(get_snomed)
sign_symp.to_csv(
    'sign_symp.csv',
)

In [13]:
sign_symp.head()

subject_id
75                               
109                        alert 
188                lethargy pain 
214                      limited 
222    active bleed active bleed 
Name: text, dtype: object

In [14]:
all_scripts.head()

Unnamed: 0,subject_id,ndc
0,75,904404100.0
1,75,904526200.0
2,75,121075200.0
3,75,63739010000.0
4,75,121054400.0


In [15]:
# Drop prescription rows with a missing value in any column; the int64 cast of
# 'ndc' in the next step cannot represent NaN.
all_scripts = all_scripts.dropna()

In [16]:
# NDCs were read as floats; turn them into 11-character, zero-padded digit
# strings (float -> int64 drops the '.0', zfill restores leading zeros).
all_scripts['ndc'] = (
    all_scripts['ndc']
    .astype('int64')
    .astype('str')
    .str.zfill(11)
)

In [17]:
all_scripts.head()

Unnamed: 0,subject_id,ndc
0,75,904404073
1,75,904526161
2,75,121075210
3,75,63739008901
4,75,121054410


In [18]:
# ndc.head()

In [19]:
# Replace each padded NDC string with the 'ndc' entry of whatever
# EPCEngine.get_epc returns for it (one lookup per prescription row).
# NOTE(review): the cell output that follows shows values like '0904-4040' and
# 'na' — presumably a labeler-product code, with 'na' for inputs the engine
# could not resolve; confirm against the pharmpy documentation.
epe = EPCEngine()
all_scripts['ndc'] = all_scripts['ndc'].apply(lambda x: epe.get_epc(x)['ndc'])

In [20]:
all_scripts.head()

Unnamed: 0,subject_id,ndc
0,75,0904-4040
1,75,na
2,75,na
3,75,63739-089
4,75,0121-0544


In [21]:
# Lookup table: PRODUCTNDC -> PHARM_CLASSES (for a repeated PRODUCTNDC the last
# row wins).
ndc_map = ndc.set_index('PRODUCTNDC')['PHARM_CLASSES'].to_dict()

In [22]:
# Attach the pharmacologic classes for each prescription; NDC values that are
# not keys of ndc_map yield NaN.
all_scripts['pharm_classes'] = all_scripts['ndc'].map(ndc_map)

In [23]:
all_scripts.head()

Unnamed: 0,subject_id,ndc,pharm_classes
0,75,0904-4040,"Anti-Inflammatory Agents, Non-Steroidal [CS], ..."
1,75,na,
2,75,na,
3,75,63739-089,
4,75,0121-0544,


In [24]:
# Keep only prescriptions for which a pharm_classes value was found.
all_scripts = all_scripts.dropna(subset=['pharm_classes'])

In [25]:
# Ensure every value is a str before the per-subject ' '.join in the grouping step.
all_scripts['pharm_classes'] = all_scripts['pharm_classes'].astype(str)

In [26]:
# One row per subject: all of that subject's pharmacologic-class strings joined
# by single spaces.
scripts_grpd = all_scripts.groupby('subject_id')['pharm_classes'].apply(' '.join)
# Write to a file of its own: the previous target, 'sign_symp.csv', is where the
# concept-extraction output is saved, and writing there overwrote it.
scripts_grpd.to_csv('scripts_grpd.csv')

In [27]:
scripts_grpd.head()

subject_id
75     Anti-Inflammatory Agents, Non-Steroidal [CS], ...
109    Angiotensin 2 Receptor Antagonists [MoA], Angi...
188    Increased Large Intestinal Motility [PE], Inhi...
214    HMG-CoA Reductase Inhibitor [EPC], Hydroxymeth...
222    Adrenergic beta-Antagonists [MoA], beta-Adrene...
Name: pharm_classes, dtype: object

In [28]:
diag.head()

Unnamed: 0,subject_id,icd9_code
0,109,40301
1,109,486
2,109,58281
3,109,5855
4,109,4254


In [29]:
desc.head()

Unnamed: 0,row_id,icd9_code,short_title,long_title
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."


In [30]:
# Lookup table: ICD-9 code -> short title.
icd_map = desc.set_index('icd9_code')['short_title'].to_dict()

# Replace each diagnosis code with its short title; codes missing from icd_map
# become NaN. This overwrites the code column in place, so the cell must be run
# exactly once per freshly loaded `diag`.
diag['icd9_code'] = diag['icd9_code'].map(icd_map)
diag.head()

Unnamed: 0,subject_id,icd9_code
0,109,Mal hyp kid w cr kid V
1,109,"Pneumonia, organism NOS"
2,109,Chr nephritis in oth dis
3,109,Chron kidney dis stage V
4,109,Prim cardiomyopathy NEC


In [31]:
# Cast to str so ' '.join only ever sees strings.
# NOTE(review): a title that is NaN at this point presumably turns into the
# literal token 'nan' — confirm whether such rows should be dropped first.
diag['icd9_code'] = diag['icd9_code'].astype(str)

# One row per subject: all diagnosis titles joined by single spaces.
diag_grpd = diag.groupby('subject_id')['icd9_code'].apply(' '.join)
# Write to a file of its own: the previous target, 'sign_symp.csv', is where the
# concept-extraction output is saved, and writing there overwrote it.
diag_grpd.to_csv('diag_grpd.csv')

In [32]:
diag_grpd.isna().any()

False

In [33]:
# Attach the three per-subject text features to the cohort. Left joins keep
# every cohort row; subjects lacking a feature get '' instead of NaN. The
# extracted signs/symptoms arrive as a column named 'text' and are renamed to
# 'notes'. Not idempotent: run once per freshly loaded `cohort`.
cohort = (
    cohort
    .merge(sign_symp, on='subject_id', how='left')
    .merge(scripts_grpd, on='subject_id', how='left')
    .merge(diag_grpd, on='subject_id', how='left')
    .fillna('')
    .rename(columns={'text': 'notes'})
)

In [34]:
cohort.head()

Unnamed: 0,subject_id,target,notes,pharm_classes,icd9_code
0,109,1.0,alert,"Angiotensin 2 Receptor Antagonists [MoA], Angi...","Mal hyp kid w cr kid V Pneumonia, organism NOS..."
1,188,1.0,lethargy pain,"Increased Large Intestinal Motility [PE], Inhi...",Compl liver transplant Acute respiratry failur...
2,222,1.0,active bleed active bleed,"Adrenergic beta-Antagonists [MoA], beta-Adrene...",Mal neo lower lobe lung Acute & chronc resp fa...
3,236,1.0,normal in echogenicity pleural effusion,"Increased Large Intestinal Motility [PE], Inhi...",Acute necrosis of liver Malignant neo liver NO...
4,305,1.0,pain pain pain ileus nausea vomiting pain ileus,"Insulin [CS], Insulin [EPC] Insulin [CS], Insu...","Subendo infarct, initial Mitral valve disorder..."


In [35]:
# Build the combined text features. Fields are joined with an explicit space so
# the last word of one field can never fuse with the first word of the next
# (plain `+` produced e.g. '...[EPC]Mal hyp...'); .str.strip() removes the
# padding left behind when a field is empty.
cohort['medication_diagnosis'] = (
    cohort['pharm_classes']
    .str.cat(cohort['icd9_code'], sep=' ')
    .str.strip()
)
cohort['medication_diagnosis_notes'] = (
    cohort['notes']
    .str.cat([cohort['pharm_classes'], cohort['icd9_code']], sep=' ')
    .str.strip()
)

In [36]:
cohort.head()

Unnamed: 0,subject_id,target,notes,pharm_classes,icd9_code,medication_diagnosis,medication_diagnosis_notes
0,109,1.0,alert,"Angiotensin 2 Receptor Antagonists [MoA], Angi...","Mal hyp kid w cr kid V Pneumonia, organism NOS...","Angiotensin 2 Receptor Antagonists [MoA], Angi...",alert Angiotensin 2 Receptor Antagonists [MoA]...
1,188,1.0,lethargy pain,"Increased Large Intestinal Motility [PE], Inhi...",Compl liver transplant Acute respiratry failur...,"Increased Large Intestinal Motility [PE], Inhi...",lethargy pain Increased Large Intestinal Motil...
2,222,1.0,active bleed active bleed,"Adrenergic beta-Antagonists [MoA], beta-Adrene...",Mal neo lower lobe lung Acute & chronc resp fa...,"Adrenergic beta-Antagonists [MoA], beta-Adrene...",active bleed active bleed Adrenergic beta-Anta...
3,236,1.0,normal in echogenicity pleural effusion,"Increased Large Intestinal Motility [PE], Inhi...",Acute necrosis of liver Malignant neo liver NO...,"Increased Large Intestinal Motility [PE], Inhi...",normal in echogenicity pleural effusion Increa...
4,305,1.0,pain pain pain ileus nausea vomiting pain ileus,"Insulin [CS], Insulin [EPC] Insulin [CS], Insu...","Subendo infarct, initial Mitral valve disorder...","Insulin [CS], Insulin [EPC] Insulin [CS], Insu...",pain pain pain ileus nausea vomiting pain ileu...


In [37]:
# Modelling frame: subject id, the three candidate text features, and the label.
cohort_feat = cohort[
    [
        'subject_id',
        'notes',
        'medication_diagnosis',
        'medication_diagnosis_notes',
        'target',
    ]
]

In [38]:
cohort_feat.head()

Unnamed: 0,subject_id,notes,medication_diagnosis,medication_diagnosis_notes,target
0,109,alert,"Angiotensin 2 Receptor Antagonists [MoA], Angi...",alert Angiotensin 2 Receptor Antagonists [MoA]...,1.0
1,188,lethargy pain,"Increased Large Intestinal Motility [PE], Inhi...",lethargy pain Increased Large Intestinal Motil...,1.0
2,222,active bleed active bleed,"Adrenergic beta-Antagonists [MoA], beta-Adrene...",active bleed active bleed Adrenergic beta-Anta...,1.0
3,236,normal in echogenicity pleural effusion,"Increased Large Intestinal Motility [PE], Inhi...",normal in echogenicity pleural effusion Increa...,1.0
4,305,pain pain pain ileus nausea vomiting pain ileus,"Insulin [CS], Insulin [EPC] Insulin [CS], Insu...",pain pain pain ileus nausea vomiting pain ileu...,1.0


In [39]:
# 70/30 split of the untransformed text features; the fixed seed keeps the
# split reproducible.
X_trainRaw, X_testRaw, y_trainRaw, y_testRaw = train_test_split(
    cohort_feat.drop(columns='target'),
    cohort_feat['target'],
    test_size=0.30,
    random_state=42,
)

In [40]:
X_trainRaw.head()
# y_trainRaw.head()

Unnamed: 0,subject_id,notes,medication_diagnosis,medication_diagnosis_notes
599,32658,GI bleed GI bleed nodular cirrhosis ascites di...,"Adrenergic beta-Antagonists [MoA], beta-Adrene...",GI bleed GI bleed nodular cirrhosis ascites di...
613,42327,low bibasilar opacities pleural effusions,"Adrenergic beta-Antagonists [MoA], beta-Adrene...",low bibasilar opacities pleural effusions Adre...
312,17735,ache poor appetite fatigued unable to lie flat...,"Adrenergic beta-Antagonists [MoA], beta-Adrene...",ache poor appetite fatigued unable to lie flat...
964,27390,Shortness of breath stenosis occlusion ulcerat...,"Increased Large Intestinal Motility [PE], Inhi...",Shortness of breath stenosis occlusion ulcerat...
997,43737,weakness,"Increased Large Intestinal Motility [PE], Inhi...",weakness Increased Large Intestinal Motility [...


In [41]:
# X_testRaw.head()

In [42]:
# Put the label back next to its features (column-wise concat on the shared
# row index) and save both halves of the raw split.
raw_train = pd.concat((X_trainRaw, y_trainRaw), axis='columns')
raw_test = pd.concat((X_testRaw, y_testRaw), axis='columns')

raw_train.to_csv(path_or_buf='raw_train.csv')
raw_test.to_csv(path_or_buf='raw_test.csv')

In [43]:
cohort_feat.head()

Unnamed: 0,subject_id,notes,medication_diagnosis,medication_diagnosis_notes,target
0,109,alert,"Angiotensin 2 Receptor Antagonists [MoA], Angi...",alert Angiotensin 2 Receptor Antagonists [MoA]...,1.0
1,188,lethargy pain,"Increased Large Intestinal Motility [PE], Inhi...",lethargy pain Increased Large Intestinal Motil...,1.0
2,222,active bleed active bleed,"Adrenergic beta-Antagonists [MoA], beta-Adrene...",active bleed active bleed Adrenergic beta-Anta...,1.0
3,236,normal in echogenicity pleural effusion,"Increased Large Intestinal Motility [PE], Inhi...",normal in echogenicity pleural effusion Increa...,1.0
4,305,pain pain pain ileus nausea vomiting pain ileus,"Insulin [CS], Insulin [EPC] Insulin [CS], Insu...",pain pain pain ileus nausea vomiting pain ileu...,1.0


In [44]:
def tf_idf(col, frame=None, test_size=0.30, random_state=42):
    """Build TF-IDF train/test feature tables for one text column and save them.

    The rows are split first and the vectorizer is fitted on the training rows
    only; the test rows are merely transformed. Fitting on the full cohort
    before splitting would let test-set vocabulary and document frequencies
    leak into the training features.

    Args:
        col: Name of the text column to vectorize.
        frame: DataFrame holding ``col``, ``'subject_id'`` and ``'target'``.
            Defaults to the notebook-level ``cohort_feat``.
        test_size: Fraction of rows placed in the test table.
        random_state: Seed for the split. The defaults match the raw split made
            earlier in the notebook.

    Returns:
        ``(train, test)`` DataFrames laid out as ``subject_id``, one column per
        vocabulary term (integer column labels), ``target``. The same tables
        are written to ``<col>_train.csv`` and ``<col>_test.csv``.
    """
    data = cohort_feat if frame is None else frame
    train_rows, test_rows = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    vectorizer = TfidfVectorizer(use_idf=True)
    train_matrix = vectorizer.fit_transform(train_rows[col].values.astype('U'))
    test_matrix = vectorizer.transform(test_rows[col].values.astype('U'))

    def _assemble(rows, matrix):
        # Reuse the rows' own index so the column-wise concat lines features up
        # with the right subject even when the index is not 0..n-1.
        features = pd.DataFrame(matrix.toarray(), index=rows.index)
        return pd.concat([rows['subject_id'], features, rows['target']], axis=1)

    train = _assemble(train_rows, train_matrix)
    test = _assemble(test_rows, test_matrix)

    train.to_csv(col + '_' + 'train.csv')
    test.to_csv(col + '_' + 'test.csv')
    return train, test

In [45]:
# Write the TF-IDF train/test CSVs for each candidate text feature; the return
# value is bound to `_` only to keep it out of the cell output.
for feature_col in ('notes', 'medication_diagnosis', 'medication_diagnosis_notes'):
    _ = tf_idf(feature_col)

In [None]:
# df = pd.DataFrame(tf_idf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
# df = df.sort_values('TF-IDF', ascending=False)