In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
import numpy as np
from preprocessing.cleaning_utils import *
from functools import partial
import torch 

from datasets import load_dataset
from datasets import Value, ClassLabel, Features, DatasetDict
import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import logging
from transformers import TrainingArguments, Trainer

logging.set_verbosity_warning()

In [3]:
# !python -m torch.utils.collect_env

In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [5]:
mimic_dir = "/home/vs428/project/MIMIC/files/mimiciii/1.4/"
n2c2_dir = "/home/vs428/project/n2c2/2022/N2C2-AP-Reasoning/"
n2c2_data_dir =  "/home/vs428/project/n2c2/2022/Data/"


In [6]:
samples = pd.read_csv(n2c2_dir + "n2c2_sample.csv")
samples_raw = pd.read_csv(n2c2_dir + "n2c2_sample_raw.csv")

In [7]:

# classes = ['Not Relevant', 'Neither', 'Indirect', 'Direct']
# features = Features({
#     'ROW ID':Value("int64"),
#     'HADM ID':Value("int64"),
#     'Assessment':Value("string"),
#     'PlanSubsection':Value("string"),
#     "Relation":Value("string")
# }) 
# dataset = load_dataset("csv", data_files=n2c2_dir + "n2c2_sample_raw.csv", 
#                        features=features)
# dataset = dataset.class_encode_column("Relation")
# dataset = dataset.rename_column("Relation", "label")

In [11]:
classes = ['Not Relevant', 'Neither', 'Indirect', 'Direct']
features = Features({
    'ROW ID':Value("int64"),
    'HADM ID':Value("int64"),
    'Assessment':Value("string"),
    'Plan Subsection':Value("string"),
    "Relation":Value("string")
}) 

dataset = load_dataset("csv", data_files={
                            "train":n2c2_data_dir + "train.csv",
                            "valid":n2c2_data_dir + "dev.csv",
                        },

                       features=features)
# dataset = dataset.class_encode_column("Relation")
dataset = dataset.rename_column("Relation", "label")

Using custom data configuration default-b1948d86214b7517
Reusing dataset csv (/home/vs428/.cache/huggingface/datasets/csv/default-b1948d86214b7517/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
dataset['train'][23]

{'ROW ID': 711150,
 'HADM ID': 183333,
 'Assessment': '75 yo M with h/o cardiomyopahy (EF 15%), atrial fibrillation, pulmonary\n   embolism (on coumadin), adrenal insufficiency, and s/p several recent\n   admissions following episode of Klebsilella pneumonia complicated by\n   respiratory failure and septic shock. Presents to hospital with\n   syncope, hypoxia, hypotension and altered mental status.',
 'Plan Subsection': '# Adrenal insufficiency:\n   Patient on 10 mg hydrocortisone at home.\n   - Increase to 50 mg hydrocortisone given stress of acute illness (day 2\n   today)',
 'label': 'Indirect'}

# Check some assumptions about the data

## Check if any "direct" relations have no overlap

In [20]:
import scispacy
import spacy
from scispacy.abbreviation import AbbreviationDetector


In [21]:
nlp = spacy.load("en_core_sci_lg")
nlp.add_pipe("abbreviation_detector")

<scispacy.abbreviation.AbbreviationDetector at 0x2adc5b6defe0>

In [23]:
non_overlap_direct_rels = []
for data in dataset['train']:
    if data['label'] == "Direct":
        assessment_tokens = set(w.orth for w in nlp(data['Assessment']))
        plan_tokens = set(w.orth for w in nlp(data['Plan Subsection']))
        if len(assessment_tokens.intersection(plan_tokens)) < 1:
            print(data)
            non_overlap_direct_rels.append(data)


  global_matches = self.global_matcher(doc)


{'ROW ID': 476539, 'HADM ID': 111846, 'Assessment': 'This is a 55M with COPD/CO2 retainer, VTE,  bilateral chronic\n   lymphedema and multiple ulcers who was admited for LE cellulitis/sepsis\n   and who underwent BKA three days ago is transferred to MICU for medical\n   management', 'Plan Subsection': 'Hypotension: resolved', 'label': 'Direct'}
{'ROW ID': 358146, 'HADM ID': 106860, 'Assessment': '79F Russian speaking s/p CABG, dCHF s/p dual pacemaker, DMII, AF,\n   CRI that presents with decreased PO and hypotension likely [**2-19**] to\n   decresed PO in setting of UTI and potential PNA.', 'Plan Subsection': 'HYPOTENSION (NOT SHOCK)\n COUGH', 'label': 'Direct'}
{'ROW ID': 634432, 'HADM ID': 199781, 'Assessment': '57 y.o. man with DM, HTN, and paranoid schizophrenia, presents from\n   inpatient Psych facility altered mental status', 'Plan Subsection': 'Respiratory alkalosis: Resolved on intubation', 'label': 'Direct'}
{'ROW ID': 685471, 'HADM ID': 106691, 'Assessment': '48yo F intoxica

In [27]:
non_overlap_direct_df = pd.DataFrame(non_overlap_direct_rels)

In [36]:

for idx, data in non_overlap_direct_df.iterrows():
    prompt_str = 'Assessment: ' + data['Assessment'] + '\nPlan: ' + data['Plan Subsection'] + '\nLabel: ' \
        + str(data['label'])
    print(prompt_str)
    print("\n---------------------------------------------\n")

Assessment: This is a 55M with COPD/CO2 retainer, VTE,  bilateral chronic
   lymphedema and multiple ulcers who was admited for LE cellulitis/sepsis
   and who underwent BKA three days ago is transferred to MICU for medical
   management
Plan: Hypotension: resolved
Label: Direct

---------------------------------------------

Assessment: 79F Russian speaking s/p CABG, dCHF s/p dual pacemaker, DMII, AF,
   CRI that presents with decreased PO and hypotension likely [**2-19**] to
   decresed PO in setting of UTI and potential PNA.
Plan: HYPOTENSION (NOT SHOCK)
 COUGH
Label: Direct

---------------------------------------------

Assessment: 57 y.o. man with DM, HTN, and paranoid schizophrenia, presents from
   inpatient Psych facility altered mental status
Plan: Respiratory alkalosis: Resolved on intubation
Label: Direct

---------------------------------------------

Assessment: 48yo F intoxicated s/p fall down stairs with
   severe facial trauma, elective OSH intubation for airway protec

## "Indirect" Relations that don't have much overlap

In [37]:
non_overlap_indirect_rels = []
for data in dataset['train']:
    if data['label'] == "Indirect":
        assessment_tokens = set(w.orth for w in nlp(data['Assessment']))
        plan_tokens = set(w.orth for w in nlp(data['Plan Subsection']))
        if len(assessment_tokens.intersection(plan_tokens)) < 1:
            print(data)
            non_overlap_indirect_rels.append(data)


  global_matches = self.global_matcher(doc)


{'ROW ID': 553008, 'HADM ID': 195768, 'Assessment': 'MYOCARDIAL INFARCTION, ACUTE (AMI, STEMI, NSTEMI)\n GASTROINTESTINAL BLEED, LOWER (HEMATOCHEZIA, BRBPR, GI BLEED, GIB)\n TACHYCARDIA, OTHER\n RESPIRATORY FAILURE, ACUTE (NOT ARDS/[**Doctor Last Name 2**])\n   69F w/ h/o afib on coumadin, PUD, COPD, CKD, h/o CVA, HTN,\n   hyperlipidemia transferred from [**Hospital1 709**] by family request after presenting\n   with black stools, developed hypoxemia and tachyarrhythmia with RVR,\n   now with continued respiratory distress complicated by demand ischemia.', 'Plan Subsection': '# Hypercholesterolemia: Continue statin', 'label': 'Indirect'}
{'ROW ID': 580343, 'HADM ID': 183659, 'Assessment': '53M with HIV/hepC, admitted with nausea, weakness, hypotension', 'Plan Subsection': '# hypoglycemia - likely [**12-30**] septic picture and possibly complicated by\n   ESLD.\n   - D50 as needed.\n   - check FSBS QID.', 'label': 'Indirect'}
{'ROW ID': 657100, 'HADM ID': 100571, 'Assessment': 'his is a

In [38]:
non_overlap_indirect_df = pd.DataFrame(non_overlap_indirect_rels)

In [39]:

for idx, data in non_overlap_indirect_df.iterrows():
    prompt_str = 'Assessment: ' + data['Assessment'] + '\nPlan: ' + data['Plan Subsection'] + '\nLabel: ' \
        + str(data['label'])
    print(prompt_str)
    print("\n---------------------------------------------\n")

Assessment: MYOCARDIAL INFARCTION, ACUTE (AMI, STEMI, NSTEMI)
 GASTROINTESTINAL BLEED, LOWER (HEMATOCHEZIA, BRBPR, GI BLEED, GIB)
 TACHYCARDIA, OTHER
 RESPIRATORY FAILURE, ACUTE (NOT ARDS/[**Doctor Last Name 2**])
   69F w/ h/o afib on coumadin, PUD, COPD, CKD, h/o CVA, HTN,
   hyperlipidemia transferred from [**Hospital1 709**] by family request after presenting
   with black stools, developed hypoxemia and tachyarrhythmia with RVR,
   now with continued respiratory distress complicated by demand ischemia.
Plan: # Hypercholesterolemia: Continue statin
Label: Indirect

---------------------------------------------

Assessment: 53M with HIV/hepC, admitted with nausea, weakness, hypotension
Plan: # hypoglycemia - likely [**12-30**] septic picture and possibly complicated by
   ESLD.
   - D50 as needed.
   - check FSBS QID.
Label: Indirect

---------------------------------------------

Assessment: his is a 66 y/o M hx seizures from EEE encephalitis p/w status c/b
   aspiration PNA, intub

In [40]:
mimic_dir = "/home/vs428/project/MIMIC/files/mimiciii/1.4/"
notes = pd.read_csv(mimic_dir + "NOTEEVENTS.csv")

  notes = pd.read_csv(mimic_dir + "NOTEEVENTS.csv")


In [47]:
# notes[notes['HADM_ID'] == 183659]
print(notes[notes['ROW_ID'] == 580343]['TEXT'].tolist()[0])

Chief Complaint:  weakness, shortness of breath.
   HPI:
   53M with h/o ESLD [**12-30**] hepC, HIV (CD4 322), pHTN, presenting to ED this
   morning with weakness, nausea, and cough.  Per his caregiver [**Name (NI) 1708**], he
   was in his USOH until 4AM, and had been doing relatively well since his
   last discharge [**3-29**] at which time dobhoff was replaced [**12-30**] weight loss,
   tolerating tube feeds poorly, but recently switching to oral feeds only
   for the past 1.5 weeks.
   .
   He had no complaints prior to going to bed, other than his baseline
   abdominal discomfort, and nausea.  His caregiver found him at 4AM
   slumped towards the left on the couch, incontient of urine and stool
   (wears depends at baseline), and disoriented.  EMS was activated, and
   he was brought to ED.
   .
   The patient denies confusion, hemetemesis, melena, or change in his
   chronic abdominal pain, and has been having daily soft bowel movements
   on lactulose.
   .
   In ED VS=98.1  9