In [1]:
import os
import re
import json
import bisect

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# Path configuration. NOTE(review): these are absolute, machine-specific paths;
# they must be edited before the notebook can run anywhere else.

# Not referenced by any cell visible in this notebook export.
DATA_DIR_RAW = '/Users/chenkx/Desktop/TBIC-not_synced/Negation/data/2010_relations_challenge'
# Corpus root; the cells below read "<split>/txt/*.txt" and "<split>/ref/*.ann"
# under it, where <split> is "test" or "train".
DATA_DIR = r'/Users/chenkx/Box Sync/NLP group/2010 i2b2 challenge - rel'
# Despite the *_DIR names, the three MAP_* constants are file paths.
# MAP_DIR1 and MAP_DIR2 are loaded with json.load; MAP_DIR is parsed as
# comma-separated text whose first line is skipped.
MAP_DIR1 = "/Users/chenkx/Desktop/TBIC-not_synced/Negation/data/simple_header_map.json"
MAP_DIR2 = "/Users/chenkx/Desktop/TBIC-not_synced/Sectionizer/data/section_mapping/sectionTypeMapping.json"
MAP_DIR = "/Users/chenkx/Desktop/TBIC-not_synced/Negation/notebooks/2010Corpus/section_map_v1.csv"

### Combine the section maps for the train set and the test set

In [35]:
def _load_section_map(path):
    """
    Read a comma-separated header -> section mapping file into a dict.

    The first line is skipped (treated as the column header). For every other
    line, the first field becomes the key and the second field the value; any
    further fields are ignored.

    Rows with fewer than two fields are skipped. This covers the empty string
    that ``split('\\n')`` yields when the file ends with a newline, which
    previously raised IndexError.

    :param path - Path to the mapping file
    :return dict mapping first-column text to second-column text
    """
    with open(path, 'r') as f:
        rows = f.read().split('\n')[1:]

    mapping = {}
    for row in rows:
        fields = row.split(',')
        if len(fields) < 2:
            continue
        mapping[fields[0]] = fields[1]
    return mapping


train_map = _load_section_map("/Users/chenkx/Desktop/TBIC-not_synced/Negation/notebooks/2010Corpus/section_mapping_v2_train.csv")
test_map = _load_section_map("/Users/chenkx/Desktop/TBIC-not_synced/Negation/notebooks/2010Corpus/Section_mapping_v2_test.csv")

In [36]:
test_map

{'medications': 'Medications',
 'discharge date': 'Date/Time',
 'd': 'Date/Time',
 't': 'Date/Time',
 'date': 'Date/Time',
 'past medical history': 'Past history',
 'present illness': 'Present illness',
 'hospital course': 'Hospital course',
 'diagnoses': 'Diagnoses',
 'sex': 'Patient information/Demographics',
 'disposition': 'Follow-up/Instructions',
 'allergies': 'Allergies',
 'physical examination': 'Physical examination/Status',
 'report status': 'Unknown/Unclassified',
 'service': '?',
 'date of birth': 'Date/Time',
 'signed electronically by': 'Providers',
 'r': 'Unknown/Unclassified',
 'social history': 'Social history',
 'dictated by': 'Providers',
 'patient name': 'Patient information/Demographics',
 'attending': 'Providers',
 'labs': 'Laboratory tests',
 'job id': '?',
 'attending physician': 'Providers',
 'dictator': 'Providers',
 'discharge instructions': 'Follow-up/Instructions',
 'abdomen': 'Subsection',
 'cc': 'Chief complaint',
 'family history': 'Family history',
 'ca

--------

In [26]:
# Load the header -> section map stored at MAP_DIR (comma-separated text).
with open(MAP_DIR, 'r') as f:
    section_map_rows = f.read().split('\n')[1:]  # [1:] drops the header line

# First field is the key, second field the value. Rows with fewer than two
# fields (e.g. the empty string left by a trailing newline) are skipped; they
# previously raised IndexError.
section_map = {
    fields[0]: fields[1]
    for fields in (row.split(',') for row in section_map_rows)
    if len(fields) >= 2
}

In [3]:
# section_map1 is used exactly as stored in the JSON file.
with open(MAP_DIR1, 'r') as f:
    section_map1 = json.load(f)

# section_map2 is a lower-cased copy (keys and values) of the second JSON map.
with open(MAP_DIR2, 'r') as f:
    section_map2_tmp = json.load(f)
section_map2 = {
    header.lower(): section_type.lower()
    for header, section_type in section_map2_tmp.items()
}
del section_map2_tmp

In [8]:
# Base names (extension removed) of every ".txt" note in the test split.
txt_dir = os.path.join(DATA_DIR, "test", "txt")
filenames = [name[:-4] for name in os.listdir(txt_dir) if name.endswith(".txt")]

In [9]:
# Pick the first listed note as a sample for ad-hoc inspection
# (raises IndexError if no ".txt" files were found).
file = filenames[0]

In [10]:
# Read the sample note's annotation file as a list of lines (newlines kept).
# `ann` is not referenced again in the visible part of this notebook.
with open(os.path.join(DATA_DIR, 'test', 'ref', file+'.ann'), 'r') as f:
    ann = f.readlines()

In [11]:
def std_header(phrase):
    """
    Standardize a heading candidate produced by a heading regex match.

    Steps:
        1. Trim surrounding white space.
        2. Reject candidates that start with a dosage unit followed by a
           space ("mg ", "ml ", "g "); these are treated as non-headers.
        3. Convert to lower case and drop a trailing " :".
        4. Translate through section_map1, then section_map2, when the
           phrase is a key of the respective map.

    :param phrase - Raw matched text; may carry leading/trailing white space
                    (Reader.get_all_headings passes text starting with "\\n")
    :return the normalized heading, or None when the candidate is rejected
            or normalizes to an empty string
    """
    # Trim BEFORE the unit check: the pattern is anchored with "^", so on the
    # raw match text (which begins with "\n") it could never fire.
    phrase = phrase.strip()
    if re.search('(^mg )|(^ml )|(^g )', phrase):
        return None

    phrase = re.sub(" :$", "", phrase.lower())
    # The maps are applied in sequence, so a section_map1 result may be
    # translated again by section_map2.
    phrase = section_map1.get(phrase, phrase)
    phrase = section_map2.get(phrase, phrase)

    if phrase == '':
        return None

    return phrase

In [12]:
class Reader:
    """
    Load one note (``<path>/txt/<fname>.txt``) and, on request, its section
    heading candidates and its concept/assertion annotations
    (``<path>/ref/<fname>.ann``).
    """

    def __init__(self, path, fname):
        """
        Read the note text eagerly; headings and annotations are filled in
        later by get_all_headings() and get_annotation().

        :param path - Path to the folder of which subfolders include "txt" and "ref"
        :param fname - File name of the note without its extension
        """

        self._path = path
        self.fname = fname

        self.ann = {}
        self._ann_raw = []
        self.all_headings = []

        with open(os.path.join(self._path, "txt", self.fname+".txt"), 'r') as f:
            self.txt = f.read()

    def get_all_headings(self):
        """
        Append a (normalized heading, begin offset, end offset) tuple to
        self.all_headings for every heading candidate in the note.

        A candidate is a line start followed by letters/spaces/hyphens and
        then " :" plus either a newline or a space. Candidates for which
        std_header returns a falsy value are dropped. Offsets are the span of
        the whole regex match, including the leading newline.

        Results are appended, so calling this twice duplicates the entries.
        """
        # NOTE(review): the " :\n" alternative consumes the newline, so a
        # heading that starts on the very next line has no "\n" left to match
        # and is not reported.
        # Use this instance's text; a module-level `reader` variable must not
        # be required for the method to work.
        for m in re.finditer('(\n[a-zA-Z -]+)(( :\n)|( : ))', self.txt):
            header = std_header(m.group(0))
            if header:
                b, e = m.span()
                self.all_headings.append((header, b, e))

    def get_annotation(self):
        """
        Parse the note's ``.ann`` file and store the result in self.ann as:

        {
            fname: {
                item_id: {
                    b: int begin_offset,
                    e: int end_offset,
                    t: str "type",
                    a: str "assertion",
                    c: str "concept_raw_text",
                    s: str "section"   (always None here)
                }
                length: int length of the note
            }
        }

        Lines are tab-separated. Lines whose first field starts with "T"
        define concepts ("<type> <begin> <end>" in the second field, raw text
        in the third). Lines whose first field starts with "A" attach an
        assertion ("<assertion> <concept id>" in the second field). Concepts
        without an assertion are dropped. The raw lines are kept in
        self._ann_raw.
        """
        fname = self.fname
        with open(os.path.join(self._path, "ref", fname+".ann"), 'r') as f:
            ann_raw = f.read().split('\n')
        self._ann_raw = ann_raw

        # First pass: concepts.
        annotations = {}
        for line in ann_raw:
            fields = line.split('\t')
            if fields[0].startswith('T'):
                span = fields[1].split()
                annotations[fields[0]] = {
                    'b': int(span[1]),
                    'e': int(span[2]),
                    't': span[0],
                    'a': None,
                    'c': fields[2],
                    's': None
                }

        # Second pass: assertions. Done separately so that an "A" line may
        # precede the "T" line it refers to.
        for line in ann_raw:
            fields = line.split('\t')
            if fields[0].startswith('A'):
                parts = fields[1].split()
                target = parts[1]
                if target not in annotations:
                    # Warn and skip; indexing the missing concept would raise
                    # KeyError and abort the whole file.
                    print(f"Warning: {fields[0]} ??")
                    continue
                annotations[target]['a'] = parts[0]

        # Remove annotations that don't have assertion information.
        annotations = {
            item_id: item
            for item_id, item in annotations.items()
            if item['a'] is not None
        }

        annotations['length'] = len(self.txt)

        self.ann = {fname: annotations}

In [13]:
# Gather the normalized heading text of every heading candidate found in the
# test notes; duplicates are kept so that frequencies can be counted later.
test_dir = os.path.join(DATA_DIR, "test")
all_headings = []
for file in filenames:
    reader = Reader(test_dir, file)
    reader.get_all_headings()
    # Each entry is (heading, begin, end); only the heading text is needed.
    all_headings += [heading for heading, _begin, _end in reader.all_headings]

In [14]:
len(all_headings)

8709

In [16]:
len(set(all_headings))

806

In [19]:
# Read the section ontology as raw lines; the path is relative to the
# notebook's working directory.
with open('../../ClinicalSectionsOntology-v2.csv', 'r') as ontology_file:
    ont = ontology_file.read().split('\n')

In [20]:
# Build a lookup from lower-cased surface form -> ontology section name.
# For each row: column 0 is the section name (val), column 1 and column 2 are
# extra surface forms (key2, abbr), and columns 5+ hold further surface forms.
# NOTE(review): rows are split on every comma and double quotes are stripped
# afterwards, so a quoted field containing commas yields several keys —
# presumably intended (synonym lists); confirm against the CSV.
ont_map = {}
for line in ont[2:]:  # the first two lines are skipped (presumably header rows)
    fields = line.split(',')
    # Blank or truncated rows (e.g. the empty string left by a trailing
    # newline) have no columns 1-2 and previously raised IndexError.
    if len(fields) < 3:
        continue
    val, key2, abbr = fields[0], fields[1], fields[2]

    ont_map[val.lower()] = val
    for k in fields[5:]:
        k = re.sub('"', '', k).strip()
        if k != '':
            ont_map[k.lower()] = val
    if key2 != '':
        ont_map[key2.lower()] = val
    if abbr != '':
        ont_map[abbr.lower()] = val

In [27]:
# Combine the ontology with the map I created for the train set.
# section_map is unpacked last, so on a key collision its value replaces the
# ontology's value.
ont_map = {**ont_map, **section_map}

In [28]:
# Tally heading occurrences, keeping headings that ont_map can resolve apart
# from those it cannot. Insertion order (first occurrence) is preserved.
unmapped_counts = {}
mapped_counts = {}
for heading in all_headings:
    bucket = mapped_counts if heading in ont_map else unmapped_counts
    bucket[heading] = bucket.get(heading, 0) + 1

# Rows are (heading, section, count); unmapped headings get "?" as section.
freq = [(heading, "?", count) for heading, count in unmapped_counts.items()]
mapped = [(heading, ont_map[heading], count) for heading, count in mapped_counts.items()]

In [29]:
# One "heading,section,count" line per heading, most frequent first
# (ties keep their order: unmapped before mapped, then first occurrence).
rows_by_count = sorted(freq + mapped, key=lambda row: -row[2])
to_write = [
    ','.join([heading, section, str(count)])
    for heading, section, count in rows_by_count
]

In [30]:
to_write[:5]

['medications,Medications,344',
 'discharge date,Date/Time,263',
 'd,Date/Time,262',
 't,Date/Time,262',
 'date,Date/Time,234']

In [31]:
# Persist the heading table next to the notebook, one row per line.
with open('all_headings_2010-test.csv', 'w') as out_file:
    for row in to_write:
        out_file.write(row + "\n")

In [32]:
# Distinct headings with no entry in ont_map (set order, i.e. unsorted).
[i for i in set(all_headings) if i not in ont_map]
# TODO: map some of these manually.
# Note: long phrases such as "her discharge medications are as follows" are
# genuine headings too.
# TODO: sort by frequency.

['motor system',
 'treatments frequency',
 'past gynecological history',
 'please contact lab control at mattapan community health center for results',
 'sensory system',
 'acc',
 'neuro exam',
 'the patient had a repeat transthoracic echocardiogram prior to drain removal which revealed the following',
 'ambulation',
 'operations and nonsurgical procedures',
 'investigations performed',
 'ct head',
 'motor',
 'mra',
 'state newborn screen',
 'coronary disease',
 'psychiatric',
 'data',
 'bilateral lower extremities',
 'kub',
 'fluids and electrolytes',
 'mitral valve - a wave',
 'post discharge testing',
 'disposition and discharge medications',
 'feedings at discharge',
 'radiology studies',
 'procedures done during hospitalization',
 'last update',
 'coagulation',
 'renal us - impression',
 'phoen',
 'contour',
 'sensitivities',
 'discharge diagnoses - as above plus',
 'heme',
 'summary of stay',
 'other problems',
 'physical examination upon admission to the cmed ccu',
 'thorax',
 '

---------------------------

---------------------------

In [14]:
# Build a Reader for the first file name only; the loop exits right away via
# `break` and does nothing when `filenames` is empty.
# NOTE(review): `filenames` is listed from the "test" folder earlier in this
# notebook, but the note is read from "train" here — presumably the list was
# rebuilt for the train split in a cell that is no longer present; confirm
# before relying on Restart & Run All.
for f in filenames:
    reader = Reader(os.path.join(DATA_DIR, "train"), f)
    break

In [67]:
# Stricter heading pattern: only headings whose " :" is immediately followed
# by a newline, i.e. headings that sit alone on their line.
# re.findall returns one tuple of groups per match: (heading text, " :\n").
ms = re.findall('(\n[a-zA-Z -]+)( :\n)', reader.txt)
print(ms)
len(ms)

[('\nDischarge Date', ' :\n'), ('\nDate of Birth', ' :\n'), ('\nSex', ' :\n'), ('\nAllergies', ' :\n'), ('\nChief Complaint', ' :\n'), ('\nMajor Surgical or Invasive Procedure', ' :\n'), ('\nHistory of Present Illness', ' :\n'), ('\nPast Medical History', ' :\n'), ('\nSocial History', ' :\n'), ('\nFamily History', ' :\n'), ('\nPhysical Exam', ' :\n'), ('\nBrief Hospital Course', ' :\n'), ('\nIndication', ' :\n'), ('\nMedications on Admission', ' :\n'), ('\nDischarge Medications', ' :\n'), ('\nDischarge Disposition', ' :\n'), ('\nDischarge Diagnosis', ' :\n'), ('\nDischarge Condition', ' :\n'), ('\nDischarge Instructions', ' :\n'), ('\nFollowup Instructions', ' :\n')]


20

In [68]:
# Print every heading candidate in the current note that ont_map cannot
# resolve, together with up to 10 characters of context on each side.
# This pattern has no leading "\n", so it also picks up mid-line
# "label : value" fragments, not only line-initial headings.
matches = re.finditer('([a-zA-Z -]+)(( :\n)|( : ))', reader.txt)
for m in matches:
    # finditer only yields successful matches, so no truthiness check is needed.
    match = re.sub(':', '', m.group(0)).lower().strip()
    if match in ont_map:
        continue
    b, e = m.span()
    print("=======================")
    print(match)
    print("-------------")
    # Slicing clamps an end index past the string, so only the start needs a
    # guard (a negative start would count from the end of the text).
    print(reader.txt[max(b-10, 0):e+11])

sex
-------------
981-03-17
Sex :
M
Service :
service
-------------
7
Sex :
M
Service : MEDICINE
Al
major surgical or invasive procedure
-------------
 vomiting
Major Surgical or Invasive Procedure :
01-23 PORT 
vitals
-------------
al Exam :
Vitals : 97.4 , 164/
general
-------------
 , 97% 4L
General : sleepy , ar
heent
-------------
ical exam
HEENT : PERRL , lef
neck
-------------
P lesions
Neck : Supple , no
cv
-------------
 , no JVD
CV : RRR , nl S1
chest
-------------
CTAB post
Chest : HD line in 
abd
-------------
 erythema
Abd : Soft , ND ,
ext
-------------
led scars
Ext : no c/c/e , 
skin
-------------
od thrill
Skin : no rashes
B
brief hospital course
-------------
no rashes
Brief Hospital Course :
Pt admitted
hypertensive urgency
-------------
laints .
# hypertensive urgency :
Upon presen
bacteremia
-------------
egimen .
# bacteremia :
pt with 2/4
indication
-------------
 in OMR .
Indication :
diabetic ga
abdominal pain
-------------
t .
# n/v/ abdominal pain : pt with m