In [3]:
!pip install nltk pandas

You should consider upgrading via the '/home/kabirkhan/Documents/Consulting/Explosion/projects/tutorials/ner_pytorch_medical/.venv/bin/python -m pip install --upgrade pip' command.[0m


In [8]:
import glob  # Finds all the pathnames matching a specified pattern, 
             # typically specified with regex (re) 
from pathlib import Path
import re
import pandas as pd
from nltk import pos_tag
from nltk.chunk.regexp import RegexpChunkParser, ChunkRule, RegexpParser
from nltk.tree import Tree
import numpy as np

# Use the full option keys: abbreviated patterns such as 'max_rows' are resolved
# by matching and raise OptionError as soon as they match more than one option.
pd.set_option('display.max_colwidth', None)  # No limit on the length of text
                                             # displayed in a cell
pd.set_option('display.max_rows', 300)  # Display up to 300 rows in a dataset

In [10]:
# Root folder of the n2c2 2011 assets and its 'output' sub-folder.
input_dir = Path('../assets/n2c2_2011/')
output_dir = input_dir.joinpath('output')

# Presumably the prefix shared by the record file names (e.g. clinical-103.txt).
base_str = 'clinical-'

In [None]:
# Annotation format (one concept per row of a .con file):
# c="<Markable>" <StartLineOffset>:<StartWordOffset> <EndLineOffset>:<EndWordOffset>||t="<Class>"


In [65]:
# Load one sample record together with its concept annotations.
anns = input_dir / "Beth_Train" / "concepts" / "clinical-103.txt.con"
p = input_dir / "Beth_Train" / "docs" / "clinical-103.txt"

# read_text() opens, reads and closes the file. The document is read once and
# split from the in-memory text instead of being opened a second time.
text = p.read_text()
lines = text.splitlines()
annotations = anns.read_text().splitlines()

lines[13]

'Chief Complaint :'

In [125]:
from collections import defaultdict
from functools import lru_cache
from typing import List, Optional

import spacy
from spacy.language import Language
from spacy.tokens import Doc, DocBin, Span
from spacy.util import filter_spans


@lru_cache(maxsize=1)
def _blank_english_pipeline() -> Language:
    """Return a blank English pipeline, created on first use and then reused."""
    return spacy.blank("en")


def docs_from_clinical_record(
    lines: List[str],
    annotations: List[str],
    merge_docs: bool = False,
    nlp: Optional[Language] = None,
) -> List[Doc]:
    """Convert one clinical record and its concept annotations into spaCy Docs.

    Each annotation row is expected to look like
    ``c="<markable>" <line>:<first_word> <line>:<last_word>||t="<class>"``.
    Line numbers are 1-based and the last-word offset is inclusive.

    Args:
        lines: The record's text, one entry per line.
        annotations: The rows of the record's concept file. Blank rows are ignored.
        merge_docs: If True, return a single Doc built with ``Doc.from_docs``;
            otherwise return one Doc per line.
        nlp: Pipeline whose ``make_doc`` tokenises each line. When omitted, a
            shared blank English pipeline is used, so the function does not
            rely on a notebook-global ``nlp``.

    Returns:
        One Doc per line, or a one-element list holding the merged Doc when
        ``merge_docs`` is True (an empty list if ``lines`` is empty).
        Entities are set only from annotations that start and end on the
        same line; other annotations are printed and skipped.
    """
    if nlp is None:
        nlp = _blank_english_pipeline()

    # (first_word, last_word, label) triples keyed by 1-based line number.
    spans_by_line = defaultdict(list)
    for row in annotations:
        if not row.strip():
            # A blank row (e.g. a trailing empty line) carries no annotation.
            continue

        parts = row.split("||")
        text_info, type_info = parts[0], parts[1]

        # The two offsets are the last space-separated fields of the first part.
        offset_start, offset_end = text_info.split(' ')[-2:]
        start_line, word_start = offset_start.split(":")
        end_line, word_end = offset_end.split(":")

        label = type_info.split('"')[-2]

        if start_line != end_line:
            # Docs are built line by line, so a concept that crosses a line
            # break cannot be attached to a single Doc; report it and move on.
            print("different line numbers")
            print(parts)
            continue

        spans_by_line[int(start_line)].append((int(word_start), int(word_end), label))

    docs = []
    for n, line in enumerate(lines, start=1):
        doc = nlp.make_doc(line)
        if n in spans_by_line:
            # NOTE(review): the offsets presumably count whitespace-separated
            # words, while make_doc() uses the pipeline's tokenizer. Confirm
            # both agree for this corpus, otherwise spans land on wrong tokens.
            candidates = [
                Span(doc, start, end + 1, label=label)  # end offset is inclusive
                for (start, end, label) in spans_by_line[n]
            ]
            # Keep only spans with real text and no leading/trailing whitespace.
            ents = [e for e in candidates if e.text.strip() and e.text.strip() == e.text]
            # filter_spans resolves overlaps, which doc.ents does not accept.
            doc.ents = filter_spans(ents)

        docs.append(doc)

    if not merge_docs:
        return docs
    # Doc.from_docs() returns None for an empty list; never hand back [None].
    return [Doc.from_docs(docs)] if docs else []


# Try the converter on the sample record, keeping one Doc per line.
docs = docs_from_clinical_record(lines, annotations, merge_docs=False)

In [126]:
def docs_from_many_clinical_records(base_path: Path, nlp: Language = spacy.blank("en"), merge_docs: bool = True) -> List[Doc]:
    """Convert every annotated record below ``base_path`` into spaCy Docs.

    Concept files are read from ``base_path / "concepts"`` (``*.txt.con``) and
    each one is paired with the document of the same name in
    ``base_path / "docs"`` (``X.txt.con`` -> ``X.txt``). Pairing by name,
    rather than by position in two sorted listings, keeps annotations attached
    to the right text even when one side has a file the other lacks.

    Args:
        base_path: Folder containing the ``concepts`` and ``docs`` sub-folders.
        nlp: Kept for interface compatibility; this function does not forward
            it to ``docs_from_clinical_record``.
        merge_docs: Passed on to ``docs_from_clinical_record``.

    Returns:
        The Docs of all records, concatenated in sorted concept-file order.
        Concept files without a matching document are reported and skipped.
    """
    all_docs = []
    docs_dir = base_path / "docs"

    for con_path in sorted((base_path / "concepts").glob("*.txt.con")):
        # Path.stem drops only the final ".con", leaving "<record>.txt".
        doc_path = docs_dir / con_path.stem
        if not doc_path.is_file():
            print(f"no document found for {con_path.name}; skipping")
            continue

        # read_text() closes the file once it has been read.
        annotations = con_path.read_text().splitlines()
        lines = doc_path.read_text().splitlines()

        all_docs += docs_from_clinical_record(lines, annotations, merge_docs=merge_docs)

    return all_docs



# Training corpus: Beth records followed by Partners records.
beth_train_docs, partners_train_docs = (
    docs_from_many_clinical_records(input_dir / site_dir)
    for site_dir in ("Beth_Train", "Partners_Train")
)
train_docs = [*beth_train_docs, *partners_train_docs]

# Test corpus: same two sites, read from the Task 1C test folders.
beth_test_docs, partners_test_docs = (
    docs_from_many_clinical_records(input_dir / site_dir)
    for site_dir in ("Task_1C/i2b2_Test/i2b2_Beth_Test", "Task_1C/i2b2_Test/i2b2_Partners_Test")
)
test_docs = [*beth_test_docs, *partners_test_docs]

len(train_docs), len(test_docs)

different line numbers
['c="naa naricha early , m.d." 140:1 141:0', 't="person"']
different line numbers
['c="vit. b-3" 41:4 42:0', 't="treatment"']
different line numbers
['c="chronic inflammatory disease" 71:18 72:0', 't="problem"']
different line numbers
['c="a lower extremity angiography" 18:17 19:1', 't="test"']
different line numbers
['c="an exploratory laparotomy" 31:33 32:1', 't="test"']
different line numbers
['c="physical therapy" 86:28 87:0', 't="treatment"']
different line numbers
['c="vit . b-3" 45:2 46:0', 't="treatment"']
different line numbers
['c="vit . b-3" 38:2 39:0', 't="treatment"']
different line numbers
['c="staph . sepsis" 6:1 7:0', 't="problem"']
different line numbers
['c="vit . b-3" 59:2 60:0', 't="treatment"']
different line numbers
['c="vit . b-3" 69:2 70:0', 't="treatment"']
different line numbers
['c="vit . b-3" 36:4 37:0', 't="treatment"']
different line numbers
['c="vit . b-3" 46:2 47:0', 't="treatment"']
different line numbers
['c="a 27 mm . st. jude p

(251, 173)

In [127]:
import random

# Number of shuffled training records kept for training; the rest form the dev set.
N_TRAIN = 200

# Rebuild the pool from the per-site lists and shuffle that fresh list, so
# re-running this cell always produces the same split. Shuffling and re-slicing
# the already truncated train_docs would leave dev_docs empty on a second run.
train_pool = beth_train_docs + partners_train_docs
random.seed(42)
random.shuffle(train_pool)
train_docs, dev_docs = train_pool[:N_TRAIN], train_pool[N_TRAIN:]

len(train_docs), len(dev_docs), len(test_docs)

(200, 51, 173)

In [128]:
# Serialise each split to ../corpus/<split>.spacy, creating the folder first
# because DocBin.to_disk() does not create missing parent directories.
corpus_dir = Path("../corpus")
corpus_dir.mkdir(parents=True, exist_ok=True)

for split_name, split_docs in (("train", train_docs), ("dev", dev_docs), ("test", test_docs)):
    DocBin(docs=split_docs).to_disk(corpus_dir / f"{split_name}.spacy")

In [7]:
def _first_number(name):
    """Return the first run of digits found in ``name``."""
    return re.findall(r'\d+', name)[0]


# Derive an id for every entry of both corpora from its first run of digits.
# NOTE(review): a_corpus and e_corpus are not defined in the visible cells;
# presumably they hold concept / document file names. Confirm before relying on this.
a_ids = tuple(sorted(_first_number(con) for con in a_corpus))
e_ids = tuple(sorted(_first_number(doc) for doc in e_corpus))

# Ids that occur on both sides; the count is reported only when it equals
# the number of concept ids.
intersection = list(set(a_ids).intersection(e_ids))
if len(a_ids) == len(intersection):
    print("Count of concept files with corresponding doc:", len(intersection))

Count of concept files with corresponding doc: 0
