In [1]:
import glob
import numpy
import pandas
import itertools
from xml.dom import minidom
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from collections import Counter

def get_annotation(element, indicator):
    """Build a normalised label for one annotation element.

    Attribute values are lower-cased, stripped of surrounding whitespace and
    have their spaces replaced by underscores before being used.

    Parameters
    ----------
    element : DOM element exposing ``tagName`` and ``getAttribute``.
    indicator : str
        Name of the attribute whose value is appended to the tag name.

    Returns
    -------
    str
        ``'<tag>.<value>'`` when the tag is ``SMOKER`` or ``FAMILY_HIST``.
    tuple of (str, str)
        ``('<tag>.<value>', '<time>')`` for every other tag, where the second
        item is the normalised ``time`` attribute.
    """
    def _normalised(attribute_name):
        # Same clean-up is applied to every attribute value that is read.
        raw_value = element.getAttribute(attribute_name)
        return raw_value.lower().strip().replace(' ', '_')

    tag = element.tagName
    label = tag.lower() + '.' + _normalised(indicator)

    # These two tags are reported without a time component.
    if tag in ('SMOKER', 'FAMILY_HIST'):
        return label
    return (label, _normalised('time'))
    
def combine_annotations(annotations):
    """Collapse ``(label, time)`` pairs into flat, de-duplicated label strings.

    A pair whose label occurs with all three of ``before_dct``, ``during_dct``
    and ``after_dct`` becomes ``'<label>.continuing'``; any other pair becomes
    ``'<label>.<time>'``. Items that are not two-element pairs (for example
    plain label strings) are passed through unchanged.

    Parameters
    ----------
    annotations : sequence
        Mix of plain label strings and ``(label, time)`` pairs.

    Returns
    -------
    list
        Unique results in order of first appearance, so the output is
        reproducible from run to run.
    """
    timeframes = ('before_dct', 'during_dct', 'after_dct')
    # Set of the tuple pairs gives constant-time membership tests instead of
    # re-scanning the whole list three times per annotation.
    known_pairs = {item for item in annotations
                   if isinstance(item, tuple) and len(item) == 2}

    results = []
    seen = set()
    for annotation in annotations:
        # A bare two-character string must not be mistaken for a pair.
        if not isinstance(annotation, str) and len(annotation) == 2:
            label, time = annotation[0], annotation[1]
            if all((label, frame) in known_pairs for frame in timeframes):
                combined = label + '.continuing'
            else:
                combined = label + '.' + time
        else:
            combined = annotation

        # De-duplicate while keeping first-seen order (a plain set would
        # return the labels in an arbitrary, hash-dependent order).
        if combined not in seen:
            seen.add(combined)
            results.append(combined)
    return results

def write_text(filename, data):
    """Write rows to ``filename`` as comma-separated lines.

    Each row in ``data`` is written on its own line, with its items converted
    via ``str`` and joined by commas. Any existing file is overwritten.

    Parameters
    ----------
    filename : str
        Path of the file to create or overwrite.
    data : iterable of iterables
        Rows to write; an empty row produces an empty line.
    """
    # The context manager closes the handle even if a row fails to serialise.
    with open(filename, 'w') as out:
        for row in data:
            out.write(','.join(str(x) for x in row) + '\n')

def write_annotations(filename, data):
    """Write annotation labels to ``filename`` as one comma-separated line.

    When ``data`` is empty the single character ``'0'`` is written instead.
    No trailing newline is added. Any existing file is overwritten.

    Parameters
    ----------
    filename : str
        Path of the file to create or overwrite.
    data : sequence
        Labels to write; each item is converted via ``str``.
    """
    # The context manager closes the handle even if serialisation fails.
    with open(filename, 'w') as out:
        if len(data) == 0:
            out.write('0')
        else:
            out.write(','.join(str(x) for x in data))

def write_to_file(filename, text, labels):
    """Write text rows followed by a label line to a single file.

    Every row of ``text`` is written as a comma-separated line. After the
    rows, ``labels`` is written comma-separated on a final line with no
    trailing newline, or the single character ``'0'`` when ``labels`` is
    empty. Any existing file is overwritten.

    Parameters
    ----------
    filename : str
        Path of the file to create or overwrite.
    text : iterable of iterables
        Rows to write; items are converted via ``str``.
    labels : sequence
        Labels for the final line; items are converted via ``str``.
    """
    # The context manager closes the handle even if serialisation fails.
    with open(filename, 'w') as out:
        for row in text:
            out.write(','.join(str(x) for x in row) + '\n')
        if len(labels) == 0:
            out.write('0')
        else:
            out.write(','.join(str(x) for x in labels))

In [2]:
# Element tag names that are looked up in each XML record.
tagnames = ['CAD', 'DIABETES', 'FAMILY_HIST', 'HYPERLIPIDEMIA', 'HYPERTENSION', 'MEDICATION', 'OBESE', 'SMOKER']
# NOTE(review): machine-specific absolute path; adjust it for your environment.
folder = '/host_home/data/i2b2/2014/testing/testing-RiskFactors-Complete'
# glob.glob returns matches in arbitrary order. Sorting makes positional
# lookups such as files[39] select the same record on every run and machine.
files = sorted(glob.glob(folder+'/*.xml'))

In [7]:
# Show which record sits at index 39 -- the single file the processing loop is restricted to.
files[39]

'/host_home/data/i2b2/2014/testing/testing-RiskFactors-Complete/119-02.xml'

In [11]:
# Built once up front: the constructor argument is constant, so there is no
# need to recreate the stemmer for every file.
stemmer = SnowballStemmer("english")
# Attribute that carries the label for each tag; tags not listed here use
# the 'indicator' attribute.
label_attribute = {'MEDICATION': 'type1', 'SMOKER': 'status'}

# Only the record at index 39 is processed for now.
for file in [files[39]]:

    root = minidom.parse(file)

    # Collect every annotation element for the configured tag names, then
    # flatten the per-tag node lists into a single list of elements.
    annotation_objects = [root.getElementsByTagName(x) for x in tagnames]
    annotation_elements = [y for x in annotation_objects for y in x]
    annotations = [get_annotation(x, label_attribute.get(x.tagName, 'indicator'))
                   for x in annotation_elements]
    annotations = combine_annotations(annotations)
    # Drop the two labels that are excluded from the final label set.
    annotations = [x for x in annotations if x != 'family_hist.not_present' and x != 'smoker.unknown']
    # Disabled: `classes` is not defined in the cells visible here.
#     encoded_annotations = [classes['I-' + x] for x in annotations if ('I-' + x) in classes]
#     encoded_annotations.sort(key=lambda x: x)

    # Sentence-split the note, then tokenise, lower-case and stem each word;
    # `words` ends up as one list of stems per sentence.
    text = root.getElementsByTagName("TEXT")[0].firstChild.data
    sentences = sent_tokenize(text)
    words = [[stemmer.stem(y.lower()) for y in word_tokenize(x)] for x in sentences]

In [10]:
# Display the stemmed tokens: one inner list per sentence of the processed record.
# NOTE(review): this prints the entire nested list; a slice such as words[:5] would keep the output short.
words

[['record',
  'date',
  ':',
  '2089-08-24',
  'name',
  ':',
  'curti',
  ',',
  'om',
  'mrn',
  ':',
  '7682941',
  'he',
  'is',
  'feel',
  'quit',
  'fine',
  'today',
  '.'],
 ['he',
  'has',
  'no',
  'specif',
  'problem',
  'to',
  'be',
  'to',
  'my',
  'attent',
  '.'],
 ['he',
  'say',
  'that',
  'he',
  'is',
  'have',
  'no',
  'problem',
  'with',
  'chest',
  'pain',
  'whatsoev',
  'and',
  'that',
  'includ',
  'with',
  'exert',
  '.'],
 ['his', 'breath', 'has', 'been', 'fine', 'as', 'well', '.'],
 ['no', 'short', 'of', 'breath', '.'],
 ['otherwis',
  'he',
  'deni',
  'ani',
  'abdomin',
  'pain',
  ',',
  'joint',
  'pain',
  ',',
  'bowel',
  'or',
  'bladder',
  'difficulti',
  '.'],
 ['the',
  'remaind',
  'of',
  'the',
  'review',
  'of',
  'system',
  'is',
  'negat',
  'in',
  'detail',
  '.'],
 ['physic', 'exam', ':', 'on', 'exam', ',', 'he', 'look', 'well', '.'],
 ['skin', 'is', 'clear', '.'],
 ['heent', '-', 'perrla', '.'],
 ['eomi', '.'],
 ['tms', 'an

In [12]:
# Display the combined, filtered annotation labels for the processed record.
annotations

['hyperlipidemia.mention.continuing',
 'hypertension.mention.continuing',
 'cad.symptom.before_dct',
 'diabetes.mention.continuing']