In [1]:
import os
import re 
import xml.etree.ElementTree as et
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from nltk.tokenize import WhitespaceTokenizer
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import type_of_target, unique_labels
from sklearn.utils.sparsefuncs import count_nonzero
from sklearn.utils.validation import column_or_1d
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from tqdm import tqdm_notebook

######### for data #########
def get_text_tags(xml_file):
    """
    This function loads individual xml file in ibi2 2014 Heart Disease Risk Factors Challenge. 
    Returns the clinical note as a string and tags as a list of xml object
    """
    text = et.parse(xml_file).getroot()[0].text
    tags = []
    for i in et.parse(xml_file).getroot()[1]:
        for j in i:
            tag = j.attrib
            tag.update({'tag': j.tag})
            tags.append(tag)       
    return text, tags

def get_words_locations(text, punc='!",.:;?'):
    """
    This function tokenize the clinical note as a list of words, with the consideration of removing parenthesis pairs. 
    It also returns the start and end index of each words in original clinical note string. 
    """
    words = []
    locations = []
    for i in WhitespaceTokenizer().span_tokenize(text):
        start = i[0]
        end = i[1]
        original_word = text[start:end]
        if original_word.startswith('(') and original_word.endswith(')'):
            word = original_word.lstrip('(').rstrip(')')
            start = start + 1
            end = end - 1
        elif original_word.startswith('(') and not(original_word.endswith(')')):
            word = original_word.lstrip('(')
            start = start + 1
        elif not(original_word.startswith('(')) and original_word.endswith(')') and '(' not in original_word:
            word = original_word.rstrip(')')
            end = end - 1
        else:
            word = original_word.rstrip(punc)
            end = end - len(original_word) + len(word)
        if word != '':
            words.append(word)
            locations.append((start, end))
    return words, locations

def get_labels(tags, locations):
    """
    This function takes the tags and the start and end index of each words in original clinical note string, 
    converts to a list of customized tags correspnding to the words list.  
    """
    labels = []
    for start, end in locations:
        token_tags = ['O']
        for j in tags:
            try:
                if start >= int(j.get('start')) and end <= int(j.get('end')):
                    token_tags = []
            except:
                continue
        for j in tags:
            try:
                if start >= int(j.get('start')) and end <= int(j.get('end')):
                    if j['tag'] == 'MEDICATION':
                        token_tag = 'I'+'.'+j['tag']+'.'+j.get('type1',).replace(' ', '_')+'.'+j.get('time').replace(' ', '_')
                    elif j['tag'] == 'FAMILY_HIST':
                        token_tag = 'I'+'.'+j['tag']+'.'+j.get('indicator').replace(' ', '_')+'.NA'
                    elif j['tag'] == 'SMOKER':
                        token_tag = 'I'+'.'+j['tag']+'.'+j.get('status')+'.NA'
                    else:
                        token_tag = 'I'+'.'+j['tag']+'.'+j.get('indicator').replace(' ', '_').replace('.', '')+'.'+j.get('time').replace(' ', '_')
                    token_tags.append(token_tag)
                    type_tag = j['tag']
                    tagged = True
            except:
                continue
        labels.append(set(token_tags))
    return labels

def get_words_labels(file):
    """
    This function loads individual xml file in ibi2 2014 Heart Disease Risk Factors Challenge. 
    Returns the words list of the clinical note, list of customized tags correspndingly and the set of all customized tags except 'O'.
    """
    text, tags = get_text_tags(file)    
    words, locations = get_words_locations(text)
    labels = get_labels(tags, locations)
    gold_labels = set([j for i in labels for j in i if j != 'O'])
    return words, labels, gold_labels

def get_tagged_sents(file, re_sep='\n'): # re_sep='\n|(?<!\d)[,.]|[,.](?!\d)' -- some bugs
    """
    This function loads individual xml file in ibi2 2014 Heart Disease Risk Factors Challenge. 
    Returns the list of lines (sentences---bugs) that are tagged with any except 'O', in the form of words lists.
    """
    p = re.compile(re_sep)
    tagged_sents = []
    text, tags = get_text_tags(file)  
    pl = [i.start() for i in p.finditer(text)]
    for i, _ in enumerate(pl):
        start = pl[i]
        try:
            end = pl[i+1]
        except:
            end = len(text)
        sent = text[start:end]
        tagged = False
        for j in tags:
            try:
                if start <= int(j.get('start')) and end >= int(j.get('end')):
                    tagged = True
            except:
                continue
        if tagged:
            s = sent.strip()
            #if s.startswith('.') or s.startswith(','):
            #    s = s[1:].strip()
            words, _ = get_words_locations(s)
            tagged_sents.append(words)
    return tagged_sents

def slice_x_in_y(x, y, sep='|'):
    """
    Gets the start and end index of the string list x out of the string list y.
    Make sure sep is not in either x or y.
    """
    x1 = sep.join(x)
    y1 = sep.join(y)
    nx = x1.count(sep)
    n1 = None
    n2 = None
    if x1 in y1:
        if x[0] != y[0] and x[-1] != y[-1]:
            y12 =  y1.replace(x1,'').split(sep*2)
            n1 = y12[0].count(sep) + 1
            n2 = nx + n1 + 1
        if x[0] == y[0]:
            n1 = 0
            n2 = len(x)
        if x[-1] == y[-1]:
            n1 = len(y) - len(x)
            n2 = len(y)
    return slice(n1, n2)

def get_tagged_sents_labels(tagged_sents, words, labels):
    """
    This function gets the list of corresponding tags lists of the tagged sentences in the clinical notes.
    """
    tagged_sents_labels = []
    for sent in tagged_sents:
        l = labels[slice_x_in_y(sent, words)]
        tagged_sents_labels.append(l)
    return tagged_sents_labels

def get_all_words_labels(file):
    """
    This function gets words, tags, concatenated words of tagged sentences and their taggs.
    """
    words, labels, gold_labels = get_words_labels(file)
    tagged_sents = get_tagged_sents(file)
    tagged_sents_labels = get_tagged_sents_labels(tagged_sents, words, labels)
    up_words = [item for sublist in tagged_sents for item in sublist]
    up_labels = [item for sublist in tagged_sents_labels for item in sublist]
    return words, labels, gold_labels, up_words, up_labels

def up_sampling(file, n=0):
    """
    This function gets words and tags. It then add tagged sentences and their taggs for n times for up-sampling.
    """
    words, labels, gold_labels, up_words, up_labels = get_all_words_labels(file)
    words += n * up_words
    labels += n * up_labels
    return words, labels, gold_labels

def get_all_notes_labels(xml_folder):
    notes = []
    notes_labels = []
    up_notes = []
    up_notes_labels = []
    notes_gold_labels = []
    for i in tqdm_notebook(os.listdir(xml_folder)):
        file = os.path.join(xml_folder, i)
        if file.endswith('.xml'):
            words, labels, gold_labels, up_words, up_labels = get_all_words_labels(file)
            notes.append(words)
            notes_labels.append(labels)
            up_notes.append(up_words)
            up_notes_labels.append(up_labels)
            notes_gold_labels.append(gold_labels)
    return notes, notes_labels, up_notes, up_notes_labels, notes_gold_labels

def process_data(xml_folder, up=0):
    notes = []
    notes_labels = []
    notes_gold_labels = []
    if up > 0:
        print('Loading files with '+str(up)+' times upsampling for tagged lines in '+xml_folder)
    else:
        print('Loading files in '+xml_folder)
    for i in tqdm_notebook(os.listdir(xml_folder)):
        file = os.path.join(xml_folder, i)    
        words, labels, gold_labels = up_sampling(file, n=up)
        notes.append(words)
        notes_labels.append(labels)
        notes_gold_labels.append(gold_labels)
    return notes, notes_labels, notes_gold_labels

def check_data(xml_folder):
    notes = []
    notes_labels = []
    tagged_notes = []
    tagged_notes_labels = []
    for i in os.listdir(xml_folder):
        file = os.path.join(xml_folder, i)    
        words, labels, _ = get_words_labels(file)
        tagged_sents = get_tagged_sents(file)
        tagged_sents_labels = get_tagged_sents_labels(tagged_sents, words, labels)
        notes.append(words)
        notes_labels.append(labels)
        tagged_notes.append(tagged_sents)
        tagged_notes_labels.append(tagged_sents_labels)
    return notes, notes_labels, tagged_notes, tagged_notes_labels

######### other helper #########
# function help convert labels to category labels
def get_cat_labels(label):
    c = '.'
    positions = [pos for pos, char in enumerate(label) if char == c]
    if label != 'O':
        sl = slice(positions[0]+1,positions[1])
        cat_label = label[sl]
    else:
        cat_label = label
    return cat_label

# function help convert labels to category and indicator labels
def get_cat_ind_labels(label):
    c = '.'
    positions = [pos for pos, char in enumerate(label) if char == c]
    if label != 'O':
        sl = slice(positions[0]+1,positions[2])
        cat_ind_label = label[sl]
    else:
        cat_ind_label = label
    return cat_ind_label

# function help convert labels to time_flattened labels
def get_time_flattened_label(label):
    c = '.'
    positions = [pos for pos, char in enumerate(label) if char == c]
    if label != 'O':
        sl = slice(positions[0]+1,positions[2])
        time_flattened_labels = [label[sl], label[positions[2]+1:]]
    else:
        time_flattened_labels = [label]
    return time_flattened_labels

# function help convert flattened labels back to gold labels
def get_flattened_reverted(y_pred, mlb, all_gold_label):
    
    flattened_pred = mlb.inverse_transform(y_pred>0.5)
    categories = ['CAD', 'DIABETES', 'FAMILY_HIST', 'HYPERLIPIDEMIA', 'HYPERTENSION', 'MEDICATION', 'O', 'OBESE', 'SMOKER']
    times = [ 'after_DCT', 'before_DCT', 'during_DCT', 'NA']
    indicators = [i for i in mlb.classes_ if i not in times + categories]
    
    x = set()
    for i in flattened_pred:
        cat = [j for j in i if j in categories]
        ind = [j for j in i if j in indicators]
        tim = [j for j in i if j in times]
        for m in cat:
            if m !='O':
                for n in ind:
                    for l in tim:
                        tag = 'I.' + m + '.' + n + '.' + l
                        if tag in all_gold_label:
                            x.add(tag)
                            
    return x

# function help convert time_flattened labels back to gold labels
def get_time_flattened_reverted(y_pred, mlb, all_gold_label):
    
    time_flattened_pred = mlb.inverse_transform(y_pred>0.5)
    cat_ind_lst = ['CAD.event', 'CAD.mention', 'CAD.symptom', 'CAD.test',
                   'DIABETES.A1C', 'DIABETES.glucose', 'DIABETES.mention',
                   'FAMILY_HIST.present', 'HYPERLIPIDEMIA.high_LDL',
                   'HYPERLIPIDEMIA.high_chol', 'HYPERLIPIDEMIA.mention',
                   'HYPERTENSION.high_bp', 'HYPERTENSION.mention',
                   'MEDICATION.ACE_inhibitor', 'MEDICATION.ARB',
                   'MEDICATION.DPP4_inhibitors', 'MEDICATION.anti_diabetes',
                   'MEDICATION.aspirin', 'MEDICATION.beta_blocker',
                   'MEDICATION.calcium_channel_blocker', 'MEDICATION.diuretic',
                   'MEDICATION.ezetimibe', 'MEDICATION.fibrate', 'MEDICATION.insulin',
                   'MEDICATION.metformin', 'MEDICATION.niacin', 'MEDICATION.nitrate',
                   'MEDICATION.statin', 'MEDICATION.sulfonylureas',
                   'MEDICATION.thiazolidinedione', 'MEDICATION.thienopyridine',
                   'OBESE.BMI', 'OBESE.mention', 'SMOKER.current', 'SMOKER.ever',
                   'SMOKER.never', 'SMOKER.past', 'SMOKER.unknown']
    times = ['after_DCT', 'before_DCT', 'during_DCT', 'NA']
    
    x = set()
    for i in time_flattened_pred:
        cat_ind = [j for j in i if j in cat_ind_lst]
        tim = [j for j in i if j in times]
        for m in cat_ind:
            for n in tim:
                tag = 'I.' + m + '.' + n
                if tag in all_gold_label:
                    x.add(tag)

    return x

# prepare features
def get_features(max_features, notes_train, notes_test, verbose=1):
    
    if verbose != 0: print('preparing features ...')
    notes = notes_train + notes_test
    X_txt = [' '.join(i) for i in notes]
    X_train_txt = [' '.join(i) for i in notes_train]
    X_test_txt = [' '.join(i) for i in notes_test]
    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(X_txt)
    X_seq = tokenizer.texts_to_sequences(X_txt) 
    X_train_seq = tokenizer.texts_to_sequences(X_train_txt) 
    X_test_seq = tokenizer.texts_to_sequences(X_test_txt)
    word_index = tokenizer.word_index
    
    return X_train_seq, X_test_seq, word_index
    
# prepare targets
def get_targets(labels_train, labels_test, category=None, verbose=1):    
    if verbose != 0: print('preparing targets ...')
    labels = labels_train + labels_test
       
    if category == 'cat_only':
        # prepare cagtegory label targets
        labels = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels]
        labels_train = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels_train]
        labels_test = [[set([get_cat_labels(i) for i in list(j)]) for j in k] for k in labels_test]
    elif category == 'cat_ind':
        # prepare cagtegory indicator label targets
        labels = [[set([get_cat_ind_labels(i) for i in list(j)]) for j in k] for k in labels]
        labels_train = [[set([get_cat_ind_labels(i) for i in list(j)]) for j in k] for k in labels_train]
        labels_test = [[set([get_cat_ind_labels(i) for i in list(j)]) for j in k] for k in labels_test]
    elif category == 'flattened':    
        labels = [[set([m for n in [i.split('.') for i in list(j)] for m in n if m != 'I']) for j in k] for k in labels]
        labels_train = [[set([m for n in [i.split('.') for i in list(j)] for m in n if m != 'I']) for j in k] for k in labels_train]
        labels_test = [[set([m for n in [i.split('.') for i in list(j)] for m in n if m != 'I']) for j in k] for k in labels_test]
    elif category == 'time_flattened':   
        labels = [[set([m for n in [get_time_flattened_label(i) for i in list(j)] for m in n if m != 'I']) for j in k] for k in labels]
        labels_train = [[set([m for n in [get_time_flattened_label(i) for i in list(j)] for m in n if m != 'I']) for j in k] for k in labels_train]
        labels_test = [[set([m for n in [get_time_flattened_label(i) for i in list(j)] for m in n if m != 'I']) for j in k] for k in labels_test]
    else:
        pass
    
    all_labels = [label for notes_label in labels for label in notes_label]
    
    mlb = MultiLabelBinarizer()
    mlb.fit(all_labels)
    num_labels = len(mlb.classes_)
    Y_train = []
    Y_test = []
    for i in labels_train:
        l = mlb.transform(i)
        Y_train.append(l)
    for i in labels_test:
        l = mlb.transform(i)
        Y_test.append(l)
    return Y_train, Y_test, mlb, num_labels

# prepare gold label targets
def get_gold_label_targets(Y_pred, gold_labels, gold_labels_test, mlb, category=None, verbose=1):
    if verbose != 0: print('preparing gold label targets ...')
    if category == 'cat_only':
        gold_labels = [{get_cat_labels(val) for val in gold_label} for gold_label in gold_labels]
        gold_labels_test = [{get_cat_labels(val) for val in gold_label} for gold_label in gold_labels_test]
        gold_labels_pred = [{i for s in mlb.inverse_transform(y_pred>0.5) for i in s if i != 'O'} for y_pred in Y_pred]
    elif category == 'cat_ind':
        gold_labels = [{get_cat_ind_labels(val) for val in gold_label} for gold_label in gold_labels]
        gold_labels_test =[{get_cat_ind_labels(val) for val in gold_label} for gold_label in gold_labels_test]
        gold_labels_pred = [{i for s in mlb.inverse_transform(y_pred>0.5) for i in s if i != 'O'} for y_pred in Y_pred]
    elif category == 'flattened':
        all_gold_label = {i for gold_label in gold_labels for i in gold_label}
        gold_labels_pred = [get_flattened_reverted(i, mlb, all_gold_label) for i in Y_pred]    
    elif category == 'time_flattened':
        all_gold_label = {i for gold_label in gold_labels for i in gold_label}
        gold_labels_pred = [get_time_flattened_reverted(i, mlb, all_gold_label) for i in Y_pred] 
    else:
        gold_labels_pred = [{i for s in mlb.inverse_transform(y_pred>0.5) for i in s if i != 'O'} for y_pred in Y_pred]
        
    gmlb = MultiLabelBinarizer()
    gmlb.fit(gold_labels)
    Y_gold_test = gmlb.transform(gold_labels_test)
    Y_gold_pred = gmlb.transform(gold_labels_pred)
    return Y_gold_test, Y_gold_pred, gmlb

# data generator function
def data_generator(X_seq, Y):
    while True:
        for x, y in zip(X_seq, Y):
            x = np.array(x).reshape((1,-1))
            y = np.array(y).reshape((1,-1, y.shape[1]))
            yield x, y
    
######### for embedding matrix #########
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

def get_embedding_matrix(embedding_index, word_index, max_features, embed_size):
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: 
            continue
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i-1] = embedding_vector
    return embedding_matrix


######### for Markdown in Demo #########
def underline_and_tag(text, locations, labels):
    sl = []
    last = len(text)
    sl.append(0)
    lab = []
    for i, j in zip(locations, labels):
        if 'O' not in j:
            sl.append(i[0])
            sl.append(i[1])
            lab.append(j)
    sl.append(last)
    nl = []
    for i, _ in enumerate(sl):
        if i < len(sl) - 1:
            nl.append(slice(sl[i], sl[i+1]))
    frg = []
    for i in nl:
        frg.append(text[i])
    new_lab = []
    new_front = []
    new_lab.append('')
    new_front.append('')
    for i in lab:
        i = [j[2:] for j in i]
        new_lab.append('</u></b><sub font-size: small><mark style="background-color: lightblue">'+','.join(i)+'</mark></sub>')
        new_front.append('<u><b>')
        new_lab.append('')
        new_front.append('')
    markdown = []
    for i , j, k in zip(new_front, frg, new_lab):
        markdown.append(i+j+k)
        
    show = ''.join(markdown).replace("\n", "<br>")
    return show

######### for multiple comfusion matrix #########
def _check_targets(y_true, y_pred):
    
    check_consistent_length(y_true, y_pred)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    y_type = {type_true, type_pred}
    if y_type == {"binary", "multiclass"}:
        y_type = {"multiclass"}

    if len(y_type) > 1:
        raise ValueError("Classification metrics can't handle a mix of {0} "
                         "and {1} targets".format(type_true, type_pred))

    # We can't have more than one value on y_type => The set is no more needed
    y_type = y_type.pop()

    # No metrics support "multiclass-multioutput" format
    if (y_type not in ["binary", "multiclass", "multilabel-indicator"]):
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ["binary", "multiclass"]:
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)
        if y_type == "binary":
            unique_values = np.union1d(y_true, y_pred)
            if len(unique_values) > 2:
                y_type = "multiclass"

    if y_type.startswith('multilabel'):
        y_true = csr_matrix(y_true)
        y_pred = csr_matrix(y_pred)
        y_type = 'multilabel-indicator'

    return y_type, y_true, y_pred


def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, labels=None, samplewise=False):
    
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
    check_consistent_length(y_true, y_pred, sample_weight)

    if y_type not in ("binary", "multiclass", "multilabel-indicator"):
        raise ValueError("%s is not supported" % y_type)

    present_labels = unique_labels(y_true, y_pred)
    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        n_labels = len(labels)
        labels = np.hstack([labels, np.setdiff1d(present_labels, labels,
                                                 assume_unique=True)])

    if y_true.ndim == 1:
        if samplewise:
            raise ValueError("Samplewise metrics are not available outside of "
                             "multilabel classification.")

        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = np.bincount(tp_bins, weights=tp_bins_weights,
                                 minlength=len(labels))
        else:
            # Pathological case
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = np.bincount(y_pred, weights=sample_weight,
                                   minlength=len(labels))
        if len(y_true):
            true_sum = np.bincount(y_true, weights=sample_weight,
                                   minlength=len(labels))

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]

    else:
        sum_axis = 1 if samplewise else 0

        # All labels are index integers for multilabel.
        # Select labels:
        if not np.array_equal(labels, present_labels):
            if np.max(labels) > np.max(present_labels):
                raise ValueError('All labels must be in [0, n labels) for '
                                 'multilabel targets. '
                                 'Got %d > %d' %
                                 (np.max(labels), np.max(present_labels)))
            if np.min(labels) < 0:
                raise ValueError('All labels must be in [0, n labels) for '
                                 'multilabel targets. '
                                 'Got %d < 0' % np.min(labels))

        if n_labels is not None:
            y_true = y_true[:, labels[:n_labels]]
            y_pred = y_pred[:, labels[:n_labels]]

        # calculate weighted counts
        true_and_pred = y_true.multiply(y_pred)
        tp_sum = count_nonzero(true_and_pred, axis=sum_axis,
                               sample_weight=sample_weight)
        pred_sum = count_nonzero(y_pred, axis=sum_axis,
                                 sample_weight=sample_weight)
        true_sum = count_nonzero(y_true, axis=sum_axis,
                                 sample_weight=sample_weight)

    fp = pred_sum - tp_sum
    fn = true_sum - tp_sum
    tp = tp_sum

    if sample_weight is not None and samplewise:
        sample_weight = np.array(sample_weight)
        tp = np.array(tp)
        fp = np.array(fp)
        fn = np.array(fn)
        tn = sample_weight * y_true.shape[1] - tp - fp - fn
    elif sample_weight is not None:
        tn = sum(sample_weight) - tp - fp - fn
    elif samplewise:
        tn = y_true.shape[1] - tp - fp - fn
    else:
        tn = y_true.shape[0] - tp - fp - fn

    return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2)

Using TensorFlow backend.


In [2]:
notes_train_1, labels_train_1, gold_labels_train_1 = process_data('/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set1') 
notes_train_2, labels_train_2, gold_labels_train_2 = process_data('/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set2') 
notes_train = notes_train_1 + notes_train_2
labels_train = labels_train_1 + labels_train_2
gold_labels_train = gold_labels_train_1 + gold_labels_train_2
notes_test, labels_test, gold_labels_test = process_data('/host_home/data/i2b2/2014/testing/testing-RiskFactors-Complete') 
notes = notes_train + notes_test
labels = labels_train + labels_test
gold_labels = gold_labels_train + gold_labels_test

Loading files in /host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set1


HBox(children=(IntProgress(value=0, max=521), HTML(value='')))


Loading files in /host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set2


HBox(children=(IntProgress(value=0, max=269), HTML(value='')))


Loading files in /host_home/data/i2b2/2014/testing/testing-RiskFactors-Complete


HBox(children=(IntProgress(value=0, max=514), HTML(value='')))




In [83]:
from IPython.display import display, Markdown

file = '/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set1/220-01.xml'
text, tags = get_text_tags(file)    
words, locations = get_words_locations(text)
labels = get_labels(tags, locations)

In [5]:
display(Markdown(text))




Record date: 2067-05-03

Narrative History

   55 yo woman who presents for f/u 

   

   Seen in Cardiac rehab locally last week and BP 170/80.  They called us and we increased her HCTZ to 25 mg from 12.5 mg.  States her BP's were fine there since - 130-140/70-80.

   

   

   Saw Dr Oakley 4/5/67 - she was happy with results of ETT at Clarkfield.  To f/u 7/67.  No CP's since last admit.

   

   Back to work and starting to walk.  No wt loss and discouraged by this, but just starting to exercise.

   

   No smoking for 3 months now!

   

   Still with hotflashes, wakes her up at night.

Problems

      FH breast cancer   37 yo s 



      FH myocardial infarction   mother died 66 yo 



      Hypertension



      Uterine fibroids   u/s 2062 



      Smoking



      hyperlipidemia   CRF mild chol, cigs, HTN, Fhx and known hx CAD in pt. 



      borderline diabetes mellitus   4/63 125 , follow hgbaic 



      VPB   2065 - ETT showed freq PVC 



      coronary artery disease   s/p ant SEMI + stent LAD 2/67, Dr Oakley 



      thyroid nodule   2065, hot, follow TSH. 

Medications

      NORVASC (AMLODIPINE)     5MG  1 Tablet(s) PO QD  



      PLAVIX (CLOPIDOGREL)   75 MG     PO QD  



      ATENOLOL     50MG  1 Tablet(s) PO QD  



      ASA (ACETYLSALICYLIC ACID)     325MG  1 Tablet(s) PO QD  



      ZESTRIL (LISINOPRIL)     40MG  1 Tablet(s) PO QD   



      LIPITOR (ATORVASTATIN)     10MG  1 Tablet(s) PO QD  



      HCTZ (HYDROCHLOROTHIAZIDE)     25MG  1 Tablet(s) PO QD   



      NITROGLYCERIN 1/150 (0.4 MG)   1 TAB     SL x1  PRN prn CP 

Allergies

      CECLOR (CEFACLOR)   Rash   

Vital Signs

      BLOOD PRESSURE-SITTING   150/70 

         repeat 145/80   HR 60 reg  WT  202 lbs

Physical Exam

   Looks well.  Lungs clear,  CVS  RRRs1s2, Ext - no edema	

Assessment and Plan

   1. H/M - Pe next appt in few months.  Overdue for mammo - will need to stress this.

   2.  CAD - cont with cardiac rehab, exercise, Dr Oakley, but doing well.

   3.  BP - better on increased HCTZ.  Check labs next visit, good K recently and on ACEI.

   4.  Cigs - great!

   5. Hot flashes - doesn't want to try any other meds - will try to put up with it.

   6. Hot thyroid nodule - recheck TSH next visit.

   7. Borderline glc - ok hgba1c 3/67.

   8.  Chol - ok, fasting on next visit.

   

   







In [25]:
print("\033[4mhello\033[0m")

[4mhello[0m


In [91]:
def underline_and_tag(text, locations, labels):
    sl = []
    last = len(text)
    sl.append(0)
    lab = []
    for i, j in zip(locations, labels):
        if 'O' not in j:
            sl.append(i[0])
            sl.append(i[1])
            lab.append(j)
    sl.append(last)
    nl = []
    for i, _ in enumerate(sl):
        if i < len(sl) - 1:
            nl.append(slice(sl[i], sl[i+1]))
    frg = []
    for i in nl:
        frg.append(text[i])
    new_lab = []
    new_front = []
    new_lab.append('')
    new_front.append('')
    for i in lab:
        i = [j[2:] for j in i]
        new_lab.append('</u></b><sub font-size: small><mark style="background-color: lightblue">'+','.join(i)+'</mark></sub>')
        new_front.append('<u><b>')
        new_lab.append('')
        new_front.append('')
    markdown = []
    for i , j, k in zip(new_front, frg, new_lab):
        markdown.append(i+j+k)
        
    show = ''.join(markdown).replace("\n", "<br>")
    return show

In [156]:
def underline_and_tag1(text, locations, labels):
    lines = text.split('\n')
    l = []
    n = 0
    for i in lines:
        m = n
        n = n + len(i) + 1
        kkk = []
        for (j, k), kk in zip(locations, labels):
            max_len = 0
            for ii in kk:
                if len(ii) > max_len:
                    max_len = len(ii)
            if m <= j and n >= k:
                if list(kk) != ['O']:
                    kkk.append(((j-m, k-m), kk, max_len-k+j))
        nn = 0
        newll = ''
        newl = text[m:n]
        for (_, jjj), mmm, ml in kkk:
            ll = newl[:nn+jjj]
            newll += _*' '+list(mmm)[0]
            newl = ll+ml*' '+newl[nn+jjj:]
            
            nn += ml
            
            
        l.append((0, len(i)+1, text[m:n], kkk, newl, newll))
    return l

In [157]:
underline_and_tag1(text, locations, labels)

[(0, 1, '\n', [], '\n', ''),
 (0, 1, '\n', [], '\n', ''),
 (0, 1, '\n', [], '\n', ''),
 (0, 24, 'Record date: 2067-05-03\n', [], 'Record date: 2067-05-03\n', ''),
 (0, 1, '\n', [], '\n', ''),
 (0, 18, 'Narrative History\n', [], 'Narrative History\n', ''),
 (0, 1, '\n', [], '\n', ''),
 (0,
  37,
  '   55 yo woman who presents for f/u \n',
  [],
  '   55 yo woman who presents for f/u \n',
  ''),
 (0, 1, '\n', [], '\n', ''),
 (0, 4, '   \n', [], '   \n', ''),
 (0, 1, '\n', [], '\n', ''),
 (0,
  179,
  "   Seen in Cardiac rehab locally last week and BP 170/80.  They called us and we increased her HCTZ to 25 mg from 12.5 mg.  States her BP's were fine there since - 130-140/70-80.\n",
  [((47, 49), {'I.HYPERTENSION.high_bp.before_DCT'}, 31),
   ((50, 56), {'I.HYPERTENSION.high_bp.before_DCT'}, 27),
   ((95, 99),
    {'I.MEDICATION.diuretic.after_DCT',
     'I.MEDICATION.diuretic.before_DCT',
     'I.MEDICATION.diuretic.during_DCT'},
    28)],
  "   Seen in Cardiac rehab locally last week and

In [68]:
len("   Seen in Cardiac rehab locally last week and ")

47

In [69]:
"   Seen in Cardiac rehab locally last week and BP 170/80.  They called us and we increased her HCTZ to 25 mg from 12.5 mg.  States her BP's were fine there since - 130-140/70-80.\n"[47:49]

'BP'

In [13]:
text

"\n\n\nRecord date: 2067-05-03\n\nNarrative History\n\n   55 yo woman who presents for f/u \n\n   \n\n   Seen in Cardiac rehab locally last week and BP 170/80.  They called us and we increased her HCTZ to 25 mg from 12.5 mg.  States her BP's were fine there since - 130-140/70-80.\n\n   \n\n   \n\n   Saw Dr Oakley 4/5/67 - she was happy with results of ETT at Clarkfield.  To f/u 7/67.  No CP's since last admit.\n\n   \n\n   Back to work and starting to walk.  No wt loss and discouraged by this, but just starting to exercise.\n\n   \n\n   No smoking for 3 months now!\n\n   \n\n   Still with hotflashes, wakes her up at night.\n\nProblems\n\n      FH breast cancer   37 yo s \n\n\n\n      FH myocardial infarction   mother died 66 yo \n\n\n\n      Hypertension\n\n\n\n      Uterine fibroids   u/s 2062 \n\n\n\n      Smoking\n\n\n\n      hyperlipidemia   CRF mild chol, cigs, HTN, Fhx and known hx CAD in pt. \n\n\n\n      borderline diabetes mellitus   4/63 125 , follow hgbaic \n\n\n\n      VPB 

In [94]:
x = [((47, 49), {'I.HYPERTENSION.high_bp.before_DCT'}, 31),
   ((50, 56), {'I.HYPERTENSION.high_bp.before_DCT'}, 27),
   ((95, 99),
    {'I.MEDICATION.diuretic.after_DCT',
     'I.MEDICATION.diuretic.before_DCT',
     'I.MEDICATION.diuretic.during_DCT'},
    28)]

In [108]:
[(list(i[1])) for i in x]

[['I.HYPERTENSION.high_bp.before_DCT'],
 ['I.HYPERTENSION.high_bp.before_DCT'],
 ['I.MEDICATION.diuretic.before_DCT',
  'I.MEDICATION.diuretic.during_DCT',
  'I.MEDICATION.diuretic.after_DCT']]

In [114]:
max([len(list(i[1])) for i in x])

3

In [133]:
a = 0
for j, jj, jjj in x:
    print(j)
    print([xx for xx in jj] + ['']*((max([len(list(i[1])) for i in x]))-len(jj)))
    for y in [xx for xx in jj] + ['']*((max([len(list(i[1])) for i in x]))-len(jj)):
        print((j[0]+a)*' '+y)
    a += jjj + j[1] - j[0]

(47, 49)
['I.HYPERTENSION.high_bp.before_DCT', '', '']
                                               I.HYPERTENSION.high_bp.before_DCT
                                               
                                               
(50, 56)
['I.HYPERTENSION.high_bp.before_DCT', '', '']
                                                                                   I.HYPERTENSION.high_bp.before_DCT
                                                                                   
                                                                                   
(95, 99)
['I.MEDICATION.diuretic.before_DCT', 'I.MEDICATION.diuretic.during_DCT', 'I.MEDICATION.diuretic.after_DCT']
                                                                                                                                                                 I.MEDICATION.diuretic.before_DCT
                                                                                                                     

In [140]:
a = 0
h = []
for j, jj, jjj in x:
    h.append(j[0]*' ')
    h.append(list(jj)[0])

In [141]:
h

['                                               ',
 'I.HYPERTENSION.high_bp.before_DCT',
 '                                                  ',
 'I.HYPERTENSION.high_bp.before_DCT',
 '                                                                                               ',
 'I.MEDICATION.diuretic.before_DCT']

In [92]:
show = underline_and_tag(text, locations, labels)

In [7]:
text

"\n\n\nRecord date: 2067-05-03\n\nNarrative History\n\n   55 yo woman who presents for f/u \n\n   \n\n   Seen in Cardiac rehab locally last week and BP 170/80.  They called us and we increased her HCTZ to 25 mg from 12.5 mg.  States her BP's were fine there since - 130-140/70-80.\n\n   \n\n   \n\n   Saw Dr Oakley 4/5/67 - she was happy with results of ETT at Clarkfield.  To f/u 7/67.  No CP's since last admit.\n\n   \n\n   Back to work and starting to walk.  No wt loss and discouraged by this, but just starting to exercise.\n\n   \n\n   No smoking for 3 months now!\n\n   \n\n   Still with hotflashes, wakes her up at night.\n\nProblems\n\n      FH breast cancer   37 yo s \n\n\n\n      FH myocardial infarction   mother died 66 yo \n\n\n\n      Hypertension\n\n\n\n      Uterine fibroids   u/s 2062 \n\n\n\n      Smoking\n\n\n\n      hyperlipidemia   CRF mild chol, cigs, HTN, Fhx and known hx CAD in pt. \n\n\n\n      borderline diabetes mellitus   4/63 125 , follow hgbaic \n\n\n\n      VPB 

In [93]:
display(Markdown(show))

<br><br><br>Record date: 2067-05-03<br><br>Narrative History<br><br>   55 yo woman who presents for f/u <br><br>   <br><br>   Seen in Cardiac rehab locally last week and <u><b>BP</u></b><sub font-size: small><mark style="background-color: lightblue">HYPERTENSION.high_bp.before_DCT</mark></sub> <u><b>170/80</u></b><sub font-size: small><mark style="background-color: lightblue">HYPERTENSION.high_bp.before_DCT</mark></sub>.  They called us and we increased her <u><b>HCTZ</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.diuretic.before_DCT,MEDICATION.diuretic.during_DCT,MEDICATION.diuretic.after_DCT</mark></sub> to 25 mg from 12.5 mg.  States her BP's were fine there since - 130-140/70-80.<br><br>   <br><br>   <br><br>   Saw Dr Oakley 4/5/67 - she was happy with results of ETT at Clarkfield.  To f/u 7/67.  No CP's since last admit.<br><br>   <br><br>   Back to work and starting to walk.  No wt loss and discouraged by this, but just starting to exercise.<br><br>   <br><br>   <u><b>No</u></b><sub font-size: small><mark style="background-color: lightblue">SMOKER.past.NA</mark></sub> <u><b>smoking</u></b><sub font-size: small><mark style="background-color: lightblue">SMOKER.past.NA</mark></sub> <u><b>for</u></b><sub font-size: small><mark style="background-color: lightblue">SMOKER.past.NA</mark></sub> <u><b>3</u></b><sub font-size: small><mark style="background-color: lightblue">SMOKER.past.NA</mark></sub> <u><b>months</u></b><sub font-size: small><mark style="background-color: lightblue">SMOKER.past.NA</mark></sub> <u><b>now</u></b><sub font-size: small><mark style="background-color: lightblue">SMOKER.past.NA</mark></sub>!<br><br>   <br><br>   Still with hotflashes, wakes her up at night.<br><br>Problems<br><br>      FH breast cancer   37 yo s <br><br><br><br>      FH myocardial infarction   mother died 66 yo <br><br><br><br>      <u><b>Hypertension</u></b><sub font-size: small><mark style="background-color: lightblue">HYPERTENSION.mention.after_DCT,HYPERTENSION.mention.before_DCT,HYPERTENSION.mention.during_DCT</mark></sub><br><br><br><br>      Uterine fibroids   u/s 2062 <br><br><br><br>      Smoking<br><br><br><br>      <u><b>hyperlipidemia</u></b><sub font-size: small><mark style="background-color: lightblue">HYPERLIPIDEMIA.mention.after_DCT,HYPERLIPIDEMIA.mention.during_DCT,HYPERLIPIDEMIA.mention.before_DCT</mark></sub>   CRF mild chol, cigs, <u><b>HTN</u></b><sub font-size: small><mark style="background-color: lightblue">HYPERTENSION.mention.after_DCT,HYPERTENSION.mention.before_DCT,HYPERTENSION.mention.during_DCT</mark></sub>, Fhx and <u><b>known</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.mention.during_DCT,CAD.mention.after_DCT,CAD.mention.before_DCT</mark></sub> <u><b>hx</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.mention.during_DCT,CAD.mention.after_DCT,CAD.mention.before_DCT</mark></sub> <u><b>CAD</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.mention.during_DCT,CAD.mention.after_DCT,CAD.mention.before_DCT</mark></sub> in pt. <br><br><br><br>      borderline diabetes mellitus   4/63 125 , follow hgbaic <br><br><br><br>      VPB   2065 - ETT showed freq PVC <br><br><br><br>      <u><b>coronary</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.mention.during_DCT,CAD.mention.after_DCT,CAD.mention.before_DCT</mark></sub> <u><b>artery</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.mention.during_DCT,CAD.mention.after_DCT,CAD.mention.before_DCT</mark></sub> <u><b>disease</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.mention.during_DCT,CAD.mention.after_DCT,CAD.mention.before_DCT</mark></sub>   <u><b>s/p</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub> <u><b>ant</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub> <u><b>SEMI</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub> <u><b>+</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub> <u><b>stent</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub> <u><b>LAD</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub> <u><b>2/67</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub>, <u><b>Dr</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub> <u><b>Oakley</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.event.before_DCT</mark></sub> <br><br><br><br>      thyroid nodule   2065, hot, follow TSH. <br><br>Medications<br><br>      <u><b>NORVASC</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.calcium_channel_blocker.before_DCT,MEDICATION.calcium_channel_blocker.during_DCT,MEDICATION.calcium_channel_blocker.after_DCT</mark></sub> (<u><b>AMLODIPINE</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.calcium_channel_blocker.before_DCT,MEDICATION.calcium_channel_blocker.during_DCT,MEDICATION.calcium_channel_blocker.after_DCT</mark></sub>)     5MG  1 Tablet(s) PO QD  <br><br><br><br>      <u><b>PLAVIX</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.mention.during_DCT,CAD.mention.after_DCT,CAD.mention.before_DCT,MEDICATION.thienopyridine.before_DCT,MEDICATION.thienopyridine.during_DCT,MEDICATION.thienopyridine.after_DCT</mark></sub> (<u><b>CLOPIDOGREL</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.thienopyridine.after_DCT,MEDICATION.thienopyridine.before_DCT,MEDICATION.thienopyridine.during_DCT</mark></sub>)   75 MG     PO QD  <br><br><br><br>      <u><b>ATENOLOL</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.beta_blocker.after_DCT,MEDICATION.beta_blocker.during_DCT,MEDICATION.beta_blocker.before_DCT</mark></sub>     50MG  1 Tablet(s) PO QD  <br><br><br><br>      <u><b>ASA</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.aspirin.after_DCT,MEDICATION.aspirin.during_DCT,MEDICATION.aspirin.before_DCT</mark></sub> (<u><b>ACETYLSALICYLIC</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.aspirin.after_DCT,MEDICATION.aspirin.during_DCT,MEDICATION.aspirin.before_DCT</mark></sub> <u><b>ACID</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.aspirin.after_DCT,MEDICATION.aspirin.during_DCT,MEDICATION.aspirin.before_DCT</mark></sub>)     325MG  1 Tablet(s) PO QD  <br><br><br><br>      <u><b>ZESTRIL</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.ACE_inhibitor.during_DCT,MEDICATION.ACE_inhibitor.after_DCT,MEDICATION.ACE_inhibitor.before_DCT</mark></sub> (<u><b>LISINOPRIL</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.ACE_inhibitor.during_DCT,MEDICATION.ACE_inhibitor.after_DCT,MEDICATION.ACE_inhibitor.before_DCT</mark></sub>)     40MG  1 Tablet(s) PO QD   <br><br><br><br>      <u><b>LIPITOR</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.statin.after_DCT,MEDICATION.statin.before_DCT,MEDICATION.statin.during_DCT</mark></sub> (<u><b>ATORVASTATIN</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.statin.after_DCT,MEDICATION.statin.before_DCT,MEDICATION.statin.during_DCT</mark></sub>)     10MG  1 Tablet(s) PO QD  <br><br><br><br>      <u><b>HCTZ</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.diuretic.before_DCT,MEDICATION.diuretic.during_DCT,MEDICATION.diuretic.after_DCT</mark></sub> (<u><b>HYDROCHLOROTHIAZIDE</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.diuretic.before_DCT,MEDICATION.diuretic.during_DCT,MEDICATION.diuretic.after_DCT</mark></sub>)     25MG  1 Tablet(s) PO QD   <br><br><br><br>      <u><b>NITROGLYCERIN</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.nitrate.before_DCT,MEDICATION.nitrate.during_DCT,MEDICATION.nitrate.after_DCT</mark></sub> <u><b>1/150</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.nitrate.before_DCT,MEDICATION.nitrate.during_DCT,MEDICATION.nitrate.after_DCT</mark></sub> (<u><b>0.4</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.nitrate.before_DCT,MEDICATION.nitrate.during_DCT,MEDICATION.nitrate.after_DCT</mark></sub> <u><b>MG</u></b><sub font-size: small><mark style="background-color: lightblue">MEDICATION.nitrate.before_DCT,MEDICATION.nitrate.during_DCT,MEDICATION.nitrate.after_DCT</mark></sub>)   1 TAB     SL x1  PRN prn CP <br><br>Allergies<br><br>      CECLOR (CEFACLOR)   Rash   <br><br>Vital Signs<br><br>      BLOOD PRESSURE-SITTING   <u><b>150/70</u></b><sub font-size: small><mark style="background-color: lightblue">HYPERTENSION.high_bp.during_DCT</mark></sub> <br><br>         <u><b>repeat</u></b><sub font-size: small><mark style="background-color: lightblue">HYPERTENSION.high_bp.during_DCT</mark></sub> <u><b>145/80</u></b><sub font-size: small><mark style="background-color: lightblue">HYPERTENSION.high_bp.during_DCT</mark></sub>   HR 60 reg  WT  202 lbs<br><br>Physical Exam<br><br>   Looks well.  Lungs clear,  CVS  RRRs1s2, Ext - no edema	<br><br>Assessment and Plan<br><br>   1. H/M - Pe next appt in few months.  Overdue for mammo - will need to stress this.<br><br>   2.  <u><b>CAD</u></b><sub font-size: small><mark style="background-color: lightblue">CAD.mention.during_DCT,CAD.mention.after_DCT,CAD.mention.before_DCT</mark></sub> - cont with cardiac rehab, exercise, Dr Oakley, but doing well.<br><br>   3.  BP - better on increased HCTZ.  Check labs next visit, good K recently and on ACEI.<br><br>   4.  Cigs - great!<br><br>   5. Hot flashes - doesn't want to try any other meds - will try to put up with it.<br><br>   6. Hot thyroid nodule - recheck TSH next visit.<br><br>   7. Borderline glc - ok hgba1c 3/67.<br><br>   8.  Chol - ok, fasting on next visit.<br><br>   <br><br>   <br><br><br><br><br><br>

In [44]:
show

'\n\n\nRecord date: 2067-05-03\n\nNarrative History\n\n   55 yo woman who presents for f/u \n\n   \n\n   Seen in Cardiac rehab locally last week and <u><b>BP</u></b><sub><mark style="background-color: lightblue">I.HYPERTENSION.high_bp.before_DCT</mark></sub> <u><b>170/80</u></b><sub><mark style="background-color: lightblue">I.HYPERTENSION.high_bp.before_DCT</mark></sub>.  They called us and we increased her <u><b>HCTZ</u></b><sub><mark style="background-color: lightblue">I.MEDICATION.diuretic.during_DCT,I.MEDICATION.diuretic.after_DCT,I.MEDICATION.diuretic.before_DCT</mark></sub> to 25 mg from 12.5 mg.  States her BP\'s were fine there since - 130-140/70-80.\n\n   \n\n   \n\n   Saw Dr Oakley 4/5/67 - she was happy with results of ETT at Clarkfield.  To f/u 7/67.  No CP\'s since last admit.\n\n   \n\n   Back to work and starting to walk.  No wt loss and discouraged by this, but just starting to exercise.\n\n   \n\n   <u><b>No</u></b><sub><mark style="background-color: lightblue">I.SMOKE

In [9]:
Y_train, Y_test, mlb, num_labels = get_targets(labels_train, labels_test, category='flattened', verbose=1)
Y_gold_test, Y_gold_pred, gmlb = get_gold_label_targets(Y_test, gold_labels, gold_labels_test, mlb, category='flattened', verbose=1)

preparing targets ...
preparing gold label targets ...


In [10]:
mlb.classes_

array(['A1C', 'ACE_inhibitor', 'ARB', 'BMI', 'CAD', 'DIABETES',
       'DPP4_inhibitors', 'FAMILY_HIST', 'HYPERLIPIDEMIA', 'HYPERTENSION',
       'MEDICATION', 'NA', 'O', 'OBESE', 'SMOKER', 'after_DCT',
       'anti_diabetes', 'aspirin', 'before_DCT', 'beta_blocker',
       'calcium_channel_blocker', 'current', 'diuretic', 'during_DCT',
       'event', 'ever', 'ezetimibe', 'fibrate', 'glucose', 'high_LDL',
       'high_bp', 'high_chol', 'insulin', 'mention', 'metformin', 'never',
       'niacin', 'nitrate', 'past', 'present', 'statin', 'sulfonylureas',
       'symptom', 'test', 'thiazolidinedione', 'thienopyridine',
       'unknown'], dtype=object)

In [11]:
gmlb.classes_

array(['I.CAD.event.after_DCT', 'I.CAD.event.before_DCT',
       'I.CAD.event.during_DCT', 'I.CAD.mention.after_DCT',
       'I.CAD.mention.before_DCT', 'I.CAD.mention.during_DCT',
       'I.CAD.symptom.after_DCT', 'I.CAD.symptom.before_DCT',
       'I.CAD.symptom.during_DCT', 'I.CAD.test.before_DCT',
       'I.CAD.test.during_DCT', 'I.DIABETES.A1C.before_DCT',
       'I.DIABETES.A1C.during_DCT', 'I.DIABETES.glucose.before_DCT',
       'I.DIABETES.glucose.during_DCT', 'I.DIABETES.mention.after_DCT',
       'I.DIABETES.mention.before_DCT', 'I.DIABETES.mention.during_DCT',
       'I.FAMILY_HIST.present.NA', 'I.HYPERLIPIDEMIA.high_LDL.before_DCT',
       'I.HYPERLIPIDEMIA.high_LDL.during_DCT',
       'I.HYPERLIPIDEMIA.high_chol.before_DCT',
       'I.HYPERLIPIDEMIA.high_chol.during_DCT',
       'I.HYPERLIPIDEMIA.mention.after_DCT',
       'I.HYPERLIPIDEMIA.mention.before_DCT',
       'I.HYPERLIPIDEMIA.mention.during_DCT',
       'I.HYPERTENSION.high_bp.before_DCT',
       'I.HYPERTENSION.

In [12]:
print(len(gmlb.classes_), len(mlb.classes_), len(Y_train), len(Y_test), len(Y_gold_test), len(Y_gold_pred), num_labels)

96 47 790 514 514 514 47


In [10]:
Y_gold_test, Y_gold_pred, gmlb = get_gold_label_targets(Y_test, gold_labels, gold_labels_test, mlb, category='time_flattened', verbose=1)

preparing gold label targets ...


In [12]:
gmlb.classes_

array(['I.CAD.event.after_DCT', 'I.CAD.event.before_DCT',
       'I.CAD.event.during_DCT', 'I.CAD.mention.after_DCT',
       'I.CAD.mention.before_DCT', 'I.CAD.mention.during_DCT',
       'I.CAD.symptom.after_DCT', 'I.CAD.symptom.before_DCT',
       'I.CAD.symptom.during_DCT', 'I.CAD.test.before_DCT',
       'I.CAD.test.during_DCT', 'I.DIABETES.A1C.before_DCT',
       'I.DIABETES.A1C.during_DCT', 'I.DIABETES.glucose.before_DCT',
       'I.DIABETES.glucose.during_DCT', 'I.DIABETES.mention.after_DCT',
       'I.DIABETES.mention.before_DCT', 'I.DIABETES.mention.during_DCT',
       'I.FAMILY_HIST.present.NA', 'I.HYPERLIPIDEMIA.high_LDL.before_DCT',
       'I.HYPERLIPIDEMIA.high_LDL.during_DCT',
       'I.HYPERLIPIDEMIA.high_chol.before_DCT',
       'I.HYPERLIPIDEMIA.high_chol.during_DCT',
       'I.HYPERLIPIDEMIA.mention.after_DCT',
       'I.HYPERLIPIDEMIA.mention.before_DCT',
       'I.HYPERLIPIDEMIA.mention.during_DCT',
       'I.HYPERTENSION.high_bp.before_DCT',
       'I.HYPERTENSION.

In [None]:
notes_train_1, labels_train_1, gold_labels_train_1 = process_data('/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set1') 
notes_train_2, labels_train_2, gold_labels_train_2 = process_data('/host_home/data/i2b2/2014/training/training-RiskFactors-Complete-Set2') 
notes_train = notes_train_1 + notes_train_2
labels_train = labels_train_1 + labels_train_2
gold_labels_train = gold_labels_train_1 + gold_labels_train_2
notes_test, labels_test, gold_labels_test = process_data('/host_home/data/i2b2/2014/testing/testing-RiskFactors-Complete') 
notes = notes_train + notes_test
labels = labels_train + labels_test
gold_labels = gold_labels_train + gold_labels_test