In [1]:
import numpy as np
import os
from random import shuffle
import re
import random

import urllib.request
import zipfile
import lxml.etree

In [2]:
def listdir_nohidden(path):
    for f in os.listdir(path):
        if not f.startswith('.'):
            yield f

In [3]:
class DS:
    
    def __init__(self, name='', challenge='', stage='', raw_text=[]):
        self.name = name
        self.challenge = challenge
        self.stage = stage
        self.labelled = 'no'
        self.label_type = 'none'
        self.raw_text = raw_text
        self.emb_text = []
        self.test_text = []
        self.raw_labels = []
    
    def processForEmbedding(self):
        self.emb_text = []
        temp = self.raw_text
        temp = re.sub(r'\d+', '<NUM>', temp)
        temp = re.sub(r'([A-Za-z]):', r'\1 :', temp)
        temp = re.sub(r'\n', ' ', temp)
        temp = re.sub(r'Dr.', 'Dr', temp)
        temp = re.sub(r'Mr.', 'Mr', temp)
        temp = re.sub(r'\. ([A-Z])', r'. A\1', temp)
        temp = re.split(r'\. [A-Z]', temp)
        for i in range(len(temp)):
            temp[i] = temp[i].lower()
            self.emb_text.append(temp[i].split())
        
    def showInfo(self):
        print('Name: ', self.name,)
        print('Challenge: ', self.challenge,)
        print('Train or Test Set: ', self.stage,)
        print('Labelled: ', self.labelled,)
        print('Labeling Type: ', self.label_type,)

In [4]:
class Set:
    
    def __init__(self, data=[]):
        self.data = []
        self.size = len(data)
        
    def present(self, name):
        for case in self.data:
            if name == case.name:
                return(True)
        return(False)
        
    def add(self, item, printdup=0):
        if not self.present(item.name):
            self.data.append(item)
            self.size += 1
        else:
            if printdup == 1:
                print('ID %s from %s is a duplicate' %(item.name, item.challenge))

    def numberOf(self, challenge = r'.', stage=r'.', labelled=r'.', label_type=r'.'):
        n = 0
        for case in self.data:
            if (re.match(challenge, case.challenge) != None) & \
            (re.match(stage, case.stage) != None) & \
            (re.match(labelled, case.labelled) != None) & \
            (re.match(label_type, case.label_type) != None):
                n += 1
        return(n)
    
    def getDS(self, name=r'.', challenge = r'.', stage=r'.', labelled=r'.', label_type=r'.'):
        output = Set()
        for case in self.data:
            if (re.match(name, case.name) != None) & \
            (re.match(challenge, case.challenge) != None) & \
            (re.match(stage, case.stage) != None) & \
            (re.match(labelled, case.labelled) != None) & \
            (re.match(label_type, case.label_type) != None):
                output.add(case)
        return(output)
    
    def showInfo(self):
        for case in self.data:
            case.showInfo()
            print('\n')
    
    def addLabels(self, name, case, raw_labels):
        for i in range(self.size):
            if self.data[i].name == name:
                self.data[i].labelled = 'yes'
                if case == 'train':
                    self.data[i].label_type = 'train'
                if case == 'test':
                    self.data[i].label_type = 'test'
                self.data[i].raw_labels = raw_labels
                break       
                
    def processForEmbedding(self):
        for i in range(self.size):
            self.data[i].processForEmbedding()
    
    def getSentences(self, challenge = r'.', stage=r'.'):
        sentences = []
        pool = Dataset.getDS(challenge=challenge, stage=stage)
        for case in pool.data:
            for sent in case.emb_text:
                sentences.append(sent)
        return(sentences)

In [5]:
Dataset = Set()

In [6]:
challenge = '2009 Medication Challenge'

path = challenge + '/training.sets.released/'
for folder in listdir_nohidden(path):
    for filename in listdir_nohidden(path + folder + '/'):
        with open(path + folder + '/' + filename, 'r') as file:
            temp = DS(name=filename, challenge=challenge, stage='train', raw_text=file.read())
            Dataset.add(temp, printdup=0)
            
path = challenge + '/test.released.8.17.09/'
for filename in listdir_nohidden(path):
    with open(path + filename, 'r') as file:
        temp = DS(name=filename, challenge=challenge, stage='test', raw_text=file.read())
        Dataset.add(temp, printdup=0)
        
path = challenge + '/training.ground.truth/'
for filename in listdir_nohidden(path):
    index = filename.split('_')[0]
    with open(path + filename, 'r') as file:
        Dataset.addLabels(name=index, case='train', raw_labels=file.read())

path = challenge + '/test.BYparticipant.ground_truth/converted.noduplicates.sorted/'
for filename in listdir_nohidden(path):
    index = filename.split('.')[0]
    with open(path + filename, 'r') as file:
        Dataset.addLabels(name=index, case='test', raw_labels=file.read())

In [7]:
challenge = '2007 Smoking Challenge'

tree = lxml.etree.parse(challenge + '/smokers_surrogate_train_all_version2.xml')
root = tree.getroot()
IDs = []
contents = []
for name in root.iter('RECORD'):
    IDs.append(name.attrib.get(name.attrib.keys()[0]))
for summary in root.iter('TEXT'):
    contents.append(summary.text)
for i in range(len(IDs)):
    temp = DS(name=IDs[i], challenge=challenge, stage='test', raw_text=contents[0])
    Dataset.add(temp, printdup=0)
    
tree = lxml.etree.parse(challenge + '/smokers_surrogate_test_all_groundtruth_version2.xml')
root = tree.getroot()
IDs = []
contents = []
for name in root.iter('RECORD'):
    IDs.append(name.attrib.get(name.attrib.keys()[0]))
for summary in root.iter('TEXT'):
    contents.append(summary.text)
for i in range(len(IDs)):
    temp = DS(name=IDs[i], challenge=challenge, stage='test', raw_text=contents[0])
    Dataset.add(temp, printdup=0)
    
with zipfile.ZipFile(challenge + '/1C smokers_surrogate_train_all_version2.zip', 'r') as z:
    tree = lxml.etree.parse(z.open('smokers_surrogate_train_all_version2.xml'))
root = tree.getroot()
IDs = []
contents = []
for name in root.iter('RECORD'):
    IDs.append(name.attrib.get(name.attrib.keys()[0]))
for summary in root.iter('TEXT'):
    contents.append(summary.text)
for i in range(len(IDs)):
    temp = DS(name=IDs[i], challenge=challenge, stage='test', raw_text=contents[0])
    Dataset.add(temp, printdup=0)

In [8]:
print('Number of Texts: ', Dataset.size)
print('Number of 2007 Challenge texts: ', Dataset.numberOf(challenge='2007 Smoking Challenge'))
print('Number of Train Texts: ', Dataset.numberOf(stage='train'))
print('Number of Test Texts: ', Dataset.numberOf(stage='test'))
print('Number of Labeled Texts: ', Dataset.numberOf(labelled='yes'))
print('Number of Initially Labeled Texts: ', Dataset.numberOf(labelled='yes', label_type='train'))
print('Number of Competitor Labeled Texts Texts: ', Dataset.numberOf(labelled='yes', label_type='test'))

Number of Texts:  1749
Number of 2007 Challenge texts:  500
Number of Train Texts:  696
Number of Test Texts:  1053
Number of Labeled Texts:  258
Number of Initially Labeled Texts:  10
Number of Competitor Labeled Texts Texts:  248


In [9]:
medications = []
labelled = Dataset.getDS(labelled='yes')

for case in labelled.data:
    for term in re.finditer(r'm="[a-z0-9 ]+"', case.raw_labels):
        temp = term.group()[3:-1]
        temp = temp.split()
        for word in temp:
            if word not in medications:
                medications.append(word)

In [10]:
Dataset.processForEmbedding()
sentences = Dataset.getSentences()

In [11]:
from gensim.models import Word2Vec



In [12]:
model_I2B2 = Word2Vec(sentences, min_count=1, size=100)

In [13]:
model_I2B2.most_similar("zestril")

[('hydrochlorothiazide', 0.9161497354507446),
 ('prozac', 0.9061295986175537),
 ('klonopin', 0.9030982851982117),
 ('lipitor', 0.9019649028778076),
 ('zantac', 0.901008129119873),
 ('cozaar', 0.8987671136856079),
 ('pravachol', 0.8979552388191223),
 ('isordil', 0.8881779313087463),
 ('celexa', 0.8871161937713623),
 ('zoloft', 0.8838896155357361)]

In [14]:
num = np.zeros(10)
for med in medications:
    num[len(med.split())] += 1
num

array([   0.,  946.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.])

In [15]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [16]:
from collections import Counter

words = []
for sent in sentences:
    for word in sent:
        words.append(word)

cnt = Counter(words).most_common(1000)
cnt = np.array(cnt)
topwords = np.ndarray.tolist(cnt[:,0])
len(topwords)

1000

In [17]:
visualisation = []

for word in medications:
    if word in model_I2B2.wv.vocab.keys():
        visualisation.append(word)

colormap = np.full(len(visualisation), "red")

for word in topwords:
    if not word in visualisation:
        visualisation.append(word)
        colormap = np.append(colormap, ["blue"])


# This assumes words_top_ted is a list of strings, the top 1000 words
words_vec = model_I2B2[visualisation]

print(len(visualisation))
print(len(words_vec))
print(len(colormap))

1800
1800
1800


In [18]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_tsne = tsne.fit_transform(words_vec)

In [19]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_tsne[:,0],
                                    x2=words_tsne[:,1],
                                    names=visualisation,
                                    coloring=colormap))

p.scatter(x="x1", y="x2", color="coloring", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)