In [60]:
import numpy as np
import os
from random import shuffle
import re
import sklearn as sk
import matplotlib.pyplot as plt
import random

In [61]:
import urllib.request
import zipfile
import lxml.etree

In [62]:
def listdir_nohidden(path):
    """Lazily yield the entries of `path` whose names do not start with '.'.

    The names come straight from `os.listdir`: bare entry names (not joined
    with `path`), in whatever order `os.listdir` returns them.
    """
    yield from (name for name in os.listdir(path) if not name.startswith('.'))

In [63]:
# Read every non-hidden file in the released training folders 1..10 into
# memory, one string per file.
a = np.arange(1, 11)
train_released = []

for num in a:
    path = '2009 Medication Challenge/training.sets.released/' + str(num) + '/'
    filenames = list(listdir_nohidden(path))
    for filename in filenames:
        with open(path + filename, 'r') as file:
            train_released.append(file.read())

len(train_released)

696

In [64]:
# Read every non-hidden file in the released test folder into memory, one
# string per file.
path = '2009 Medication Challenge/test.released.8.17.09/'
test_released = []

filenames = list(listdir_nohidden(path))
for filename in filenames:
    with open(path + filename, 'r') as file:
        test_released.append(file.read())

len(test_released)

1249

In [65]:
# Collect the text of every released training record that has a ground-truth
# annotation file. A ground-truth file is matched to a record by the part of
# its name before the first '_'.
train_labeled = []
a = np.arange(10) + 1

# List the ten training folders once and index the records by file name,
# instead of re-listing every folder for each ground-truth file. Folder order
# (1..10) is kept inside each list so the append order is unchanged.
train_record_paths = {}
for num in a:
    path = '2009 Medication Challenge/training.sets.released/' + str(num) + '/'
    for filename_2 in listdir_nohidden(path):
        train_record_paths.setdefault(filename_2, []).append(path + filename_2)

for filename_1 in listdir_nohidden('2009 Medication Challenge/training.ground.truth'):
    index = filename_1.split('_')[0]
    # Ground-truth files with no matching record contribute nothing.
    for record_path in train_record_paths.get(index, []):
        with open(record_path, 'r') as file:
            train_labeled.append(file.read())

len(train_labeled)

10

In [66]:
# Load every non-hidden ground-truth annotation file, in directory-listing
# order, one string per file.
path = '2009 Medication Challenge/training.ground.truth/'
train_truth = []

for filename in listdir_nohidden(path):
    with open(path + filename, 'r') as file:
        train_truth.append(file.read())

# Discard the last file that was read.
# NOTE(review): presumably this drops the one annotation file that has no
# matching record so the list lines up with train_labeled; it relies on that
# file being listed last by os.listdir -- confirm.
train_truth.pop()
len(train_truth)

10

In [67]:
# Collect the text of every released record (training folders 1..10 first,
# then the released test folder) that has a participant ground-truth file.
# A ground-truth file is matched to a record by the part of its name before
# the first '.'.
test_labeled = []
a = np.arange(10) + 1

# List each folder once and index the records by file name, instead of
# re-listing all eleven folders for every ground-truth file. Paths are stored
# in the original search order (training folders, then test folder) so the
# append order is unchanged.
record_paths = {}
for num in a:
    path = '2009 Medication Challenge/training.sets.released/' + str(num) + '/'
    for filename_2 in listdir_nohidden(path):
        record_paths.setdefault(filename_2, []).append(path + filename_2)
path = '2009 Medication Challenge/test.released.8.17.09/'
for filename_2 in listdir_nohidden(path):
    record_paths.setdefault(filename_2, []).append(path + filename_2)

for filename_1 in listdir_nohidden('2009 Medication Challenge/test.BYparticipant.ground_truth/converted.noduplicates.sorted'):
    index = filename_1.split('.')[0]
    # Ground-truth files with no matching record contribute nothing.
    for record_path in record_paths.get(index, []):
        with open(record_path, 'r') as file:
            test_labeled.append(file.read())

len(test_labeled)

249

In [68]:
# Load every non-hidden participant ground-truth file, in directory-listing
# order, one string per file.
path = '2009 Medication Challenge/test.BYparticipant.ground_truth/converted.noduplicates.sorted/'
test_truth = []

for filename in listdir_nohidden(path):
    with open(path + filename, 'r') as file:
        test_truth.append(file.read())

# Discard the last file that was read.
# NOTE(review): presumably this drops the one annotation file that has no
# matching record so the list lines up with test_labeled; it relies on that
# file being listed last by os.listdir -- confirm.
test_truth.pop()
len(test_truth)

249

In [69]:
def _split_into_sentences(text):
    """Normalise one record and return its sentences as lists of lowercase tokens.

    Steps, in order:
      * every run of digits becomes the placeholder '<NUM>';
      * a colon directly after a letter is detached ('Sig:' -> 'Sig :');
      * newlines become spaces;
      * the period of the titles 'Dr.' and 'Mr.' is removed so it is not
        treated as a sentence end;
      * the text is split wherever '. ' is followed by a capital letter;
      * each piece is lowercased and split on whitespace.

    A piece with no tokens yields an empty list.
    """
    text = re.sub(r'\d+', '<NUM>', text)
    text = re.sub(r'([A-Za-z]):', r'\1 :', text)
    text = re.sub(r'\n', ' ', text)
    # The dot must be escaped: an unescaped 'Dr.' / 'Mr.' matches any character
    # after the title and mangles words such as 'Drug' -> 'Drg' or 'Mrs' -> 'Mr'.
    text = re.sub(r'\b(Dr|Mr)\.', r'\1', text)
    # The lookahead leaves the capital letter at the start of the next sentence.
    return [piece.lower().split() for piece in re.split(r'\. (?=[A-Z])', text)]


# Tokenised sentences from every released record (training + test).
sentences = []

for text in train_released+test_released:
    sentences.extend(_split_into_sentences(text))

In [70]:
from gensim.models import Word2Vec

In [71]:
# Train a word2vec model on the tokenised sentences. min_count=1 keeps every
# token in the vocabulary, including ones seen only once; size=100 is the
# dimensionality of the word vectors.
# NOTE(review): no seed or worker count is passed, so the vectors (and the
# similarity scores shown below) may differ between runs -- confirm whether
# reproducibility matters here.
model_I2B2 = Word2Vec(sentences, min_count=1, size=100)

In [72]:
# Sanity check: nearest neighbours of a known drug name should be other drugs.
# Query through the model's KeyedVectors (`.wv`), the same object the notebook
# uses for vocabulary look-ups, rather than the deprecated model-level method.
model_I2B2.wv.most_similar("zestril")

[('imdur', 0.8462045192718506),
 ('norvasc', 0.8405196666717529),
 ('isosorbide', 0.8389357924461365),
 ('celexa', 0.8344588279724121),
 ('vasotec', 0.8293226957321167),
 ('cozaar', 0.8268866539001465),
 ('lipitor', 0.8237907886505127),
 ('prozac', 0.8234665393829346),
 ('hydrochlorothiazide', 0.8233591318130493),
 ('amlodipine', 0.8149084448814392)]

In [115]:
# Collect the distinct words that occur in medication mentions (the m="..."
# fields) of the ground-truth files, in first-seen order.
# Only mentions made up entirely of lowercase letters, digits and spaces match
# the pattern; a mention containing any other character is skipped.
medications = []
seen_words = set()  # set membership test instead of scanning the list each time

for text in train_truth+test_truth:
    for term in re.finditer(r'm="[a-z0-9 ]+"', text):
        # Strip the leading 'm="' and the trailing '"', then split into words.
        for word in term.group()[3:-1].split():
            if word not in seen_words:
                seen_words.add(word)
                medications.append(word)

In [116]:
# Tally the entries of `medications` by how many whitespace-separated tokens
# each one contains: num[k] counts the entries with k tokens (k must be < 10).
num = np.zeros(10)

for med in medications:
    token_count = len(med.split())
    num[token_count] = num[token_count] + 1

num

array([   0.,  946.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.])

In [113]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Configure Bokeh to display plots inline in the notebook when show() is called.
output_notebook()

In [156]:
from collections import Counter

# Flatten the tokenised sentences into a single list of tokens.
words = [word for sent in sentences for word in sent]

# (word, count) pairs for the 1000 most frequent tokens, most frequent first.
cnt = Counter(words).most_common(1000)

# Just the words, in the same order. Taken directly from the pairs: going
# through a NumPy array would turn the counts into strings and fails with an
# IndexError when there are no tokens at all.
present = [word for word, _ in cnt]

In [157]:
# Add every medication word that the model has a vector for to the list of
# words to plot. Words already in `present` are skipped, so a medication that
# is also one of the most common words is not plotted twice and re-running
# this cell does not append the medications again.
vocab = model_I2B2.wv.vocab
already_present = set(present)
for word in medications:
    if word in vocab and word not in already_present:
        already_present.add(word)
        present.append(word)

# `present` is a list of strings (the most common words followed by the
# medication words); look up one vector per word, in the same order.
words_vec = model_I2B2.wv[present]

In [158]:
from sklearn.manifold import TSNE
# Reduce the word vectors to two dimensions for plotting; random_state=0
# fixes the t-SNE random seed.
tsne = TSNE(n_components=2, random_state=0)
# One 2-D point per row of words_vec, in the same order.
words_tsne = tsne.fit_transform(words_vec)

In [159]:
# Interactive scatter plot of the 2-D t-SNE coordinates, one labelled point
# per word in `present`.
plot_data = dict(x1=words_tsne[:,0],
                 x2=words_tsne[:,1],
                 names=present)
source = ColumnDataSource(data=plot_data)

p = figure(title="word2vec T-SNE for most common words",
           tools="pan,wheel_zoom,reset,save",
           toolbar_location="above")
p.scatter(x="x1", y="x2", size=8, source=source)

# Text label for each point, centred horizontally and offset by y_offset=6.
labels = LabelSet(x="x1", y="x2", text="names",
                  y_offset=6, text_align='center',
                  text_font_size="8pt", text_color="#555555",
                  source=source)
p.add_layout(labels)

show(p)