# Baseline (part 1)

Implementing the second part of the solution described in:
"Franck Dernoncourt, Ji Young Lee, and Peter
Szolovits. 2016. Neural networks for joint sentence
classification in medical paper abstracts. European
Chapter of the Association for Computational Linguistics
(EACL) 2017."

In [1]:
import numpy as np
import pandas as pd
import os
import gc

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import matplotlib.pyplot as plt
%matplotlib inline  

Using TensorFlow backend.


## Get Data

In [2]:
# Let long sentences be shown (almost) in full when frames are displayed.
pd.set_option('max_colwidth', 500)

# The CSV is produced by script01_create_single_dataset and holds all
# partitions in a single table.
df_all = pd.read_csv('input/PubMed_20k_RCT.csv')

# One frame per value of the `partition` column.
df_train = df_all.loc[df_all['partition'].eq('train')]
df_valid = df_all.loc[df_all['partition'].eq('dev')]
df_test = df_all.loc[df_all['partition'].eq('test')]

df_all.head()

Unnamed: 0,partition,abstract_id,seq,text,label
0,train,4293578,0,"To investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .",OBJECTIVE
1,train,4293578,1,A total of 125 patients with primary knee OA were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .,METHODS
2,train,4293578,2,Outcome measures included pain reduction and improvement in function scores and systemic inflammation markers .,METHODS
3,train,4293578,3,Pain was assessed using the visual analog pain scale ( 0-100 mm ) .,METHODS
4,train,4293578,4,"Secondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and 6-min walk distance ( 6MWD ) .",METHODS


In [3]:
# Raw sentence texts of every partition, in the row order of df_all.
X_all = df_all.text.values

# Number of rows in each partition.
X_train_cnt = len(df_train)
X_valid_cnt = len(df_valid)
X_test_cnt = len(df_test)

print('Train partition size: {}'.format(X_train_cnt))
print('Valid partition size: {}'.format(X_valid_cnt))
print('Test partition size: {}'.format(X_test_cnt))
print('Total dataset size: {}'.format(len(X_all)))

Train partition size: 180040
Valid partition size: 30212
Test partition size: 30135
Total dataset size: 240387


## Create token sequences

In [4]:
%%time

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_all)
sequences = tokenizer.texts_to_sequences(X_all)

word_index = tokenizer.word_index
VOC_SIZE = len(word_index)
print('Vocabulary size = {}'.format(VOC_SIZE))

Vocabulary size = 67356
CPU times: user 11.8 s, sys: 82.5 ms, total: 11.9 s
Wall time: 11.9 s


In [5]:
# Fixed number of tokens per sentence fed to the network.
MAX_SEQ_LEN = 50

# Bring every sequence to exactly MAX_SEQ_LEN entries: shorter ones are
# padded with 0 at the front, longer ones are cut off at the end.
X_token_seq_all = pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LEN,
    dtype='int32',
    padding='pre',
    truncating='post',
    value=0
)

## Vectorize output labels

In [6]:
%%time

labels = df_all.label.values
label_dict = {label: no for no, label in enumerate(set(labels))}
number_of_classes = len(label_dict)

# get labels as integers
y_all = [label_dict[label] for label in labels]

# change y to categorical (vectorize output)
y_all = np.array([to_categorical(i, num_classes=number_of_classes) for i in y_all])

CPU times: user 1.84 s, sys: 58 ms, total: 1.9 s
Wall time: 1.9 s


## Prepare embedding layer with pre-trained glove word vectors

I use GloVe embeddings pre-trained on Twitter. <br> 
My code follows the guidelines from the Keras tutorial: <br>
https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [7]:
%%time

# read pre-trained glove embeddings

# create dictionary with word embeddings
embedding_file = {}
embedding_file[25] = 'glove/glove.twitter.27B.25d.txt'
embedding_file[50] = 'glove/glove.twitter.27B.50d.txt'
embedding_file[100] = 'glove/glove.twitter.27B.100d.txt'
embedding_file[200] = 'glove/glove.twitter.27B.200d.txt'

# set embedding dimension
EMBEDDING_DIM = 200

embeddings_index = {}
print('Using embedding dim = {}'.format(EMBEDDING_DIM))

f = open(embedding_file[EMBEDDING_DIM], 'r')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Number of word vectors found: {}'.format(len(embeddings_index)))

Using embedding dim = 200
Number of word vectors found: 1193514
CPU times: user 1min 3s, sys: 2.08 s, total: 1min 5s
Wall time: 1min 6s


In [8]:
%%time

# create matrix with embedding coefs for each word in vocabulary
# row position of word representation in the matrix will be the word number

# initiazes matrix with zeros
embedding_matrix = np.zeros((VOC_SIZE + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

CPU times: user 122 ms, sys: 30.7 ms, total: 153 ms
Wall time: 153 ms


In [13]:
%%time

from keras.models import Model
from keras.layers import Embedding, Input, LSTM, Flatten, Dropout, Dense
from keras.callbacks import EarlyStopping

embedding_layer = Embedding(VOC_SIZE + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LEN,
                            trainable=False)

sequence_input = Input(shape=(MAX_SEQ_LEN,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
lstm = LSTM(units=800, return_sequences=False)(embedded_sequences)
D1 = Dense(100, activation='relu', name='sentence_vector_1')(lstm)
D1 = Dropout(0.5)(D1)
D1 = Dense(20, activation='relu', name='sentence_vector_2')(D1)
D1 = Dropout(0.5)(D1)
preds = Dense(len(label_dict), activation='softmax')(D1)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.summary()

# learn
model.fit(X_token_seq_all[:X_train_cnt], y_all[:X_train_cnt], \
          validation_data=(X_token_seq_all[X_train_cnt:(X_train_cnt+X_valid_cnt)], \
                                     y_all[X_train_cnt:(X_train_cnt+X_valid_cnt)]), \
          callbacks=[EarlyStopping(patience=1, monitor='val_loss')], \
          verbose=1, epochs=20, batch_size=256)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 50, 200)           13471400  
_________________________________________________________________
lstm_4 (LSTM)                (None, 800)               3203200   
_________________________________________________________________
sentence_vector_1 (Dense)    (None, 100)               80100     
_________________________________________________________________
dropout_7 (Dropout)          (None, 100)               0         
_________________________________________________________________
sentence_vector_2 (Dense)    (None, 20)                2020      
_________________________________________________________________
dropout_8 (Dropout)          (None, 20)                0         
__________

## Save model

In [14]:
# Write the trained model to disk under the input/ directory.
model_path = 'input/Baseline_Part1_LSTM_MPL.h5'
model.save(model_path)