In [1]:
import warnings
import numpy as np
import gc
import json
import h5py

import tensorflow as tf
from keras.utils import Sequence
from keras.utils.data_utils import get_file
from keras.preprocessing import sequence
from keras.preprocessing.sequence import _remove_long_seq
from keras.models import Sequential
from keras.layers import Embedding, Input, Dense, Dropout, CuDNNLSTM, Bidirectional
from keras import metrics
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=0.95, allow_growth=False)
config = tf.ConfigProto(intra_op_parallelism_threads=16,
                        inter_op_parallelism_threads=16, allow_soft_placement=True, gpu_options=gpu_options)
session = tf.Session(config=config)
K.set_session(session)


Using TensorFlow backend.


In [2]:
def get_word_index():
    f = open('data/ams_word_index.json')
    data = json.load(f)
    f.close()
    return data


def load_vocab():
    with open('data/ams_word_index.json') as json_data:
        return json.load(json_data)

    
def load_glove():
    glove = {}
    with open('data/glove.model.txt') as glove_data:
        for line in glove_data:
            items = line.split()
            key = items[0]
            glove[key] = np.asarray(items[1:], dtype='float32')
    return glove


def build_embedding_layer(with_input=False, maxlen=480, vocab_dim=300, mask_zero=True):
    print("-- loading word embeddings, this may take a little while...")
    index_dict = load_vocab()
    word_vectors = load_glove()
    # adding 1 to account for 0th index (for masking)
    n_symbols = len(index_dict) + 1
    print("-- known dictionary items: ", n_symbols)
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():
        embedding_weights[index, :] = word_vectors[word]
    print("-- embeddings ")
    if not with_input:
        embedding_layer = Embedding(
            mask_zero=mask_zero,
            output_dim=vocab_dim, input_dim=n_symbols, input_length=maxlen, trainable=False, weights=[embedding_weights])
        return embedding_layer
    else:
        # define inputs here
        input_1 = Input(shape=(maxlen,), dtype='int32')
        embedding_layer = Embedding(
            weights=[embedding_weights],
            mask_zero=mask_zero,
            output_dim=vocab_dim, input_dim=n_symbols, input_length=maxlen, trainable=False)(input_1)
        return (embedding_layer, input_1)


In [3]:
# Input data is obtained via batch loading from HDF5
class DataGenerator(Sequence):
    def __init__(self, file_name, batch_size=1024, data_split=100):
        self.hf = h5py.File(file_name, 'r')
        self.total_len = self.hf["y_train"].shape[0]
        self.batch_size = batch_size
        self.idx = 0
        self.len_segment = int(self.total_len / data_split)
        self.cur_seg_idx = 0
        self.x_cur = self.hf['x_train'][:self.len_segment]
        self.y_cur = self.hf['y_train'][:self.len_segment]

    def data_size(self):
        return self.total_len
        
    def next_seg(self):
        self.cur_seg_idx += self.len_segment
        self.x_cur = self.hf['x_train'][self.cur_seg_idx:self.cur_seg_idx+self.len_segment]
        self.y_cur = self.hf['y_train'][self.cur_seg_idx:self.cur_seg_idx+self.len_segment]
        
    def generate(self):
        while 1:
            idx = self.idx
            if idx >= self.len_segment:
                self.next_seg()
                idx = 0
            
            if idx + self.batch_size >= self.len_segment:
                batch_x = self.x_cur[idx:]
                batch_y = self.y_cur[idx:]
            else:
                batch_x = self.x_cur[idx:(idx + self.batch_size)]
                batch_y = self.y_cur[idx:(idx + self.batch_size)]
            self.idx = idx + self.batch_size
            yield batch_x, batch_y

batch_size = 128
training_generator = DataGenerator("data/full_ams.hdf5", batch_size=batch_size)
train_len = training_generator.data_size()
# model_generator = training_generator.generate()

# x_len = int(train_len / batch_size)


# model.fit_generator(generator=training_generator, 
#                     epochs=1,
#                     steps_per_epoch=x_len, workers=1, 
#                     use_multiprocessing=False, 
#                     verbose=1)

In [18]:
all = training_generator.hf['y_train'][:]
new = all[all > 49]
len(new)

0

In [3]:
# preparing word embeddings
class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
# 08.2018 (subformula lexemes)
# Analyzing the arxiv dataset seems to indicate 
#   a maxlen of 960 is needed to fit 99.2% of the data
#   a maxlen of 480 fits 96.03%, and a maxlen of 300 covers 90.0% of paragraphs
maxlen = 480
embedding_layer = build_embedding_layer(maxlen=maxlen, mask_zero=False)

NameError: name 'y_train' is not defined

In [11]:
# BiLSTM Model Setup
n_classes = 8
layer_size = 128  # ~maxlen // 4

print("-- setting up model layout...")
use_dropout = True

model = Sequential()
model.add(embedding_layer)
if use_dropout:
    model.add(Dropout(0.2))

model.add(Bidirectional(CuDNNLSTM(layer_size, return_sequences=True)))
if use_dropout:
    model.add(Dropout(0.2))

model.add(Bidirectional(CuDNNLSTM(layer_size // 2, return_sequences=True)))
if use_dropout:
    model.add(Dropout(0.1))

model.add(CuDNNLSTM(layer_size // 2))
if use_dropout:
    model.add(Dropout(0.1))

model.add(Dense(n_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer="adam",
              weighted_metrics=[metrics.sparse_categorical_accuracy])

# Print model summary
print(model.summary())

-- setting up model layout...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 480, 300)          300088800 
_________________________________________________________________
dropout_5 (Dropout)          (None, 480, 300)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 480, 256)          440320    
_________________________________________________________________
dropout_6 (Dropout)          (None, 480, 256)          0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 480, 128)          164864    
_________________________________________________________________
dropout_7 (Dropout)          (None, 480, 128)          0         
_________________________________________________________________
cu_dnnlstm_6 (CuDNNLSTM)     (None, 64)       

In [15]:
# Checkpoints: 1) save best model at epoch end, 2) stop early when metric stops improving
checkpoint = ModelCheckpoint(model_file+"-checkpoint.h5",
                             monitor='val_weighted_sparse_categorical_accuracy',
                             verbose=1,
                             save_best_only=True,
                             mode='max')

earlystop = EarlyStopping(monitor='val_weighted_sparse_categorical_accuracy',
                          min_delta=0.001,
                          patience=3,
                          verbose=0, mode='auto')


In [13]:
batch_size = 128
model_file = "bilstm%d_batch%d_cat%d_gpu" % (
    layer_size, batch_size, n_classes)

# Perform training
print('-- training model...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          class_weight=class_weights,
          epochs=50,
          verbose=1,
          callbacks=[checkpoint, earlystop],
          validation_split=0.2)
# serialize model to JSON
print("-- saving model to disk : %s " % model_file)
model.save(model_file+'_notebook.h5')

-- training model...
Train on 1957142 samples, validate on 489286 samples
Epoch 1/50

Epoch 00001: val_weighted_sparse_categorical_accuracy improved from -inf to 0.90934, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 2/50

Epoch 00002: val_weighted_sparse_categorical_accuracy improved from 0.90934 to 0.91820, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 3/50

Epoch 00003: val_weighted_sparse_categorical_accuracy improved from 0.91820 to 0.92188, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 4/50

Epoch 00004: val_weighted_sparse_categorical_accuracy improved from 0.92188 to 0.92429, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 5/50

Epoch 00005: val_weighted_sparse_categorical_accuracy improved from 0.92429 to 0.92460, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 6/50

Epoch 00006: val_weighted_sparse_categorical_accuracy improved from 0.92460 to 0.92572, saving model to bilstm128_batch64_cat8_gpu-c

In [14]:
print("Per-class test measures:")
y_pred = model.predict_classes(x_test, verbose=1, batch_size=batch)
print(classification_report(y_test, y_pred))

Per-class test measures:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9000
           1       0.95      0.95      0.95    200000
           2       0.92      0.95      0.94    200000
           3       0.93      0.94      0.93    141479
           4       0.86      0.75      0.80     51574
           5       0.84      0.76      0.80      3016
           6       0.86      0.82      0.84      6169
           7       0.72      0.80      0.76       370

   micro avg       0.93      0.93      0.93    611608
   macro avg       0.88      0.87      0.88    611608
weighted avg       0.93      0.93      0.93    611608



## Per-class test measures:
```
        611608/611608 [==============================] - 398s 650us/step
                      precision    recall  f1-score   support

acknowledgement       1.00      1.00      1.00      9000
          proof       0.95      0.95      0.95    200000
    proposition       0.92      0.95      0.94    200000
     definition       0.93      0.94      0.93    141479
        example       0.86      0.75      0.80     51574
   introduction       0.84      0.76      0.80      3016
        problem       0.86      0.82      0.84      6169
   related work       0.72      0.80      0.76       370

   micro avg       0.93      0.93      0.93    611608
   macro avg       0.88      0.87      0.88    611608
weighted avg       0.93      0.93      0.93    611608
```

Trained on 2.44 million paragraphs and evaluated on a test set of 0.6 million paragraphs, for a weighted 0.93 F1-score.