In [1]:
import warnings
import numpy as np
import gc
import json
import h5py

import tensorflow as tf
from keras.utils.data_utils import get_file
from keras.preprocessing import sequence
from keras.preprocessing.sequence import _remove_long_seq
from keras.models import Sequential
from keras.layers import Embedding, Input, Dense, Dropout, CuDNNLSTM, Bidirectional
from keras import metrics
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=0.95, allow_growth=False)
config = tf.ConfigProto(intra_op_parallelism_threads=16,
                        inter_op_parallelism_threads=16, allow_soft_placement=True, gpu_options=gpu_options)
session = tf.Session(config=config)
K.set_session(session)


Using TensorFlow backend.


In [2]:
    # Assumption: final label choice, setup_labels == "confusion-envs-v3":
    confusion_map = {
        # 0: drop,  # abstract, unclear separation with introduction?
        1: 0,  # acknowledgement
        # 2: drop, # algorithm, bad data
        # 3: drop, # POST2: drop, too correlated with proposition  # POST: assumption, # 0.77 assumption but "condition" seems hard to separate
        # 4: drop # caption, bad data
        5: 1,  # case + proof + step
        # 6: drop, # POST2: drop, too correlated with proof, proposition, example  # conclusion + discussion + remark
        # 7: drop,  # POST: condition
        8: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        9: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        10: 3,  # POST: definition
        # 11: drop, #POST2  # conclusion + discussion + remark
        12: 4,  # example
        13: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        14: 5,  # introduction
        15: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        # 16: drop,  # method, too correlated
        # 17: drop,  #POST: notation is too noisy / corellated with proof, drop
        # 18: drop, other class
        # 19: drop, paragraph seems badly separable
        20: 6,  # POST: problem
        21: 1,  # case + proof + step
        22: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        # 23: drop, POST: question too correlated with definition? # problem + question
        24: 7,  # related work
        # 25: drop, # POST 2  # conclusion + discussion + remark
        # 26: drop,  # result, too correlated
        27: 1,  # case + proof + step
        28: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
    }
    # Classes v1:
    # 0 - acknowledgement
    # 1 - assumption = assumption + condition
    # 2 - proof = case + proof + step
    # 3 - remark = conclusion + discussion + remark
    # 4 - proposition = lemma + theorem + corollary + proposition + conjecture + fact
    # 5 - definition = definition + notation
    # 6 - example
    # 7 - introduction
    # 8 - problem = problem + question
    # 9 - related work
    # drop - abstract + algorithm + caption + method + other + paragraph + result

    # classes v3:
    # 0 - acknowledgement
    # 1 - proof = case + proof + step
    # 2 - proposition = lemma + theorem + corollary + proposition + conjecture + fact
    # 3 - definition
    # 4 - example
    # 5 - introduction
    # 6 - problem
    # 7 - related work


In [3]:
# Data-management before main training
def load_data(path='data/sandbox_ams_1k.npz',  # _1m
              test_split=0.2, seed=521, shuffle=True,
              max_per_class=1_000_000,
              **kwargs):
    """Loads the sandbox of AMS paragraph data, as prepared by `sandbox_data_for_keras.py`
       and adapts it for use by Keras, with additional preprocessing where needed

    # Returns
        List containing train-test split of inputs
        Four arrays: `x_train, y_train, x_test, y_test`.

    Words that were not seen in the training set but are in the test set
    have simply been skipped.
    """

    print('-- loading data...')
    xs, labels = [], []
    with np.load(path) as f:
        xs, labels = f['x'].tolist(), f['y'].tolist()
    
    other_label = len(set(confusion_map.values()))
    label_summary = dict.fromkeys(range(0, 28), 0)
    print("-- reducing to %d label classes" % (other_label))
    iterations = 0
    xs_reduced = []
    labels_reduced = []
    while len(labels) > 0:
        iterations += 1
        x = xs.pop()
        label = labels.pop()
        if iterations % 1_000_000 == 0:
            print("-- %d iterations" % iterations)
        if label in confusion_map:
            mapped_label = confusion_map[label]
            if label_summary[mapped_label] < max_per_class:
                xs_reduced.append(x)
                labels_reduced.append(mapped_label)
                label_summary[mapped_label] += 1
    print("-- assigning to arrays")
    xs = np.array(xs_reduced)
    xs_reduced = []
    labels = np.array(labels_reduced)
    labels_reduced = []
    gc.collect()
    print("loaded %d paragagraphs and %d labels" % (len(xs), len(labels)))

    if shuffle:
        print("-- shuffling data...")
        np.random.seed(seed)
        indices = np.arange(len(xs))
        np.random.shuffle(indices)

        xs = xs[indices]
        labels = labels[indices]
        gc.collect()

    # Might as well report a summary of what is in the labels...      
    label_summary = {k: v for k, v in label_summary.items() if v > 0}
    print("-- Label summary: ", label_summary)

    print("-- performing train/test cutoff")
    return train_test_split(xs, labels, stratify=labels, test_size=test_split)


def get_word_index():
    f = open('data/ams_word_index.json')
    data = json.load(f)
    f.close()
    return data


def load_vocab():
    with open('data/ams_word_index.json') as json_data:
        return json.load(json_data)

    
def load_glove():
    glove = {}
    with open('data/glove.model.txt') as glove_data:
        for line in glove_data:
            items = line.split()
            key = items[0]
            glove[key] = np.asarray(items[1:], dtype='float32')
    return glove


def build_embedding_layer(with_input=False, maxlen=256, vocab_dim=300, mask_zero=True):
    print("-- loading word embeddings, this may take a little while...")
    index_dict = load_vocab()
    word_vectors = load_glove()
    # adding 1 to account for 0th index (for masking)
    n_symbols = len(index_dict) + 1
    print("-- known dictionary items: ", n_symbols)
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():
        embedding_weights[index, :] = word_vectors[word]
    print("-- embeddings ")
    if not with_input:
        embedding_layer = Embedding(
            mask_zero=mask_zero,
            output_dim=vocab_dim, input_dim=n_symbols, input_length=maxlen, trainable=False, weights=[embedding_weights])
        return embedding_layer
    else:
        # define inputs here
        input_1 = Input(shape=(maxlen,), dtype='int32')
        embedding_layer = Embedding(
            weights=[embedding_weights],
            mask_zero=mask_zero,
            output_dim=vocab_dim, input_dim=n_symbols, input_length=maxlen, trainable=False)(input_1)
        return (embedding_layer, input_1)


In [4]:
# Eager-Load the data
gc.collect()

x_train, x_test, y_train, y_test = load_data(path="data/sandbox_ams_1m.npz", shuffle=False)

y_train = np.array(y_train)
y_test = np.array(y_test)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

#

-- loading data...
-- reducing to 8 label classes
-- 1000000 iterations
-- 2000000 iterations
-- 3000000 iterations
-- 4000000 iterations
-- 5000000 iterations
-- 6000000 iterations
-- 7000000 iterations
-- assigning to arrays
loaded 3058036 paragagraphs and 3058036 labels
-- Label summary:  {0: 44997, 1: 1000000, 2: 1000000, 3: 707396, 4: 257868, 5: 15081, 6: 30843, 7: 1851}
-- preparing sets...
-- performing train/test cutoff
x_train shape: (2446428, 480)
x_test shape: (611608, 480)
y_train shape: (2446428,)
y_test shape: (611608,)


In [7]:
# preparing word embeddings
class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
# 08.2018 (subformula lexemes)
# Analyzing the arxiv dataset seems to indicate 
#   a maxlen of 960 is needed to fit 99.2% of the data
#   a maxlen of 480 fits 96.03%, and a maxlen of 300 covers 90.0% of paragraphs
maxlen = 480
embedding_layer = build_embedding_layer(maxlen=maxlen, mask_zero=False)

-- loading word embeddings, this may take a little while...
-- known dictionary items:  1000296
-- embeddings 


In [11]:
# BiLSTM Model Setup
n_classes = 8
layer_size = 128  # ~maxlen // 4

print("-- setting up model layout...")
use_dropout = True

model = Sequential()
model.add(embedding_layer)
if use_dropout:
    model.add(Dropout(0.2))

model.add(Bidirectional(CuDNNLSTM(layer_size, return_sequences=True)))
if use_dropout:
    model.add(Dropout(0.2))

model.add(Bidirectional(CuDNNLSTM(layer_size // 2, return_sequences=True)))
if use_dropout:
    model.add(Dropout(0.1))

model.add(CuDNNLSTM(layer_size // 2))
if use_dropout:
    model.add(Dropout(0.1))

model.add(Dense(n_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer="adam",
              weighted_metrics=[metrics.sparse_categorical_accuracy])

# Print model summary
print(model.summary())

-- setting up model layout...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 480, 300)          300088800 
_________________________________________________________________
dropout_5 (Dropout)          (None, 480, 300)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 480, 256)          440320    
_________________________________________________________________
dropout_6 (Dropout)          (None, 480, 256)          0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 480, 128)          164864    
_________________________________________________________________
dropout_7 (Dropout)          (None, 480, 128)          0         
_________________________________________________________________
cu_dnnlstm_6 (CuDNNLSTM)     (None, 64)       

In [15]:
# Checkpoints: 1) save best model at epoch end, 2) stop early when metric stops improving
checkpoint = ModelCheckpoint(model_file+"-checkpoint.h5",
                             monitor='val_weighted_sparse_categorical_accuracy',
                             verbose=1,
                             save_best_only=True,
                             mode='max')

earlystop = EarlyStopping(monitor='val_weighted_sparse_categorical_accuracy',
                          min_delta=0.001,
                          patience=3,
                          verbose=0, mode='auto')


In [13]:
batch_size = 128
model_file = "bilstm%d_batch%d_cat%d_gpu" % (
    layer_size, batch_size, n_classes)

# Perform training
print('-- training model...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          class_weight=class_weights,
          epochs=50,
          verbose=1,
          callbacks=[checkpoint, earlystop],
          validation_split=0.2)
# serialize model to JSON
print("-- saving model to disk : %s " % model_file)
model.save(model_file+'_notebook.h5')

-- training model...
Train on 1957142 samples, validate on 489286 samples
Epoch 1/50

Epoch 00001: val_weighted_sparse_categorical_accuracy improved from -inf to 0.90934, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 2/50

Epoch 00002: val_weighted_sparse_categorical_accuracy improved from 0.90934 to 0.91820, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 3/50

Epoch 00003: val_weighted_sparse_categorical_accuracy improved from 0.91820 to 0.92188, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 4/50

Epoch 00004: val_weighted_sparse_categorical_accuracy improved from 0.92188 to 0.92429, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 5/50

Epoch 00005: val_weighted_sparse_categorical_accuracy improved from 0.92429 to 0.92460, saving model to bilstm128_batch64_cat8_gpu-checkpoint.h5
Epoch 6/50

Epoch 00006: val_weighted_sparse_categorical_accuracy improved from 0.92460 to 0.92572, saving model to bilstm128_batch64_cat8_gpu-c

In [14]:
print("Per-class test measures:")
y_pred = model.predict_classes(x_test, verbose=1, batch_size=batch)
print(classification_report(y_test, y_pred))

Per-class test measures:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9000
           1       0.95      0.95      0.95    200000
           2       0.92      0.95      0.94    200000
           3       0.93      0.94      0.93    141479
           4       0.86      0.75      0.80     51574
           5       0.84      0.76      0.80      3016
           6       0.86      0.82      0.84      6169
           7       0.72      0.80      0.76       370

   micro avg       0.93      0.93      0.93    611608
   macro avg       0.88      0.87      0.88    611608
weighted avg       0.93      0.93      0.93    611608



## Per-class test measures:
```
        611608/611608 [==============================] - 398s 650us/step
                      precision    recall  f1-score   support

acknowledgement       1.00      1.00      1.00      9000
          proof       0.95      0.95      0.95    200000
    proposition       0.92      0.95      0.94    200000
     definition       0.93      0.94      0.93    141479
        example       0.86      0.75      0.80     51574
   introduction       0.84      0.76      0.80      3016
        problem       0.86      0.82      0.84      6169
   related work       0.72      0.80      0.76       370

   micro avg       0.93      0.93      0.93    611608
   macro avg       0.88      0.87      0.88    611608
weighted avg       0.93      0.93      0.93    611608
```

Trained on 2.44 million paragraphs and evaluated on a test set of 0.6 million paragraphs, for a weighted 0.93 F1-score.