In [1]:
# Setup HAN prior running here:
# $ git clone https://github.com/FlorisHoogenboom/keras-han-for-docla
# $ cd keras-han-for-docla
# $ pip install .

import warnings
import numpy as np
import gc
import json
import h5py

import tensorflow as tf
from keras.utils.data_utils import get_file
from keras.utils.np_utils import to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.sequence import _remove_long_seq
from keras.models import Sequential
from keras.layers import Embedding, Input, Dense, Dropout
from keras import metrics
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras_han.model import HAN
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=0.95, allow_growth=False)
config = tf.ConfigProto(intra_op_parallelism_threads=16,
                        inter_op_parallelism_threads=16, allow_soft_placement=True, gpu_options=gpu_options)
session = tf.Session(config=config)
K.set_session(session)


Using TensorFlow backend.


In [2]:
    # For details on this confusion map problem redefinition, see:
    # https://github.com/dginev/arxiv-ams-paragraph-classification/blob/master/EXPERIMENT.md
    # Assumption: final label choice, setup_labels == "confusion-envs-v3":
    confusion_map = {
        # 0: drop,  # abstract, unclear separation with introduction?
        1: 0,  # acknowledgement
        # 2: drop, # algorithm, bad data
        # 3: drop, # POST2: drop, too correlated with proposition  # POST: assumption, # 0.77 assumption but "condition" seems hard to separate
        # 4: drop # caption, bad data
        5: 1,  # case + proof + step
        # 6: drop, # POST2: drop, too correlated with proof, proposition, example  # conclusion + discussion + remark
        # 7: drop,  # POST: condition
        8: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        9: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        10: 3,  # POST: definition
        # 11: drop, #POST2  # conclusion + discussion + remark
        12: 4,  # example
        13: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        14: 5,  # introduction
        15: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        # 16: drop,  # method, too correlated
        # 17: drop,  #POST: notation is too noisy / corellated with proof, drop
        # 18: drop, other class
        # 19: drop, paragraph seems badly separable
        20: 6,  # POST: problem
        21: 1,  # case + proof + step
        22: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
        # 23: drop, POST: question too correlated with definition? # problem + question
        24: 7,  # related work
        # 25: drop, # POST 2  # conclusion + discussion + remark
        # 26: drop,  # result, too correlated
        27: 1,  # case + proof + step
        28: 2,  # lemma + theorem + corollary + proposition + conjecture + fact
    }
    # Classes v1:
    # 0 - acknowledgement
    # 1 - assumption = assumption + condition
    # 2 - proof = case + proof + step
    # 3 - remark = conclusion + discussion + remark
    # 4 - proposition = lemma + theorem + corollary + proposition + conjecture + fact
    # 5 - definition = definition + notation
    # 6 - example
    # 7 - introduction
    # 8 - problem = problem + question
    # 9 - related work
    # drop - abstract + algorithm + caption + method + other + paragraph + result

    # classes v3:
    # 0 - acknowledgement
    # 1 - proof = case + proof + step
    # 2 - proposition = lemma + theorem + corollary + proposition + conjecture + fact
    # 3 - definition
    # 4 - example
    # 5 - introduction
    # 6 - problem
    # 7 - related work


In [3]:
# Data-management before main training
MAX_WORDS_PER_SENT = 20
MAX_SENT = 24

def load_data(path='data/sandbox_ams_1k.npz',  # _1m
              test_split=0.2, seed=521, shuffle=True,
              max_per_class=1_000_000,
              **kwargs):
    """Loads the sandbox of AMS paragraph data, as prepared by `sandbox_data_for_keras.py`
       and adapts it for use by Keras, with additional preprocessing where needed

    # Returns
        List containing train-test split of inputs
        Four arrays: `x_train, y_train, x_test, y_test`.

    Words that were not seen in the training set but are in the test set
    have simply been skipped.
    """

    print('-- loading data...')
    xs, labels = [], []
    with np.load(path) as f:
        xs, labels = f['x'].tolist(), f['y'].tolist()
    
    other_label = len(set(confusion_map.values()))
    label_summary = dict.fromkeys(range(0, 28), 0)
    print("-- reducing to %d label classes" % (other_label))
    iterations = 0
    xs_reduced = []
    labels_reduced = []
    while len(labels) > 0:
        iterations += 1
        x = xs.pop()
        label = labels.pop()
        if iterations % 1_000_000 == 0:
            print("-- %d iterations" % iterations)
        if label in confusion_map:
            mapped_label = confusion_map[label]
            if label_summary[mapped_label] < max_per_class:
                # Also reshape x to a "24 sentences of 20 words" shape
                x = np.array(x).reshape(24,20)
                xs_reduced.append(x)
                labels_reduced.append(mapped_label)
                label_summary[mapped_label] += 1
    print("-- assigning to arrays")
    xs = np.array(xs_reduced)
    xs_reduced = []
    labels = np.array(labels_reduced)
    labels_reduced = []
    gc.collect()
    print("loaded %d paragagraphs and %d labels" % (len(xs), len(labels)))

    if shuffle:
        print("-- shuffling data...")
        np.random.seed(seed)
        indices = np.arange(len(xs))
        np.random.shuffle(indices)

        xs = xs[indices]
        labels = labels[indices]
        gc.collect()

    # Might as well report a summary of what is in the labels...      
    label_summary = {k: v for k, v in label_summary.items() if v > 0}
    print("-- Label summary: ", label_summary)

    print("-- performing train/test cutoff")
    return train_test_split(xs, labels, stratify=labels, test_size=test_split)


def get_word_index():
    f = open('data/ams_word_index.json')
    data = json.load(f)
    f.close()
    return data


def load_vocab():
    with open('data/ams_word_index.json') as json_data:
        return json.load(json_data)

    
def load_glove():
    glove = {}
    with open('data/glove.model.txt') as glove_data:
        for line in glove_data:
            items = line.split()
            key = items[0]
            glove[key] = np.asarray(items[1:], dtype='float32')
    return glove


def build_embedding_matrix(vocab_dim=300):
    print("-- loading word embeddings, this may take a little while...")
    index_dict = load_vocab()
    word_vectors = load_glove()
    # adding 1 to account for 0th index (for masking)
    n_symbols = len(index_dict) + 1
    print("-- known dictionary items: ", n_symbols)
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    embedding_weights[0] = 0
    for word, index in index_dict.items():
        embedding_weights[index, :] = word_vectors[word]
    return embedding_weights

In [4]:
# Eager-Load the data
gc.collect()

x_train, x_test, y_train, y_test = load_data(path="data/sandbox_ams_1m.npz", shuffle=False)

y_train = np.array(y_train)
y_test = np.array(y_test)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

#

-- loading data...
-- reducing to 8 label classes
-- 1000000 iterations
-- 2000000 iterations
-- 3000000 iterations
-- 4000000 iterations
-- 5000000 iterations
-- 6000000 iterations
-- 7000000 iterations
-- assigning to arrays
loaded 3058036 paragagraphs and 3058036 labels
-- Label summary:  {0: 44997, 1: 1000000, 2: 1000000, 3: 707396, 4: 257868, 5: 15081, 6: 30843, 7: 1851}
-- performing train/test cutoff
x_train shape: (2446428, 24, 20)
x_test shape: (611608, 24, 20)
y_train shape: (2446428,)
y_test shape: (611608,)


In [5]:
# preparing word embeddings
class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
# 08.2018 (subformula lexemes)
# Analyzing the arxiv dataset seems to indicate 
#   a maxlen of 960 is needed to fit 99.2% of the data
#   a maxlen of 480 fits 96.03%, and a maxlen of 300 covers 90.0% of paragraphs
embedding_matrix = build_embedding_matrix(vocab_dim=300)

-- loading word embeddings, this may take a little while...
-- known dictionary items:  1000296


In [10]:
# HAN Model Setup
print("-- setting up model layout...")
n_classes = 8

model = HAN(
    MAX_WORDS_PER_SENT, MAX_SENT, n_classes, embedding_matrix,
    word_encoding_dim=300, sentence_encoding_dim=300
)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer="adam",
              weighted_metrics=[metrics.sparse_categorical_accuracy])

# Print model summary
print(model.summary())

-- setting up model layout...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 24, 20)            0         
_________________________________________________________________
word_encoder (TimeDistribute (None, 24, 20, 300)       300494700 
_________________________________________________________________
word_attention (TimeDistribu (None, 24, 300)           30100     
_________________________________________________________________
sentence_encoder (Model)     (None, 24, 300)           405900    
_________________________________________________________________
sentence_attention (Attentio (None, 300)               30100     
_________________________________________________________________
class_prediction (Dense)     (None, 8)                 2408      
Total params: 300,963,208
Trainable params: 874,408
Non-trainable params: 300,088,800
__________________________

In [11]:
batch_size = 64
model_file = "han_batch%d_cat%d_gpu" % (
    batch_size, n_classes)

# Checkpoints: 1) save best model at epoch end, 2) stop early when metric stops improving
earlystop = EarlyStopping(monitor='val_weighted_sparse_categorical_accuracy',
                          min_delta=0.001,
                          patience=3,
                          verbose=0, mode='auto')

In [12]:
# Perform training
print('-- training model...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          class_weight=class_weights,
          epochs=50,
          verbose=1,
          callbacks=[earlystop],
          validation_split=0.2)


-- training model...
Train on 1957142 samples, validate on 489286 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


<keras.callbacks.History at 0x7f3cc46f1b70>

In [13]:
# save model weights
# CAUTION: a regular model .save, including a checkpoint, seemed to produce an error??
#          .save_weights works better...
print("-- saving model weights to disk : %s " % model_file)
model.save_weights(model_file+'_notebook.h5')

-- saving model weights to disk : han_batch64_cat8_gpu 


In [14]:
print("Per-class test measures:")
y_prob = model.predict(x_test, batch_size=batch_size, verbose=1)
y_pred = y_prob.argmax(axis=-1)

print(classification_report(y_test, y_pred))


Per-class test measures:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      9000
           1       0.93      0.94      0.94    200000
           2       0.91      0.94      0.92    200000
           3       0.93      0.91      0.92    141479
           4       0.82      0.73      0.77     51574
           5       0.78      0.75      0.76      3016
           6       0.83      0.78      0.80      6169
           7       0.60      0.83      0.70       370

   micro avg       0.91      0.91      0.91    611608
   macro avg       0.85      0.86      0.85    611608
weighted avg       0.91      0.91      0.91    611608



Note that the data of 480 max-words per paragraph is reshaped to a fixed 24 sentences of 20 words each (padded with 0 for missing entries), and hence does *not* use the real sentence breaks from tokenization - a possible direction for improvement.

## Per-class test measures, model han_batch64_cat8_gpu (dim 300):

 - With 1k sandbox: F1 0.83 on test data, but training data was fully overfit (100% accuracy).
 - With 1 million class-max, 2.44 million training paragraphs, 0.6 million test paragraphs.
   - weighted F1-score of 0.91 on test data.
   - (training validation at accuracy 0.9145, after 6 epochs)

```
Per-class test measures:
611608/611608 [==============================] - 334s 547us/step
                 precision    recall  f1-score   support

acknowledgement       1.00      0.99      1.00      9000
          proof       0.93      0.94      0.94    200000
    proposition       0.91      0.94      0.92    200000
     definition       0.93      0.91      0.92    141479
        example       0.82      0.73      0.77     51574
   introduction       0.78      0.75      0.76      3016
        problem       0.83      0.78      0.80      6169
   related work       0.60      0.83      0.70       370

   micro avg       0.91      0.91      0.91    611608
   macro avg       0.85      0.86      0.85    611608
weighted avg       0.91      0.91      0.91    611608

```
     