In [1]:
import re
import json
import numpy as np
from pathlib import Path

# running with installed tensorflow-gpu 2.3.0, need compat to v1
import tensorflow.compat.v1 as tf
import keras
# GPU memory settings for this process: memory fraction 0.95, allow_growth disabled.
# presumably this lets the notebook claim most of the card up front — confirm
# against the TensorFlow GPUOptions documentation.
gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=0.95, allow_growth=False)
# Session configuration: 16 threads each for intra-op and inter-op parallelism,
# soft device placement enabled, plus the GPU options defined above.
config = tf.ConfigProto(intra_op_parallelism_threads=16,
                        inter_op_parallelism_threads=16, allow_soft_placement=True, gpu_options=gpu_options)
# Create a TF1-style session with that configuration and register it with the
# tf.keras backend.
# NOTE(review): the classifier is later loaded through the standalone `keras`
# import, not `tf.keras` — confirm both share this session in the installed versions.
session = tf.Session(config=config)
tf.keras.backend.set_session(session)

In [2]:
# Directory holding the MathOverflow dump, relative to the working directory.
data_dir = Path('../data/mathoverflow')
# Name the encoding explicitly: JSON text is UTF-8, and relying on the
# platform's locale default can mis-decode non-ASCII content.
# Later cells iterate over `data` and read each item's 'text' field.
with open(data_dir / 'mathoverflow.json', encoding='utf-8') as json_file:
    data = json.load(json_file)


In [3]:
# Reusing the artifacts (vocabulary and trained model) published with the
# original statement classification paper at:
# https://github.com/dginev/arxiv-statement-classification
# via a local checkout under the home directory for now:
statement_repo_path = Path.home() / "git/arxiv-statement-classification"

# Load the indexed vocabulary compatible with the model.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open(statement_repo_path / 'data/word_index.json', encoding='utf-8') as json_data:
    word_index = json.load(json_data)
    
def embed_text(text, maxlen=480, vocab=None):
    """Encode a paragraph as one fixed-width row of vocabulary indices.

    The text is lowercased and split on runs of non-word characters
    (hyphens included). Tokens absent from the vocabulary are skipped; the
    first `maxlen` known tokens are kept in order and the remainder of the
    row is right-padded with zeros.

    Args:
        text: Paragraph to encode.
        maxlen: Number of columns in the returned row (default 480).
        vocab: Optional mapping from token to integer index. When omitted,
            the notebook-level `word_index` is used.

    Returns:
        An integer numpy array of shape (1, maxlen).
    """
    if vocab is None:
        vocab = word_index
    indexed = []
    # Raw string: `\W` is a regex escape, not a Python string escape.
    for w in re.split(r'[-\W]+', text.lower()):
        # Check the cap before appending so maxlen=0 yields an empty row.
        if len(indexed) >= maxlen:
            break
        w_ind = vocab.get(w)
        if w_ind is not None:
            indexed.append(w_ind)
    # A pre-zeroed integer buffer provides the padding and keeps the dtype
    # integral even when no token was recognised.
    row = np.zeros((1, maxlen), dtype=int)
    if indexed:
        row[0, :len(indexed)] = indexed
    return row


In [4]:
# Statement class names; a label's position in this list is the class id
# used to look it up from the model's prediction vector.
index_to_label = [
    "abstract",
    "acknowledgement",
    "conclusion",
    "definition",
    "example",
    "introduction",
    "keywords",
    "problem",
    "proof",
    "proposition",
    "related_work",
    "remark",
    "result",
]

In [5]:
# Path to the saved classifier inside the local repository checkout.
# (The file name suggests a 128-unit BiLSTM, batch size 128, 13 categories — TODO confirm.)
statement_model_path = statement_repo_path / "models/confusion_bilstm128_batch128_cat13_gpu_notebook.h5"
# Load the saved Keras model from the .h5 file; `model` is used for prediction below.
model = keras.models.load_model(statement_model_path)

In [6]:
batch_size = 128
# One row of 480 vocabulary indices per paragraph. Every row is overwritten
# in the loop below, so the buffer is allocated without initialisation.
# This approach needs ~8 GB of RAM for the MathOverflow data
embedded = np.empty((len(data), 480), dtype=int)
for index, datum in enumerate(data):
    embedded[index, :] = embed_text(datum['text'])
print("embedded input shape: ", embedded.shape)

# ~2 minutes of runtime to classify 270k paragraphs on a 1080 TI card.
predictions = model.predict(embedded, batch_size=batch_size, verbose=1)

embedded input shape:  (266441, 480)


In [13]:
# Annotate every paragraph record with its highest-scoring class and that
# class's score. The records are mutated in place, so no write-back into
# `data` is needed. The score is stored as a string.
top_label_ids = np.argmax(predictions, axis=-1)
for datum, pred, pred_label_id in zip(data, predictions, top_label_ids):
    datum['statement_label'] = index_to_label[pred_label_id]
    datum['statement_confidence'] = str(pred[pred_label_id])

In [15]:
# Write the annotated records next to the input dump.
with (data_dir / 'mathoverflow_statements.json').open("w") as json_out_file:
    json.dump(data, json_out_file)