In [1]:
import numpy as np
import os
import json_lines
import codecs
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

from sklearn.model_selection import train_test_split

from keras.layers import *
from keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Directory holding the pretrained BERT files. The config and checkpoint paths
# are handed to the BERT loader, the vocab path to the Tokeniser.
pretrained_path = 'pretrained/uncased_L-12_H-768_A-12'
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [3]:
# --- Input shape -------------------------------------------------------------
SEQ_LEN = 128     # passed as seq_len to the BERT loader and as max_len to load_data()

# --- Optimisation ------------------------------------------------------------
BATCH_SIZE = 25   # used by load_data() truncation, model.fit and model.predict
EPOCHS = 5        # epochs argument of model.fit
LR = 5e-6         # learning rate given to Adam in get_model()

In [4]:
# Build the BERT network from the pretrained config and checkpoint files.
# get_model() later looks up the 'NSP-Dense' layer of this model by name;
# presumably training=True is what keeps that pretraining head in the graph and
# trainable=True leaves the weights unfrozen for fine-tuning — TODO confirm
# against the keras_bert documentation.
bert_model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
    trainable=True,
    seq_len=SEQ_LEN,  # same value model_train() passes to load_data() as max_len
)

In [5]:
class Tokeniser(Tokenizer):
    """``Tokenizer`` subclass that builds its token dictionary from a vocab file."""

    def __init__(self, dict_path):
        """Read the vocabulary at ``dict_path`` and initialise the base class.

        Args:
            dict_path: Path to a UTF-8 text file; every line, stripped of
                surrounding whitespace, is treated as one token.
        """
        vocabulary = {}
        with codecs.open(dict_path, 'r', 'utf8') as vocab_file:
            for raw_line in vocab_file:
                # A token's id is the dictionary size at the moment it is stored.
                vocabulary[raw_line.strip()] = len(vocabulary)
        super().__init__(vocabulary)

def load_data(tokenizer: Tokeniser, file_path,
    text_label='trans_en', target_label='voted_up', max_len=100, batch_size=20,
    seed=None):
    """Read a JSON-lines file and return shuffled token ids with their labels.

    Every record's ``text_label`` field is lower-cased and encoded with
    ``tokenizer.encode(..., max_len=max_len)``; only the token ids are kept.
    The ``target_label`` field is converted with ``int()``. The examples are
    shuffled and then trimmed from the end so their count is a multiple of
    ``batch_size``.

    Args:
        tokenizer: Object whose ``encode(text, max_len=...)`` returns a
            ``(ids, segments)`` pair.
        file_path: Path of the JSON-lines file to read.
        text_label: Record key holding the text.
        target_label: Record key holding the label.
        max_len: Forwarded to ``tokenizer.encode``.
        batch_size: The number of returned examples is a multiple of this.
        seed: Optional seed for a private ``RandomState``. With the default
            ``None`` the global ``np.random`` state is used, as before.

    Returns:
        ``(indices, sentiments)`` as two numpy arrays. Both are empty when the
        file contains no records.
    """
    token_ids, labels = [], []
    with open(file_path, 'rb') as f:
        for item in json_lines.reader(f):
            ids, _segments = tokenizer.encode(item[text_label].lower(), max_len=max_len)
            token_ids.append(ids)
            labels.append(int(item[target_label]))

    if not token_ids:
        # zip(*[]) yields nothing, so the two-name unpack below would raise a
        # confusing ValueError; return well-formed empty arrays instead.
        return np.empty((0, max_len or 0), dtype=int), np.empty((0,), dtype=int)

    items = list(zip(token_ids, labels))
    # Shuffle ids and labels together so the pairs stay aligned.
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(items)
    token_ids, labels = zip(*items)

    indices = np.array(token_ids)
    sentiments = labels
    mod = indices.shape[0] % batch_size
    if mod > 0:
        # Drop the trailing remainder so only whole batches are left.
        indices, sentiments = indices[:-mod], sentiments[:-mod]
    return indices, np.array(sentiments)

In [6]:
def get_model():
    """Build and compile a two-class softmax classifier on top of ``bert_model``.

    The classifier uses the first two inputs of ``bert_model`` and feeds the
    output of its 'NSP-Dense' layer into a new 2-unit softmax layer.

    Note:
        Every call builds on the same module-level ``bert_model``, so models
        returned by separate calls share those BERT layers and their weights.

    Returns:
        The compiled Keras ``Model``.
    """
    pooled = bert_model.get_layer('NSP-Dense').output
    class_probabilities = Dense(units=2, activation='softmax')(pooled)

    classifier = Model(bert_model.inputs[:2], class_probabilities)
    classifier.compile(
        optimizer=Adam(LR),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )
    return classifier

In [None]:
# Notebook-wide tokenizer built from the pretrained vocabulary file;
# model_train() passes it to load_data().
tokeniser = Tokeniser(vocab_path)

In [None]:
def model_train(model, target_label='voted_up', model_name='bert_voted_up'):
    """Fine-tune ``model`` on one label of the review data and save its weights.

    Loads 'data/reviews_112_trans-en.jl' with the notebook-level ``tokeniser``,
    sets 10% aside with ``train_test_split`` and fits on the remaining 90%,
    asking Keras for a further ``validation_split=0.1``. Two files are written
    under 'result/':

    * ``<model_name>.best.h5`` — saved by ``ModelCheckpoint`` whenever the
      monitored validation accuracy improves.
    * ``<model_name>.h5`` — the weights after the last epoch.

    Args:
        model: Compiled Keras model taking two inputs (see ``get_model``).
        target_label: Record key used as the label.
        model_name: Base name of the files written under 'result/'.
    """
    X, y = load_data(tokeniser, 'data/reviews_112_trans-en.jl',
        target_label=target_label, max_len=SEQ_LEN, batch_size=BATCH_SIZE)
    # The held-out 10% is not used inside this function; underscore names make
    # that explicit.
    X_train, _X_test, y_train, _y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Make sure the output directory exists before anything is saved into it.
    os.makedirs("result", exist_ok=True)
    mcp_save = ModelCheckpoint("result/"+model_name+'.best.h5',
        save_best_only=True, monitor='val_sparse_categorical_accuracy', mode='max')
    model.fit(
        # Second input is an all-zero array with the same shape as the token ids.
        [X_train, np.zeros_like(X_train)],
        y_train,
        epochs=EPOCHS,
        validation_split=0.1,
        batch_size=BATCH_SIZE,
        callbacks=[EarlyStopping(monitor='val_loss', patience=4), mcp_save]
    )
    model.save_weights("result/"+model_name+".h5")

# BERT Voted up

In [7]:
# Build the classifier and fine-tune it on the 'voted_up' label.
model = get_model()
model_train(model, target_label='voted_up', model_name='bert_voted_up')

Evaluation

In [None]:
import json
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [None]:
def plot_roc(pred, y):
    """Draw the ROC curve of scores ``pred`` against true labels ``y``.

    The curve is drawn in red with a dashed black diagonal for reference, and
    the AUC is shown in the title.

    Args:
        pred: Scores passed to ``roc_curve`` / ``roc_auc_score`` as the
            second argument.
        y: True labels passed as the first argument.

    Returns:
        The matplotlib figure containing the plot.
    """
    fpr, tpr, _thresholds = roc_curve(y, pred)
    try:
        auc = roc_auc_score(y, pred)
    except ValueError:
        # roc_auc_score raised for this input; show a placeholder in the title.
        auc = "undefined"

    fig, ax = plt.subplots(1, figsize=(8,8))
    ax.plot(fpr, tpr, color='red')
    ax.plot([0,1], [0,1], color='black', linestyle='--')
    # Label the axes so the figure can be read on its own.
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.set_title(f"AUC: {auc}")
    return fig

In [None]:
# Load the weights stored in the 'best' checkpoint file of the voted_up run
# (presumably the file written by the ModelCheckpoint callback in
# model_train — verify the path matches).
model.load_weights("result/bert_voted_up.best.h5")

In [None]:
# Load the data in this cell: X, y and get_X_array are not defined anywhere at
# notebook level, so the cell would fail on a fresh kernel.
# NOTE: this is the whole data set, so it includes examples used for training.
X, y = load_data(tokeniser, 'data/reviews_112_trans-en.jl',
    target_label='voted_up', max_len=SEQ_LEN, batch_size=BATCH_SIZE)
# Same two-input layout as model.fit in model_train: token ids plus an
# all-zero array of the same shape.
preds = model.predict([X, np.zeros_like(X)], verbose=True, batch_size=BATCH_SIZE)
result_max = preds.argmax(axis=-1)
# labels=[0, 1] keeps the matrix 2x2 even when one class never occurs, so the
# four-way unpack cannot fail.
tn, fp, fn, tp = confusion_matrix(y, result_max, labels=[0, 1]).ravel()

print('Confusion matrix:')
print('[{}, {}]'.format(tp, fp))
print('[{}, {}]'.format(fn, tn))

In [None]:
# Accuracy of the arg-max predictions against the labels, to four decimals.
print('Accuracy: %.4f' % accuracy_score(y, result_max))

In [None]:
# ROC curve from the second output column (the score for class 1).
plot_roc(pred=preds[:, 1], y=y)

# BERT Early access

Training process

In [None]:
# Build a classifier and fine-tune it on the 'early_access' label.
model = get_model()
model_train(model, target_label='early_access', model_name='bert_early_access')

In [None]:
# Load the weights stored in the 'best' checkpoint file of the early_access run
# (presumably the file written by the ModelCheckpoint callback in
# model_train — verify the path matches).
model.load_weights("result/bert_early_access.best.h5")

In [None]:
# Load the data in this cell: X, y and get_X_array are not defined anywhere at
# notebook level, so the cell would fail on a fresh kernel.
# NOTE: this is the whole data set, so it includes examples used for training.
X, y = load_data(tokeniser, 'data/reviews_112_trans-en.jl',
    target_label='early_access', max_len=SEQ_LEN, batch_size=BATCH_SIZE)
# Same two-input layout as model.fit in model_train: token ids plus an
# all-zero array of the same shape.
preds = model.predict([X, np.zeros_like(X)], verbose=True, batch_size=BATCH_SIZE)
result_max = preds.argmax(axis=-1)
# labels=[0, 1] keeps the matrix 2x2 even when one class never occurs, so the
# four-way unpack cannot fail.
tn, fp, fn, tp = confusion_matrix(y, result_max, labels=[0, 1]).ravel()

print('Confusion matrix:')
print('[{}, {}]'.format(tp, fp))
print('[{}, {}]'.format(fn, tn))

In [None]:
# Accuracy of the arg-max predictions against the labels, to four decimals.
print('Accuracy: %.4f' % accuracy_score(y, result_max))

In [None]:
# ROC curve from the second output column (the score for class 1).
plot_roc(pred=preds[:, 1], y=y)