In [1]:
from bert_serving.client import BertClient
import numpy as np
import pandas as pd

In [2]:
from tqdm import tqdm
from preprocess import read_articles_from_file_list, read_predictions_from_file, label2index
# Address of the BERT serving instance this notebook's client talks to.
BERT_SERVER_IP = '10.2.1.51'
bc = BertClient(ip=BERT_SERVER_IP)

Using TensorFlow backend.


In [3]:
# Encode a few sample sentences; `a` is not used by any later cell shown here
# (presumably a quick check that the client can reach the server).
sample_sentences = ['First do it', 'then do it right', 'then do it better']
a = bc.encode(sample_sentences)

In [4]:
# Locations of the task data and output files (relative paths).
train_folder = "datasets/train-articles"  # check that the path to the datasets folder is correct,
dev_folder = "datasets/dev-articles"  # if not adjust these variables accordingly
train_labels_file = "datasets/train-task2-TC.labels"  # read below into article ids, span starts/ends and gold labels
dev_template_labels_file = "datasets/dev-task-TC-template.out"  # dev spans for which predictions are written
# NOTE(review): this value is reassigned in the prediction cell before anything reads it.
task_TC_output_file = "baseline-output-TC.txt"

In [5]:
def clean_text(text):
    """Normalise quotes, dashes and spacing in a piece of text.

    Typographic quotes are mapped to ASCII quotes, every quote character is
    padded with spaces, long dashes become a spaced hyphen, the ellipsis
    character becomes three dots and runs of spaces are collapsed to one.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string, with leading and trailing whitespace stripped.
    """
    # Order matters: curly quotes are first turned into padded ASCII quotes,
    # and the ASCII quotes themselves are padded afterwards.
    replacements = (
        ('‘', " ' "),
        ('’', " ' "),
        ('“', ' " '),
        ('”', ' " '),
        # NOTE(review): presumably a remnant of mis-decoded UTF-8 quotes; this
        # also rewrites a genuine 'â' character -- confirm this is intended.
        ('â', " ' "),
        ('"', ' " '),
        ("'", " ' "),
        ('—', ' - '),
        ('–', ' - '),
        ('…', '...'),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # The padding above can create runs of three or more spaces (e.g. around
    # adjacent quotes). A single replace('  ', ' ') pass only halves such
    # runs, so keep collapsing until no double space is left.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text.strip()

In [6]:
# Article collections for both splits (later indexed by article id).
articles = read_articles_from_file_list(train_folder)
dev_articles = read_articles_from_file_list(dev_folder)

# Training annotations: article id, span start, span end and gold label per span.
train_annotations = read_predictions_from_file(train_labels_file)
ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels = train_annotations

# Dev template: the spans for which predictions are written at the end.
dev_annotations = read_predictions_from_file(dev_template_labels_file)
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = dev_annotations

annotation_count = len(ref_span_starts)
article_count = len(set(ref_articles_id))
print("Loaded %d annotations from %d articles" % (annotation_count, article_count))


Loaded 6129 annotations from 357 articles


In [7]:
def compute_features(articles, ref_articles_id, span_starts, span_ends):
    """Extract and clean the text of every annotated span.

    Despite the name, no numeric features are computed: the result is the
    cleaned span text itself.

    Args:
        articles: collection indexable by article id that yields the article
            content (sliced with the span offsets; presumably a str).
        ref_articles_id: article id of each annotation.
        span_starts: start offset of each annotation (any value int() accepts).
        span_ends: end offset of each annotation, used as the exclusive slice
            bound (any value int() accepts).

    Returns:
        A list with one cleaned span string per entry of ``ref_articles_id``,
        in the same order.
    """
    # Report the container type and size of the offsets.
    print(type(span_starts), len(span_starts))
    print(type(span_ends), len(span_ends))

    article_spans = []
    for i, ref_id in tqdm(enumerate(ref_articles_id)):
        article = articles[ref_id]
        start, end = int(span_starts[i]), int(span_ends[i])
        article_spans.append(clean_text(article[start:end]))

    return article_spans

In [8]:
# Replace the article collections with the cleaned annotation spans; these
# lists are what gets encoded by the BERT client further down.
# NOTE(review): this rebinds `articles` / `dev_articles` from collections
# indexed by article id to plain lists of spans, so the cell is not
# idempotent -- re-run the loading cell before running this one again.
articles = compute_features(articles, ref_articles_id, ref_span_starts, ref_span_ends)
dev_articles = compute_features(dev_articles, dev_article_ids, dev_span_starts, dev_span_ends)

6129it [00:00, 321448.62it/s]
1063it [00:00, 275951.30it/s]

<class 'list'> 6129
<class 'list'> 6129
<class 'list'> 1063
<class 'list'> 1063





In [9]:
from keras.utils import to_categorical
from pprint import pprint
import pickle
from preprocess import label2index, index2label

In [10]:
# True: encode the spans with the BERT client again and overwrite the pickled cache.
# False: load the previously pickled embeddings and labels from disk.
GET_NEW_BERT_EMB = False

In [11]:
data_path = './bert_processed_data/'


def _save_pickle(obj, filename):
    """Pickle ``obj`` to ``data_path + filename`` and close the file."""
    with open(data_path + filename, 'wb') as fout:
        pickle.dump(obj, fout)


def _load_pickle(filename):
    """Unpickle and return the object stored at ``data_path + filename``."""
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(data_path + filename, 'rb') as fin:
        return pickle.load(fin)


if GET_NEW_BERT_EMB:
    # Encode the cleaned spans with the BERT client.
    articles_emb = bc.encode(articles)
    dev_articles_emb = bc.encode(dev_articles)
    pprint(set(train_gold_labels))

    # Gold labels -> class indices -> one-hot rows.
    labels = [label2index[x] for x in train_gold_labels]
    labels = to_categorical(np.asarray(labels))

    # save train data
    _save_pickle(articles_emb, 'train_x.p')
    _save_pickle(labels, 'train_y.p')

    # save dev data
    _save_pickle(dev_articles_emb, 'dev_x.p')

else:
    # Reuse the cached embeddings and labels from an earlier run.
    articles_emb = _load_pickle('train_x.p')
    labels = _load_pickle('train_y.p')
    dev_articles_emb = _load_pickle('dev_x.p')

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Fixed seed so the held-out split -- and every validation metric computed on
# it -- is the same after a kernel restart.
SPLIT_SEED = 42

# 80/20 shuffled split, stratified on the label rows.
train_x, test_x, train_y, test_y = train_test_split(articles_emb,
                                                  labels,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  stratify=labels,
                                                  random_state=SPLIT_SEED)

In [14]:
from keras.layers import Conv2D, MaxPool2D, Bidirectional
from keras.layers import Embedding, Concatenate
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *

def model_MLP(emb_size, out_size):
    """Build a small feed-forward classifier over fixed-size input vectors.

    Architecture: Dense(128, relu) -> Dropout(0.5) -> Dense(out_size, softmax).
    The layer summary is printed as a side effect.

    Args:
        emb_size: dimensionality of each input vector.
        out_size: number of output units of the softmax layer.

    Returns:
        The Sequential model; it is not compiled here.
    """
    model = Sequential()
    model.add(Dense(128, input_dim=emb_size, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(out_size, activation='softmax'))
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

In [15]:
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam, SGD
from models import load_model
dev_template_labels_file = "datasets/dev-task-TC-template.out"

# Network dimensions: input width of the first Dense layer and number of
# softmax outputs. NOTE(review): these presumably have to match the embedding
# width and the number of one-hot label columns -- confirm if the data changes.
emb_size = 1024
out_size = 14

model = model_MLP(emb_size, out_size)

# Training hyper-parameters.
lr = 0.0001
bz = 256
epochs = 300

opt = Adam(lr=lr)

# A checkpoint is written after every epoch (save_best_only=False); the
# prediction cell later picks one of them by epoch number.
model_name = 'bert_text_Adam_lr%s_bz%s' % (lr, bz)
model_path = 'models/%s' % (model_name)
checkpoint = ModelCheckpoint('%s.{epoch:02d}.hdf5' % (model_path), monitor='loss', verbose=1,
                             save_best_only=False, mode='auto')

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
try:
    model.fit(train_x, train_y, validation_data=(test_x, test_y), epochs=epochs,
              batch_size=bz,
              shuffle=True, callbacks=[checkpoint])
except KeyboardInterrupt:
    # Stopping training by hand is fine: the per-epoch checkpoints written so
    # far remain usable. Any other error is a real failure and is no longer
    # swallowed silently.
    print('Training interrupted by user; checkpoints saved so far are kept.')






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               131200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 14)                1806      
Total params: 133,006
Trainable params: 133,006
Non-trainable params: 0
_________________________________________________________________
None


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 4903 samples, validate on 1226 samples
Epoch 1/300

Epoch 00001: saving model to models/bert_text_Adam_lr0.0001_bz256.01.hdf5
Epoch 2/300

Epoch 0000


Epoch 00027: saving model to models/bert_text_Adam_lr0.0001_bz256.27.hdf5
Epoch 28/300

Epoch 00028: saving model to models/bert_text_Adam_lr0.0001_bz256.28.hdf5
Epoch 29/300

Epoch 00029: saving model to models/bert_text_Adam_lr0.0001_bz256.29.hdf5
Epoch 30/300

Epoch 00030: saving model to models/bert_text_Adam_lr0.0001_bz256.30.hdf5
Epoch 31/300

Epoch 00031: saving model to models/bert_text_Adam_lr0.0001_bz256.31.hdf5
Epoch 32/300

Epoch 00032: saving model to models/bert_text_Adam_lr0.0001_bz256.32.hdf5
Epoch 33/300

Epoch 00033: saving model to models/bert_text_Adam_lr0.0001_bz256.33.hdf5
Epoch 34/300

Epoch 00034: saving model to models/bert_text_Adam_lr0.0001_bz256.34.hdf5
Epoch 35/300

Epoch 00035: saving model to models/bert_text_Adam_lr0.0001_bz256.35.hdf5
Epoch 36/300

Epoch 00036: saving model to models/bert_text_Adam_lr0.0001_bz256.36.hdf5
Epoch 37/300

Epoch 00037: saving model to models/bert_text_Adam_lr0.0001_bz256.37.hdf5
Epoch 38/300

Epoch 00038: saving model to mo

Epoch 66/300

Epoch 00066: saving model to models/bert_text_Adam_lr0.0001_bz256.66.hdf5
Epoch 67/300

Epoch 00067: saving model to models/bert_text_Adam_lr0.0001_bz256.67.hdf5
Epoch 68/300

Epoch 00068: saving model to models/bert_text_Adam_lr0.0001_bz256.68.hdf5
Epoch 69/300

Epoch 00069: saving model to models/bert_text_Adam_lr0.0001_bz256.69.hdf5
Epoch 70/300

Epoch 00070: saving model to models/bert_text_Adam_lr0.0001_bz256.70.hdf5
Epoch 71/300

Epoch 00071: saving model to models/bert_text_Adam_lr0.0001_bz256.71.hdf5
Epoch 72/300

Epoch 00072: saving model to models/bert_text_Adam_lr0.0001_bz256.72.hdf5
Epoch 73/300

Epoch 00073: saving model to models/bert_text_Adam_lr0.0001_bz256.73.hdf5
Epoch 74/300

Epoch 00074: saving model to models/bert_text_Adam_lr0.0001_bz256.74.hdf5
Epoch 75/300

Epoch 00075: saving model to models/bert_text_Adam_lr0.0001_bz256.75.hdf5
Epoch 76/300

Epoch 00076: saving model to models/bert_text_Adam_lr0.0001_bz256.76.hdf5
Epoch 77/300

Epoch 00077: savin


Epoch 00104: saving model to models/bert_text_Adam_lr0.0001_bz256.104.hdf5
Epoch 105/300

Epoch 00105: saving model to models/bert_text_Adam_lr0.0001_bz256.105.hdf5
Epoch 106/300

Epoch 00106: saving model to models/bert_text_Adam_lr0.0001_bz256.106.hdf5
Epoch 107/300

Epoch 00107: saving model to models/bert_text_Adam_lr0.0001_bz256.107.hdf5
Epoch 108/300

Epoch 00108: saving model to models/bert_text_Adam_lr0.0001_bz256.108.hdf5
Epoch 109/300

Epoch 00109: saving model to models/bert_text_Adam_lr0.0001_bz256.109.hdf5
Epoch 110/300

Epoch 00110: saving model to models/bert_text_Adam_lr0.0001_bz256.110.hdf5
Epoch 111/300

Epoch 00111: saving model to models/bert_text_Adam_lr0.0001_bz256.111.hdf5
Epoch 112/300

Epoch 00112: saving model to models/bert_text_Adam_lr0.0001_bz256.112.hdf5
Epoch 113/300

Epoch 00113: saving model to models/bert_text_Adam_lr0.0001_bz256.113.hdf5
Epoch 114/300

Epoch 00114: saving model to models/bert_text_Adam_lr0.0001_bz256.114.hdf5
Epoch 115/300

Epoch 001

Epoch 143/300

Epoch 00143: saving model to models/bert_text_Adam_lr0.0001_bz256.143.hdf5
Epoch 144/300

Epoch 00144: saving model to models/bert_text_Adam_lr0.0001_bz256.144.hdf5
Epoch 145/300

Epoch 00145: saving model to models/bert_text_Adam_lr0.0001_bz256.145.hdf5
Epoch 146/300

Epoch 00146: saving model to models/bert_text_Adam_lr0.0001_bz256.146.hdf5
Epoch 147/300

Epoch 00147: saving model to models/bert_text_Adam_lr0.0001_bz256.147.hdf5
Epoch 148/300

Epoch 00148: saving model to models/bert_text_Adam_lr0.0001_bz256.148.hdf5
Epoch 149/300

Epoch 00149: saving model to models/bert_text_Adam_lr0.0001_bz256.149.hdf5
Epoch 150/300

Epoch 00150: saving model to models/bert_text_Adam_lr0.0001_bz256.150.hdf5
Epoch 151/300

Epoch 00151: saving model to models/bert_text_Adam_lr0.0001_bz256.151.hdf5
Epoch 152/300

Epoch 00152: saving model to models/bert_text_Adam_lr0.0001_bz256.152.hdf5
Epoch 153/300

Epoch 00153: saving model to models/bert_text_Adam_lr0.0001_bz256.153.hdf5
Epoch 154/


Epoch 00181: saving model to models/bert_text_Adam_lr0.0001_bz256.181.hdf5
Epoch 182/300

Epoch 00182: saving model to models/bert_text_Adam_lr0.0001_bz256.182.hdf5
Epoch 183/300

Epoch 00183: saving model to models/bert_text_Adam_lr0.0001_bz256.183.hdf5
Epoch 184/300

Epoch 00184: saving model to models/bert_text_Adam_lr0.0001_bz256.184.hdf5
Epoch 185/300

Epoch 00185: saving model to models/bert_text_Adam_lr0.0001_bz256.185.hdf5
Epoch 186/300

Epoch 00186: saving model to models/bert_text_Adam_lr0.0001_bz256.186.hdf5
Epoch 187/300

Epoch 00187: saving model to models/bert_text_Adam_lr0.0001_bz256.187.hdf5
Epoch 188/300

Epoch 00188: saving model to models/bert_text_Adam_lr0.0001_bz256.188.hdf5
Epoch 189/300

Epoch 00189: saving model to models/bert_text_Adam_lr0.0001_bz256.189.hdf5
Epoch 190/300

Epoch 00190: saving model to models/bert_text_Adam_lr0.0001_bz256.190.hdf5
Epoch 191/300

Epoch 00191: saving model to models/bert_text_Adam_lr0.0001_bz256.191.hdf5
Epoch 192/300

Epoch 001


Epoch 00219: saving model to models/bert_text_Adam_lr0.0001_bz256.219.hdf5
Epoch 220/300

Epoch 00220: saving model to models/bert_text_Adam_lr0.0001_bz256.220.hdf5
Epoch 221/300

Epoch 00221: saving model to models/bert_text_Adam_lr0.0001_bz256.221.hdf5
Epoch 222/300

Epoch 00222: saving model to models/bert_text_Adam_lr0.0001_bz256.222.hdf5
Epoch 223/300

Epoch 00223: saving model to models/bert_text_Adam_lr0.0001_bz256.223.hdf5
Epoch 224/300

Epoch 00224: saving model to models/bert_text_Adam_lr0.0001_bz256.224.hdf5
Epoch 225/300

Epoch 00225: saving model to models/bert_text_Adam_lr0.0001_bz256.225.hdf5
Epoch 226/300

Epoch 00226: saving model to models/bert_text_Adam_lr0.0001_bz256.226.hdf5
Epoch 227/300

Epoch 00227: saving model to models/bert_text_Adam_lr0.0001_bz256.227.hdf5
Epoch 228/300

Epoch 00228: saving model to models/bert_text_Adam_lr0.0001_bz256.228.hdf5
Epoch 229/300

Epoch 00229: saving model to models/bert_text_Adam_lr0.0001_bz256.229.hdf5
Epoch 230/300

Epoch 002


Epoch 00257: saving model to models/bert_text_Adam_lr0.0001_bz256.257.hdf5
Epoch 258/300

Epoch 00258: saving model to models/bert_text_Adam_lr0.0001_bz256.258.hdf5
Epoch 259/300

Epoch 00259: saving model to models/bert_text_Adam_lr0.0001_bz256.259.hdf5
Epoch 260/300

Epoch 00260: saving model to models/bert_text_Adam_lr0.0001_bz256.260.hdf5
Epoch 261/300

Epoch 00261: saving model to models/bert_text_Adam_lr0.0001_bz256.261.hdf5
Epoch 262/300

Epoch 00262: saving model to models/bert_text_Adam_lr0.0001_bz256.262.hdf5
Epoch 263/300

Epoch 00263: saving model to models/bert_text_Adam_lr0.0001_bz256.263.hdf5
Epoch 264/300

Epoch 00264: saving model to models/bert_text_Adam_lr0.0001_bz256.264.hdf5
Epoch 265/300

Epoch 00265: saving model to models/bert_text_Adam_lr0.0001_bz256.265.hdf5
Epoch 266/300

Epoch 00266: saving model to models/bert_text_Adam_lr0.0001_bz256.266.hdf5
Epoch 267/300

Epoch 00267: saving model to models/bert_text_Adam_lr0.0001_bz256.267.hdf5
Epoch 268/300

Epoch 002


Epoch 00295: saving model to models/bert_text_Adam_lr0.0001_bz256.295.hdf5
Epoch 296/300

Epoch 00296: saving model to models/bert_text_Adam_lr0.0001_bz256.296.hdf5
Epoch 297/300

Epoch 00297: saving model to models/bert_text_Adam_lr0.0001_bz256.297.hdf5
Epoch 298/300

Epoch 00298: saving model to models/bert_text_Adam_lr0.0001_bz256.298.hdf5
Epoch 299/300

Epoch 00299: saving model to models/bert_text_Adam_lr0.0001_bz256.299.hdf5
Epoch 300/300

Epoch 00300: saving model to models/bert_text_Adam_lr0.0001_bz256.300.hdf5


In [16]:
# Ask which per-epoch checkpoint to load; the file name mirrors the pattern
# used when the checkpoints were written during training.
epoch = int(input("\n\nWhich epoch to load?\nAns: "))
load_model_path = '%s.%02d.hdf5' % (model_path, epoch)
print('Loading model - ', load_model_path)
model = load_model(load_model_path, custom_objects={'loss': 'categorical_crossentropy'})

# Index of the highest-scoring class for every dev embedding.
predictions = model.predict(dev_articles_emb).argmax(axis=1)

# writing predictions to file
task_TC_output_file = "BERT_model-output-TC.txt"
dev_template = read_predictions_from_file(dev_template_labels_file)
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = dev_template

# One tab-separated line per span: article id, predicted label, start, end.
row_format = "%s\t%s\t%s\t%s\n"
with open(task_TC_output_file, "w") as fout:
    for row in zip(dev_article_ids, predictions, dev_span_starts, dev_span_ends):
        article_id, prediction, span_start, span_end = row
        fout.write(row_format % (article_id, index2label[prediction], span_start, span_end))
print("Predictions written to file " + task_TC_output_file)



Which epoch to load?
Ans: 189
Loading model -  models/bert_text_Adam_lr0.0001_bz256.189.hdf5
Predictions written to file BERT_model-output-TC.txt
