In [1]:
from bert_serving.client import BertClient
import numpy as np
import pandas as pd

In [2]:
from tqdm import tqdm
from preprocess import read_articles_from_file_list, read_predictions_from_file, label2index
bc = BertClient(ip='10.2.0.111')

Using TensorFlow backend.


In [4]:
a = bc.encode(['First do it', 'then do it right', 'then do it better'])
a.shape

(3, 4096)

In [4]:
train_folder = "datasets/train-articles"  # check that the path to the datasets folder is correct,
dev_folder = "datasets/dev-articles"  # if not adjust these variables accordingly
test_folder = "datasets/test-articles"  # if not adjust these variables accordingly

train_labels_file = "datasets/train-task2-TC.labels"
dev_template_labels_file = "datasets/dev-task-TC-template.out"
# task_TC_output_file = "baseline-output-TC.txt"

test_template_labels_file = "datasets/test-task-TC-template.out"
# test_task_TC_output_file = "test-output-TC.txt"

In [5]:
def clean_text(text):
    # text = text.lower()
    # text = text.replace('\'', '')
    text = text.replace('‘', ' \' ')
    text = text.replace('’', ' \' ')
    text = text.replace('“', ' \" ')
    text = text.replace('”', ' \" ')
    text = text.replace('â', ' \' ')

    text = text.replace('"', ' " ')
    text = text.replace('\'', ' \' ')

    text = text.replace('—', ' - ')
    text = text.replace('–', ' - ')
    text = text.replace('…', '...')
    text = text.replace('  ', ' ')
    text = text.strip()
    return text

In [6]:
articles = read_articles_from_file_list(train_folder)
dev_articles = read_articles_from_file_list(dev_folder)
test_articles = read_articles_from_file_list(test_folder)
ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels = read_predictions_from_file(train_labels_file)
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = read_predictions_from_file(dev_template_labels_file)
test_article_ids, test_span_starts, test_span_ends, test_labels = read_predictions_from_file(test_template_labels_file)
print("Loaded %d annotations from %d articles" % (len(ref_span_starts), len(set(ref_articles_id))))


Loaded 6129 annotations from 357 articles


In [7]:
def compute_features(articles, ref_articles_id, span_starts, span_ends):
    # only one feature, the length of the span
    print(type(span_starts), len(span_starts))
    print(type(span_ends), len(span_ends))
    data = []
    article_spans = []
    for i, ref_id in tqdm(enumerate(ref_articles_id)):
        # print(articles[ref_id], span_starts[i], span_ends[i])
        article = articles[ref_id]
        article_span = clean_text(article[int(span_starts[i]):int(span_ends[i])])
        data.append([article_span])
        article_spans.append(article_span)
        
    return article_spans

In [8]:
articles = compute_features(articles, ref_articles_id, ref_span_starts, ref_span_ends)


6129it [00:00, 34163.16it/s]

<class 'list'> 6129
<class 'list'> 6129





In [9]:
dev_articles = compute_features(dev_articles, dev_article_ids, dev_span_starts, dev_span_ends)

1063it [00:00, 222105.47it/s]

<class 'list'> 1063
<class 'list'> 1063





In [10]:
test_articles = compute_features(test_articles, test_article_ids, test_span_starts, test_span_ends)

1790it [00:00, 230576.58it/s]

<class 'list'> 1790
<class 'list'> 1790





In [11]:
from keras.utils import to_categorical
from pprint import pprint
import pickle
from preprocess import label2index, index2label

In [12]:
GET_NEW_BERT_EMB = False

In [13]:
data_path = './bert_processed_data/'
if GET_NEW_BERT_EMB:
    articles_emb = bc.encode(articles)
    dev_articles_emb = bc.encode(dev_articles)
    test_articles_emb = bc.encode(test_articles)
    pprint(set(train_gold_labels))

    labels = [label2index[x] for x in train_gold_labels]
    labels = to_categorical(np.asarray(labels))

    # save train data
    pickle.dump(articles_emb, open(data_path + 'train_x.p', 'wb'))
    pickle.dump(labels, open(data_path + 'train_y.p', 'wb'))

    # save dev data
    pickle.dump(dev_articles_emb, open(data_path + 'dev_x.p', 'wb'))
    pickle.dump(dev_articles_emb, open(data_path + 'test_x.p', 'wb'))
    
else:
    articles_emb = pickle.load(open(data_path + 'train_x.p', 'rb'))
    labels = pickle.load(open(data_path + 'train_y.p', 'rb'))
    dev_articles_emb = pickle.load(open(data_path + 'dev_x.p', 'rb'))
    test_articles_emb = pickle.load(open(data_path + 'test_x.p', 'rb'))

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


{'Appeal_to_Authority',
 'Appeal_to_fear-prejudice',
 'Bandwagon,Reductio_ad_hitlerum',
 'Black-and-White_Fallacy',
 'Causal_Oversimplification',
 'Doubt',
 'Exaggeration,Minimisation',
 'Flag-Waving',
 'Loaded_Language',
 'Name_Calling,Labeling',
 'Repetition',
 'Slogans',
 'Thought-terminating_Cliches',
 'Whataboutism,Straw_Men,Red_Herring'}


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_x, test_x, train_y, test_y = train_test_split(articles_emb,
                                                  labels,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  stratify=labels)

In [16]:
from keras.layers import Conv2D, MaxPool2D, Bidirectional
from keras.layers import Embedding, Concatenate
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *

def model_MLP(emb_size, out_size):
    model = Sequential()
    model.add(Dense(128, input_dim=emb_size, activation='relu'))
    model.add(Dropout(0.5))
#     model.add(Dense(64, activation='relu'))
#     model.add(Dropout(0.5))
    model.add(Dense(out_size, activation='softmax'))
    print(model.summary())
    return model

In [30]:
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam, SGD
from models import load_model


emb_size = articles_emb[0].shape[0]
print(emb_size)
out_size = 14

model = model_MLP(emb_size, out_size)

lr = 0.0001
bz = 256
epochs = 250

opt = Adam(lr=lr)
# opt = SGD(0.01)
# print(str(opt))
# exit()
min_loss_arg = -1
max_acc_arg = -1
model_name = 'bert_text_Adam_lr%s_bz%s' % (lr, bz)
model_path = 'models/%s' % (model_name)
checkpoint = ModelCheckpoint('%s.{epoch:02d}.hdf5' % (model_path), monitor='loss', verbose=1,
                             save_best_only=False, mode='auto')

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
try:
    history = model.fit(train_x, train_y, validation_data=[test_x, test_y], epochs=epochs,
              batch_size=bz,
              shuffle=True, callbacks=[checkpoint])
    loss_hist = history.history['val_loss']
    acc_hist = history.history['val_acc']
    min_loss_arg = np.argmin(loss_hist)
    max_acc_arg = np.argmax(acc_hist)
except Exception as e:
    print(e)
    min_loss_arg = -1
    max_acc_arg = -1


4096
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 128)               524416    
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 14)                1806      
Total params: 526,222
Trainable params: 526,222
Non-trainable params: 0
_________________________________________________________________
None
Train on 4903 samples, validate on 1226 samples
Epoch 1/250

Epoch 00001: saving model to models/bert_text_Adam_lr0.0001_bz256.01.hdf5
Epoch 2/250

Epoch 00002: saving model to models/bert_text_Adam_lr0.0001_bz256.02.hdf5
Epoch 3/250

Epoch 00003: saving model to models/bert_text_Adam_lr0.0001_bz256.03.hdf5
Epoch 4/250

Epoch 00004: saving model to models/bert_text_

Epoch 36/250

Epoch 00036: saving model to models/bert_text_Adam_lr0.0001_bz256.36.hdf5
Epoch 37/250

Epoch 00037: saving model to models/bert_text_Adam_lr0.0001_bz256.37.hdf5
Epoch 38/250

Epoch 00038: saving model to models/bert_text_Adam_lr0.0001_bz256.38.hdf5
Epoch 39/250

Epoch 00039: saving model to models/bert_text_Adam_lr0.0001_bz256.39.hdf5
Epoch 40/250

Epoch 00040: saving model to models/bert_text_Adam_lr0.0001_bz256.40.hdf5
Epoch 41/250

Epoch 00041: saving model to models/bert_text_Adam_lr0.0001_bz256.41.hdf5
Epoch 42/250

Epoch 00042: saving model to models/bert_text_Adam_lr0.0001_bz256.42.hdf5
Epoch 43/250

Epoch 00043: saving model to models/bert_text_Adam_lr0.0001_bz256.43.hdf5
Epoch 44/250

Epoch 00044: saving model to models/bert_text_Adam_lr0.0001_bz256.44.hdf5
Epoch 45/250

Epoch 00045: saving model to models/bert_text_Adam_lr0.0001_bz256.45.hdf5
Epoch 46/250

Epoch 00046: saving model to models/bert_text_Adam_lr0.0001_bz256.46.hdf5
Epoch 47/250

Epoch 00047: savin


Epoch 00074: saving model to models/bert_text_Adam_lr0.0001_bz256.74.hdf5
Epoch 75/250

Epoch 00075: saving model to models/bert_text_Adam_lr0.0001_bz256.75.hdf5
Epoch 76/250

Epoch 00076: saving model to models/bert_text_Adam_lr0.0001_bz256.76.hdf5
Epoch 77/250

Epoch 00077: saving model to models/bert_text_Adam_lr0.0001_bz256.77.hdf5
Epoch 78/250

Epoch 00078: saving model to models/bert_text_Adam_lr0.0001_bz256.78.hdf5
Epoch 79/250

Epoch 00079: saving model to models/bert_text_Adam_lr0.0001_bz256.79.hdf5
Epoch 80/250

Epoch 00080: saving model to models/bert_text_Adam_lr0.0001_bz256.80.hdf5
Epoch 81/250

Epoch 00081: saving model to models/bert_text_Adam_lr0.0001_bz256.81.hdf5
Epoch 82/250

Epoch 00082: saving model to models/bert_text_Adam_lr0.0001_bz256.82.hdf5
Epoch 83/250

Epoch 00083: saving model to models/bert_text_Adam_lr0.0001_bz256.83.hdf5
Epoch 84/250

Epoch 00084: saving model to models/bert_text_Adam_lr0.0001_bz256.84.hdf5
Epoch 85/250

Epoch 00085: saving model to mo


Epoch 00113: saving model to models/bert_text_Adam_lr0.0001_bz256.113.hdf5
Epoch 114/250

Epoch 00114: saving model to models/bert_text_Adam_lr0.0001_bz256.114.hdf5
Epoch 115/250

Epoch 00115: saving model to models/bert_text_Adam_lr0.0001_bz256.115.hdf5
Epoch 116/250

Epoch 00116: saving model to models/bert_text_Adam_lr0.0001_bz256.116.hdf5
Epoch 117/250

Epoch 00117: saving model to models/bert_text_Adam_lr0.0001_bz256.117.hdf5
Epoch 118/250

Epoch 00118: saving model to models/bert_text_Adam_lr0.0001_bz256.118.hdf5
Epoch 119/250

Epoch 00119: saving model to models/bert_text_Adam_lr0.0001_bz256.119.hdf5
Epoch 120/250

Epoch 00120: saving model to models/bert_text_Adam_lr0.0001_bz256.120.hdf5
Epoch 121/250

Epoch 00121: saving model to models/bert_text_Adam_lr0.0001_bz256.121.hdf5
Epoch 122/250

Epoch 00122: saving model to models/bert_text_Adam_lr0.0001_bz256.122.hdf5
Epoch 123/250

Epoch 00123: saving model to models/bert_text_Adam_lr0.0001_bz256.123.hdf5
Epoch 124/250

Epoch 001


Epoch 00151: saving model to models/bert_text_Adam_lr0.0001_bz256.151.hdf5
Epoch 152/250

Epoch 00152: saving model to models/bert_text_Adam_lr0.0001_bz256.152.hdf5
Epoch 153/250

Epoch 00153: saving model to models/bert_text_Adam_lr0.0001_bz256.153.hdf5
Epoch 154/250

Epoch 00154: saving model to models/bert_text_Adam_lr0.0001_bz256.154.hdf5
Epoch 155/250

Epoch 00155: saving model to models/bert_text_Adam_lr0.0001_bz256.155.hdf5
Epoch 156/250

Epoch 00156: saving model to models/bert_text_Adam_lr0.0001_bz256.156.hdf5
Epoch 157/250

Epoch 00157: saving model to models/bert_text_Adam_lr0.0001_bz256.157.hdf5
Epoch 158/250

Epoch 00158: saving model to models/bert_text_Adam_lr0.0001_bz256.158.hdf5
Epoch 159/250

Epoch 00159: saving model to models/bert_text_Adam_lr0.0001_bz256.159.hdf5
Epoch 160/250

Epoch 00160: saving model to models/bert_text_Adam_lr0.0001_bz256.160.hdf5
Epoch 161/250

Epoch 00161: saving model to models/bert_text_Adam_lr0.0001_bz256.161.hdf5
Epoch 162/250

Epoch 001


Epoch 00189: saving model to models/bert_text_Adam_lr0.0001_bz256.189.hdf5
Epoch 190/250

Epoch 00190: saving model to models/bert_text_Adam_lr0.0001_bz256.190.hdf5
Epoch 191/250

Epoch 00191: saving model to models/bert_text_Adam_lr0.0001_bz256.191.hdf5
Epoch 192/250

Epoch 00192: saving model to models/bert_text_Adam_lr0.0001_bz256.192.hdf5
Epoch 193/250

Epoch 00193: saving model to models/bert_text_Adam_lr0.0001_bz256.193.hdf5
Epoch 194/250

Epoch 00194: saving model to models/bert_text_Adam_lr0.0001_bz256.194.hdf5
Epoch 195/250

Epoch 00195: saving model to models/bert_text_Adam_lr0.0001_bz256.195.hdf5
Epoch 196/250

Epoch 00196: saving model to models/bert_text_Adam_lr0.0001_bz256.196.hdf5
Epoch 197/250

Epoch 00197: saving model to models/bert_text_Adam_lr0.0001_bz256.197.hdf5
Epoch 198/250

Epoch 00198: saving model to models/bert_text_Adam_lr0.0001_bz256.198.hdf5
Epoch 199/250

Epoch 00199: saving model to models/bert_text_Adam_lr0.0001_bz256.199.hdf5
Epoch 200/250

Epoch 002


Epoch 00227: saving model to models/bert_text_Adam_lr0.0001_bz256.227.hdf5
Epoch 228/250

Epoch 00228: saving model to models/bert_text_Adam_lr0.0001_bz256.228.hdf5
Epoch 229/250

Epoch 00229: saving model to models/bert_text_Adam_lr0.0001_bz256.229.hdf5
Epoch 230/250

Epoch 00230: saving model to models/bert_text_Adam_lr0.0001_bz256.230.hdf5
Epoch 231/250

Epoch 00231: saving model to models/bert_text_Adam_lr0.0001_bz256.231.hdf5
Epoch 232/250

Epoch 00232: saving model to models/bert_text_Adam_lr0.0001_bz256.232.hdf5
Epoch 233/250

Epoch 00233: saving model to models/bert_text_Adam_lr0.0001_bz256.233.hdf5
Epoch 234/250

Epoch 00234: saving model to models/bert_text_Adam_lr0.0001_bz256.234.hdf5
Epoch 235/250

Epoch 00235: saving model to models/bert_text_Adam_lr0.0001_bz256.235.hdf5
Epoch 236/250

Epoch 00236: saving model to models/bert_text_Adam_lr0.0001_bz256.236.hdf5
Epoch 237/250

Epoch 00237: saving model to models/bert_text_Adam_lr0.0001_bz256.237.hdf5
Epoch 238/250

Epoch 002

In [31]:
print('max_acc_arg', max_acc_arg)
print('min_loss_arg', min_loss_arg)

78
60


In [32]:
epoch = input("\n\nWhich epoch to load?\nAns: ")
epoch = int(epoch)
load_model_path = '%s.%02d.hdf5' % (model_path, epoch)
print('Loading model - ', load_model_path)
model = load_model(load_model_path, custom_objects={'loss': 'categorical_crossentropy'})
# print(model.summary())



Which epoch to load?
Ans: 78
Loading model -  models/bert_text_Adam_lr0.0001_bz256.78.hdf5


In [33]:
predictions = model.predict(dev_articles_emb)
predictions = predictions.argmax(axis=1)

# writing predictions to file
task_TC_output_file = "BERT_model-output-TC.txt"
dev_article_ids, dev_span_starts, dev_span_ends, dev_labels = read_predictions_from_file(dev_template_labels_file)

with open(task_TC_output_file, "w") as fout:
    for article_id, prediction, span_start, span_end in zip(dev_article_ids, predictions, dev_span_starts,
                                                            dev_span_ends):
        fout.write("%s\t%s\t%s\t%s\n" % (article_id, index2label[prediction], span_start, span_end))
print("Predictions written to file " + task_TC_output_file)

Predictions written to file BERT_model-output-TC.txt


In [34]:
predictions = model.predict(test_articles_emb)
predictions = predictions.argmax(axis=1)

# writing predictions to file
test_task_TC_output_file = "test-output-TC.txt"
test_article_ids, test_span_starts, test_span_ends, test_labels = read_predictions_from_file(test_template_labels_file)

with open(test_task_TC_output_file, "w") as fout:
    for article_id, prediction, span_start, span_end in zip(test_article_ids, predictions, test_span_starts,
                                                            test_span_ends):
        fout.write("%s\t%s\t%s\t%s\n" % (article_id, index2label[prediction], span_start, span_end))
print("Predictions written to file " + test_task_TC_output_file)

Predictions written to file test-output-TC.txt
