# BERT Future Success

## Setup

In [None]:
# IPython shell escape: installs the `transformers` package imported in the next cell.
!pip install transformers

In [2]:
from google.colab import drive
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Resolve the TPU cluster, connect this runtime to it and initialise the TPU system.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# Distribution strategy whose scope is entered by train_model() and test_model().
# NOTE(review): newer TensorFlow releases appear to expose this class as
# tf.distribute.TPUStrategy - confirm against the installed version.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
# Mount Google Drive at PATH_GDRIVE, requesting a remount on every run of this cell.
PATH_GDRIVE = '/content/drive'
drive.mount(PATH_GDRIVE, force_remount=True)

In [5]:
# paths
# NOTE(review): DIR is relative while Drive is mounted at the absolute
# '/content/drive'; presumably this relies on the working directory being
# '/content' - confirm.
DIR = 'drive/MyDrive/MSc Dissertation/data/'
# CSV samples; formatted with (sample_name, split), split being 'train', 'val' or 'test'.
PATH_SAMPLES = DIR + 'samples/review_users/%s_%s.csv'
# Saved-model location; formatted with (sample_name, run_num).
PATH_MODELS = DIR + 'models/review_users/%s_%d'
# Results file prefix; formatted with (sample_name, run_num), then suffixed
# with '_train.txt' or '_test.txt' by the write_data_* helpers.
PATH_RESULTS = DIR + 'results/review_users/%s_%d'
# bert models
# Checkpoint used when the sample name starts with 'eng'.
BERT_MODEL_ENG = 'distilbert-base-uncased'
# Checkpoint used for every other sample name.
BERT_MODEL_NENG = 'distilbert-base-multilingual-cased'

## Data Reading and Writing

In [6]:
def read_data(sample_name, seed=None):
    """Load the train, validation and test CSV splits of a sample.

    Args:
        sample_name: Sample prefix substituted into ``PATH_SAMPLES``.
        seed: Currently unused by this function.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames, each restricted
        to the ``uid``, ``gid_mean_class`` and ``text`` columns and given a
        fresh positional index.
    """
    columns = ['uid', 'gid_mean_class', 'text']

    def load_split(split):
        # One CSV per split: PATH_SAMPLES is filled with (sample_name, split).
        frame = pd.read_csv(PATH_SAMPLES % (sample_name, split))
        return frame[columns].reset_index(drop=True)

    return tuple(load_split(split) for split in ('train', 'val', 'test'))

In [7]:
def write_data_train(sample_name, run_num, train_history):
    """Write the training log of one run to ``<results prefix>_train.txt``.

    The file holds two lines: ``str(train_history.params)`` followed by
    ``str(train_history.history)``.

    Args:
        sample_name: Sample prefix substituted into ``PATH_RESULTS``.
        run_num: Run number substituted into ``PATH_RESULTS``.
        train_history: Object exposing ``params`` and ``history`` attributes
            (``train_model`` passes the value returned by ``fit``).
    """
    target = PATH_RESULTS % (sample_name, run_num) + '_train.txt'
    with open(target, 'w+') as out:
        for record in (train_history.params, train_history.history):
            # print() emits str(record) followed by a newline.
            print(record, file=out)

In [13]:
def write_data_test(sample_name, run_num, Y_test, Y_pred, uids):
    """Write the evaluation output of one run to ``<results prefix>_test.txt``.

    The file holds three lines, each the ``str()`` of one argument, in this
    order: ``Y_test``, ``Y_pred``, ``uids``.

    Args:
        sample_name: Sample prefix substituted into ``PATH_RESULTS``.
        run_num: Run number substituted into ``PATH_RESULTS``.
        Y_test: Reference labels.
        Y_pred: Predicted labels.
        uids: Identifiers of the evaluated rows.
    """
    target = PATH_RESULTS % (sample_name, run_num) + '_test.txt'
    with open(target, 'w+') as out:
        for record in (Y_test, Y_pred, uids):
            # print() emits str(record) followed by a newline.
            print(record, file=out)

## Model Training and Testing


In [9]:
def train_model(sample_name, train_data, validation_data, run_num=0, num_epochs=2, batch_size=16, learning_rate=5e-5, num_labels=6, seed=None):
    """Fine-tune a DistilBERT sequence classifier and save it with its training log.

    The pre-trained checkpoint is ``BERT_MODEL_ENG`` when ``sample_name``
    starts with ``'eng'`` and ``BERT_MODEL_NENG`` otherwise. After fitting,
    the model is saved to ``PATH_MODELS % (sample_name, run_num)`` and the fit
    history is written through ``write_data_train``.

    Args:
        sample_name: Sample name; selects the checkpoint and the output paths.
        train_data: DataFrame with ``text`` and ``gid_mean_class`` columns.
        validation_data: DataFrame with the same two columns.
        run_num: Run number used in the output paths.
        num_epochs: Number of epochs passed to ``fit``.
        batch_size: Batch size applied to both datasets.
        learning_rate: Learning rate of the Adam optimizer.
        num_labels: Number of output classes of the classification head.
        seed: Seed forwarded to the dataset shuffle.
    """
    model_name = BERT_MODEL_ENG if sample_name.startswith('eng') else BERT_MODEL_NENG

    with tpu_strategy.scope():
        # load the pre-trained tokenizer and classifier
        tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        classifier = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        def to_dataset(frame):
            # Tokenize every text of the frame at once and pair the encodings
            # with the class labels; shuffling uses a 100-element buffer.
            texts = frame['text'].tolist()
            labels = frame['gid_mean_class'].tolist()
            encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='tf')
            return tf.data.Dataset.from_tensor_slices((dict(encodings), labels)).shuffle(100, seed=seed)

        # Dedicated names instead of rebinding the DataFrame parameters.
        train_dataset = to_dataset(train_data)
        validation_dataset = to_dataset(validation_data)
        # the model emits raw logits, hence from_logits=True
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-8)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        classifier.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # The datasets are batched explicitly, so `batch_size` is deliberately not
    # passed to fit(): Keras documents that it must not be specified for
    # tf.data inputs.
    fit_results = classifier.fit(
        train_dataset.batch(batch_size),
        epochs=num_epochs,
        validation_data=validation_dataset.batch(batch_size),
    )
    classifier.save_pretrained(PATH_MODELS % (sample_name, run_num))
    write_data_train(sample_name, run_num, fit_results)

In [15]:
def test_model(sample_name, test_data, run_num=0, batch_size=16, num_labels=6, seed=None):
    """Predict classes for the test split with a saved model and write them out.

    The tokenizer comes from ``BERT_MODEL_ENG`` when ``sample_name`` starts
    with ``'eng'`` and from ``BERT_MODEL_NENG`` otherwise; the classifier is
    loaded from ``PATH_MODELS % (sample_name, run_num)``. Reference labels,
    predicted labels and uids are written through ``write_data_test``.

    Args:
        sample_name: Sample name; selects the tokenizer and the model path.
        test_data: DataFrame with ``uid``, ``text`` and ``gid_mean_class``
            columns.
        run_num: Run number identifying the saved model and the output file.
        batch_size: Batch size used for prediction.
        num_labels: Number of output classes of the classification head.
        seed: Currently unused by this function.

    Raises:
        ValueError: If the number of predictions differs from the number of
            test rows, so misaligned results are never written.
    """
    model_name = BERT_MODEL_ENG if sample_name.startswith('eng') else BERT_MODEL_NENG

    with tpu_strategy.scope():
        # load the tokenizer and the fine-tuned classifier
        tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        classifier = TFDistilBertForSequenceClassification.from_pretrained(PATH_MODELS % (sample_name, run_num), num_labels=num_labels)
        # separate data
        uids = test_data['uid'].tolist()
        texts = test_data['text'].tolist()
        y_true = test_data['gid_mean_class'].tolist()
        # encode and batch the data: predict() expects a dataset that already
        # yields batches, so one row of logits comes back per test example and
        # no fixed-stride sub-sampling of the output is required.
        encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='tf')
        dataset = tf.data.Dataset.from_tensor_slices(dict(encodings)).batch(batch_size)
        logits = classifier.predict(dataset).logits
        # softmax is monotonic, so argmax over the raw logits gives the same class
        y_pred = tf.argmax(logits, axis=1).numpy().tolist()

    if len(y_pred) != len(y_true):
        raise ValueError(
            f'expected {len(y_true)} predictions, got {len(y_pred)}'
        )
    write_data_test(sample_name, run_num, y_true, y_pred, uids)

## Running Everything

In [14]:
# Hyper-parameter grid: every batch size is combined with every epoch count
# by the grid-search loop.
batch_sizes = [16, 32, 64]
num_epochs = [2, 3, 4]
# Run number substituted into the model/result paths; the grid-search loop
# increments it after each run.
run_num = 0
sample_name = 'eng_160k'
# Seed forwarded to read_data(), train_model() and test_model().
seed = 1337
train, validation, test = read_data(sample_name, seed=seed)

In [None]:
# Grid search: one train/test cycle per (batch size, epoch count) pair.
for batch_size in batch_sizes:
    for num_epoch in num_epochs:
        print(f'=== RUN_NUM={run_num} ===')
        # Keyword arguments common to the training and the evaluation call.
        shared = dict(run_num=run_num, seed=seed, batch_size=batch_size)
        train_model(sample_name, train, validation, num_epochs=num_epoch, **shared)
        test_model(sample_name, test, **shared)
        # Next combination gets the next run number.
        run_num += 1

In [11]:
# Samples used for the final runs; the loops that consume this list parse the
# last character of each name as the number of labels.
sample_names = [
    'eng_160k_6', 'eng_160k_3',
    'any_160k_6', 'any_160k_3',
]
# Seed forwarded to read_data(), train_model() and test_model().
seed=1337

In [None]:
# Train one model per sample with fixed settings (run number 10, batch size 64).
train_options = dict(run_num=10, batch_size=64, seed=seed)
for sample_name in sample_names:
    # The last character of the sample name is the number of classes.
    num_labels = int(sample_name[-1])
    train, validation, test = read_data(sample_name, seed=seed)
    train_model(sample_name, train, validation, num_labels=num_labels, **train_options)

In [None]:
# Evaluate the run-10 model of every sample on that sample's test split.
test_options = dict(run_num=10, batch_size=64, seed=seed)
for sample_name in sample_names:
    # The last character of the sample name is the number of classes.
    num_labels = int(sample_name[-1])
    # Only the test split is needed here; the other two are discarded.
    _, _, test = read_data(sample_name, seed=seed)
    test_model(sample_name, test, num_labels=num_labels, **test_options)