# Multi-task classification with BERT

In [None]:
# tensorflow-related
import tensorflow as tf
import tensorflow_hub as hub
from bert import optimization, tokenization, modeling
# others
import pandas as pd
import numpy as np
import sys
from spellchecker import SpellChecker
import re

In [2]:
# import our re-implementations
sys.path.insert(0, '../bert_reimplementations')
import run_classifier_adapted as rc

## Text preprocessing

### Minor spell-checking
We propose to apply three forms of text normalization:
1. We reduce contiguous repetitions of the same character to at most two consecutive repetitions. *Example: "Loooose" -> "Loose"*.
2. We correct words' spelling if the correction requires changing at most two characters.
3. Lower-case the data since that's how the pre-trained model was trained.

In practice, the second step (spelling correction) takes very long, as the comment texts can be pretty long. We've thus decided to omit this kind of preprocessing.

In [3]:
# cap runs of a repeated character at two occurrences
def normalize_character_repetition(word):
    """Shorten excessive character repetition in ``word``.

    Every run of three or more identical characters is replaced by exactly
    two copies of that character; shorter runs are left alone. Newlines are
    never collapsed because ``.`` does not match them.

    Example: "Loooose" -> "Loose".
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", word)
# correct a full sentence. We won't be using this one
def sentence_correction(sentence, spellchecker):
    """Spell-correct every whitespace-separated word of ``sentence``.

    Each word first has its character repetitions normalized and is then
    passed to ``spellchecker.correction``. If the spellchecker returns
    ``None`` for a word, the normalized word is kept instead, so the final
    join never receives a non-string.

    Args:
        sentence: text to correct.
        spellchecker: object exposing a ``correction(word)`` method.

    Returns:
        The corrected words joined by single spaces.
    """
    corrected_words = []
    for word in sentence.split():
        normalized = normalize_character_repetition(word)
        corrected = spellchecker.correction(normalized)
        # recent pyspellchecker releases return None when no candidate is
        # found; keep the normalized word in that case
        corrected_words.append(normalized if corrected is None else corrected)
    return " ".join(corrected_words)
# instantiate spellchecker object; it is only meant to be passed to
# sentence_correction, which the cells shown here never call
spellchecker = SpellChecker()

In [4]:
# original data
train, test = (pd.read_csv(csv_path) for csv_path in ("../data/train.csv", "../data/test.csv"))
# lower-case every comment, then cap character repetitions (same treatment for both sets)
for frame in (train, test):
    frame["comment_text"] = frame["comment_text"].apply(
        lambda text: normalize_character_repetition(text.lower())
    )

### Data split
We'll leave out 10% of the training data as a validation set.

In [5]:
# fraction of the labelled data kept for training; the rest becomes the validation set
TRAIN_VAL_RATIO = 0.9
LEN = train.shape[0]
SIZE_TRAIN = int(TRAIN_VAL_RATIO*LEN)
# fixed seed so the train/validation split is identical on every run
SPLIT_SEED = 42

# shuffle the train set: draw a permutation of row *positions* (which is what
# .iloc expects, regardless of the frame's index labels) from a dedicated,
# seeded generator
indices = np.random.RandomState(SPLIT_SEED).permutation(LEN).tolist()
train = train.iloc[indices]
# subset it: first SIZE_TRAIN shuffled rows for training, the remainder for validation
val = train.iloc[SIZE_TRAIN:]
train = train.iloc[:SIZE_TRAIN]

### BERT-style formatting
The Google researchers' repository describes a specific format for us to feed our data to their model. This section takes care of that.

In [6]:
# column-name constants, in the spirit of the Google repo
ID = "id"
DATA_COLUMN = "comment_text"
# the six target columns
LABEL_COLUMNS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [7]:
# extract training and test examples into bert-friendly objects
# The label list comes from LABEL_COLUMNS (which also determines num_labels for
# the model) instead of from column positions, and the id / text columns are
# addressed through the ID and DATA_COLUMN constants.
label_names = list(LABEL_COLUMNS)


def _to_input_examples(frame, labelled=True):
    """Convert each row of ``frame`` into an ``rc.InputExample``.

    Args:
        frame: DataFrame holding the ``ID`` and ``DATA_COLUMN`` columns and,
            when ``labelled`` is true, every column in ``label_names``.
        labelled: if false, every example gets an all-zero placeholder label
            list of the same length as ``label_names``.

    Returns:
        A Series of ``rc.InputExample`` objects, one per row, sharing the
        index of ``frame``.
    """
    placeholder = [0] * len(label_names)
    return frame.apply(
        lambda row: rc.InputExample(
            guid=row[ID],
            text_a=row[DATA_COLUMN],
            # a fresh list per example, as with the labelled rows
            labels=row[label_names].tolist() if labelled else list(placeholder),
        ),
        axis=1,
    )


train_InputExamples = _to_input_examples(train)
val_InputExamples = _to_input_examples(val)
# the test set has no labels; zeros are used as placeholders
test_InputExamples = _to_input_examples(test, labelled=False)

### Word embedding
The pre-trained model's vocabulary, checkpoint and model configuration are available for download [here](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip?source=post_page---------------------------). The Google researchers' code uses a custom tokenizer to vectorize each word before feeding the data to the network.

In [8]:
# files of the downloaded pre-trained model
BERT_VOCAB = "../bert_pretrained/vocab.txt"              # words to indices mapping
BERT_INIT_CHKPNT = "../bert_pretrained/bert_model.ckpt"  # pre-trained model weights
BERT_CONFIG = "../bert_pretrained/bert_config.json"      # BERT model architecture

# max sequence length
MAX_SEQ_LENGTH = 500

# tokenization: check the lower-casing flag against the checkpoint, then build
# a lower-casing tokenizer on top of the vocabulary file
tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
tokenizer = tokenization.FullTokenizer(
    do_lower_case=True,
    vocab_file=BERT_VOCAB,
)

In [None]:
# embedding: convert each set of examples (train, validation, test — in that order)
# into features with the shared tokenizer and sequence length
train_features, val_features, test_features = (
    rc.convert_examples_to_features(examples, MAX_SEQ_LENGTH, tokenizer)
    for examples in (train_InputExamples, val_InputExamples, test_InputExamples)
)

In [None]:
# Create the input functions like the original repo does;
# only the training one is built with is_training=True
train_input_fn, val_input_fn, test_input_fn = (
    rc.input_fn_builder(
        features=feature_set,
        seq_length=MAX_SEQ_LENGTH,
        is_training=training,
        drop_remainder=False,
    )
    for feature_set, training in (
        (train_features, True),
        (val_features, False),
        (test_features, False),
    )
)

## Model construction

In [None]:
# the researchers copied these hyperparams from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
# then I copied them from the researchers
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Compute # train and warmup steps from batch size.
# The training examples live in train_InputExamples (built earlier in this
# notebook); one step consumes BATCH_SIZE of them.
num_train_steps = int(len(train_InputExamples) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
# Model configs: how often (in steps) to write checkpoints / summaries, and where
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 500
OUTPUT_DIR = "../bert_reimplementations/output"

In [None]:
# Estimator run configuration: where checkpoints/summaries are written, how
# often (in steps) each is saved, and how many checkpoints are retained
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    keep_checkpoint_max=1,
)

In [None]:
# load the architecture description of the pre-trained model
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
# NOTE(review): the bare name `model_fn_builder` is never imported in this
# notebook; like the other builders used above it is taken from the adapted
# run_classifier module (`rc`) — confirm it is defined there.
model_fn = rc.model_fn_builder(
    bert_config=bert_config,
    num_labels=len(LABEL_COLUMNS),  # one output per label column
    init_checkpoint=BERT_INIT_CHKPNT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=False,
)

# wrap the model function in an Estimator configured by run_config
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={"batch_size": BATCH_SIZE},
)

In [None]:
# fine-tune on the training input function, up to num_train_steps steps.
# NOTE(review): per the tf.estimator docs, max_steps counts total global steps,
# so a checkpoint already present in OUTPUT_DIR at or beyond that step would
# make this call a no-op — confirm before re-running.
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)