In [0]:
# Copyright 2019 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<a href="https://colab.research.google.com/github/kpe/bert-for-tf2/blob/master/examples/tpu_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting Hate Speech Tweets With BERT

We are using bert-tensorflow for this classification task. At the moment I'm making sure it's tensorflow version 1.x because tensorflow version 2 gives issues with Bert at the moment. I believe Tensorflow hopes to have this issue resolved in tensorflow v 2.1

We are using a TPU as a GPU does not have the required memory for Large BERT models- it can only cope with the base model. We'll see if there a TPU detected and we'll set it to a global environment variable so it can be accessed by our BERT functions later.

In [0]:
!pip install gcsfs 
import pandas as pd
import numpy as np

#Make sure to use tensorflow version 1.x, version 2 doesn't work with bert
%tensorflow_version 1.x
import tensorflow as tf
#!pip install gast==0.2.2
import os

#For cross-validation and grid search
from itertools import product
from tensorflow.python.summary.summary_iterator import summary_iterator
from google.cloud import storage
import ipywidgets as widgets
from IPython.display import display

import sklearn
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import metrics


import html
import re
import json
import pprint
import random
import string
import nltk
from datetime import datetime
import time


assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

#Below we give ourselves as well as the TPU access to our private GCS bucket
from google.colab import auth
auth.authenticate_user()
tf.reset_default_graph()  
with tf.Session(TPU_ADDRESS) as session:
  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

USE_TPU=True
try:
  #tf.config.experimental_connect_to_host(TPU_ADDRESS)
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_ADDRESS)
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
except Exception as ex:
  print(ex)
  USE_TPU=False

print("        USE_TPU:", USE_TPU)
print("Eager Execution:", tf.executing_eagerly())

assert not tf.executing_eagerly(), "Eager execution on TPUs have issues currently"

Collecting gcsfs
  Downloading https://files.pythonhosted.org/packages/18/3b/454be7c97d05e15eb20a0099f425f0ed6b7552e352c77adb923c3872ba14/gcsfs-0.6.1-py2.py3-none-any.whl
Installing collected packages: gcsfs
Successfully installed gcsfs-0.6.1
TensorFlow 1.x selected.
TPU address is grpc://10.38.114.34:8470
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Initializing the TPU system: 10.38.114.34:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Querying Tensorflow master (grpc://10.38.114.34:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tenso

Setting a random seed for reproducability of results and checking version of tensorflow

In [0]:
# Setting the graph-level random seed for the default graph. Different than operation level seed
SEED = 3060
tf.reset_default_graph()
os.environ['PYTHONHASHSEED'] = str(SEED)
tf.set_random_seed(SEED) 
random.seed(SEED)
np.random.seed(SEED)
print("Tensorflow Version:", tf.__version__)

Tensorflow Version: 1.15.2


Below we will set the directory where we will store our output model. To ensure the right variables are loaded in our run config function later, our output directory must be in the same directory as our pre-trained bert model directory.

Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

In [0]:
#Large whole word masking BERT pre-trained weights
bert_model_name = 'wwm_uncased_L-24_H-1024_A-16' 

#Where we output the fine tuned model
output_dir = os.path.join(bert_model_name, 'output1')

DATASET = "HatEval" #@param ["HatEval", "AnalyticsVidhya"]

#@markdown Whether or not to use the further pretrained model
FURTHER_PRETRAINED = True #@param {type:"boolean"}
if FURTHER_PRETRAINED == True:

  further_pretrained_model = os.path.join(bert_model_name, 'further_pretrained_model')
  output_dir = os.path.join(further_pretrained_model, 'output1')

#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'csc3002' #@param {type:"string"}
os.environ["GCLOUD_PROJECT"] = "csc3002"

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, output_dir)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output1 *****


<b> If you're not connected to a TPU environment but still want to access GCS bucket - run below: </b>

In [0]:
"""from google.colab import drive
drive.mount('/content/drive')
!gcloud auth activate-service-account --key-file '/content/drive/My Drive/storageCreds.json'
"""

"from google.colab import drive\ndrive.mount('/content/drive')\n!gcloud auth activate-service-account --key-file '/content/drive/My Drive/storageCreds.json'\n"

<b>Setting up Data Based Upon Choice of DATASET</b>


In [0]:
if DATASET == 'HatEval':
  dirc = 'gs://csc3002/hateval2019'

  rawTrain = pd.read_csv(os.path.join(dirc, 'hateval2019_en_train.csv'), sep=',',  index_col = False, encoding = 'utf-8')
  rawTrain.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)
  rawTrain.drop(['TR', 'AG'], inplace = True, axis = 1)

  rawDev = pd.read_csv(os.path.join(dirc, 'hateval2019_en_dev.csv'), sep=',',  index_col = False, encoding = 'utf-8')
  rawDev.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)
  rawDev.drop(['TR', 'AG'], inplace = True, axis = 1)

  rawTest = pd.read_csv(os.path.join(dirc, 'hateval2019_en_test.csv'), sep=',',  index_col = False, encoding = 'utf-8')
  rawTest.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)
  rawTest.drop(['TR', 'AG'], inplace = True, axis = 1)

  imbalanced = False

elif DATASET == "AnalyticsVidhya":
  dirc = 'gs://csc3002/trial'

  rawTrain= pd.read_csv(os.path.join(dirc, 'train_E6oV3lV.csv'),  sep=',',  index_col = False, encoding = 'utf-8')
  
  rawTrain, rawDev = train_test_split(rawTrain, test_size=0.20, random_state=SEED)
  
  rawTest = pd.read_csv(os.path.join(dirc, 'test_tweets_anuFYb8.csv'), sep=',',  index_col = False, encoding = 'utf-8')
  
  imbalanced = True

else:
  raise ValueError('No Valid DATASET chosen')

# Training Data
I've stored all of the data, (train, dev and test),  in my google bucket for ease of access, authentication will have to be provided

In [0]:
!gcloud config set project 'my-project-csc3002'

train = rawTrain.sample(frac=1, random_state = SEED) #Shuffling really helps model performance
train.reset_index(drop = True, inplace = True)
train.id = train.index
pd.set_option('display.max_colwidth', -1)
print("Out of {} tweets in this database, {} are not hate, {} are hate".format(len(train.index), 
                                                      len(train[train['label']==0]),
                                                      len(train[train['label']==1])))

Updated property [core/project].
Out of 9000 tweets in this database, 5217 are not hate, 3783 are hate


<b>Original Dataset </b>

In [0]:
train.head(30)

Unnamed: 0,id,tweet,label
0,0,"How come Allah is not helping you it is up to Christian countries to protect you feed you ,The countries hit by violence from islam take refugees in feed them etcPlease no more explaining about your hard times we are doing our best for uYes there is good and bad every where",0
1,1,"With todays #JalalabadAttack &amp; other vicious attacks claimed by ISIS, I smell a spillover of refugees in Pak again. This time we should not open borders for them. We cant afford terrorists taking undue advantage. Let Americans deal with it.#Afghanistan #Jalalabad",1
2,2,"https://t.co/i9LJDjtGz7Migration greatest threat‚Äô to Austrian security, says top military figure.EU and Europe bitterly divided√∞≈∏‚Äò‚Ä∞major confrontations between the two.Nothing more counterproductive than ‚Äúcenters‚Äù on European territory or euro bribes for migrants.#Visegrad #V4 https://t.co/VnPCTe7opC",0
3,3,When all your friends are out hoe'in and you're stuck at home in a shitty relationship https://t.co/X9oz1Tx7TC,0
4,4,I wonder if rick will make another deal with those crazy ass women ü§î and if that crazy ass nigga will actually hoe Daryl again üòê,1
5,5,Worker Charged With Sexually Molesting Eight Children at Immigrant Shelter https://t.co/D6HcH03nGL via @CitizenTruth_ #realDonaldTrump do something about this disgrace and stop separating children from their parents.,0
6,6,UN seeks new funding pledges for Palestinian refugees... https://t.co/SNJhD1PWxT https://t.co/DlHQ8fc5N6,0
7,7,"If you really wanna know what someone you're fucking thinks about you, make them show you how you're stored in their phone...",0
8,8,Going to make Du'a at the shrine of Imam Reza(AS) for the refugees in Athens.,0
9,9,"Poor kid. Someone wise must have told him, ""When the world gives you lemons, make lemonade."" He listened. His lemonade should now be offered with ICE in abundance. #BuildTheWall #SendThemBack https://t.co/8AM7fgo9ph",0


### Text Preprocessing

The text pre-processing for this project is detailed in the notebook `Text_Preprocessing.ipynb` in the github repo. Below is an import of the repo into the google colab workspace so I can retrieve and use these functions at convenience

Also below is a function which loads whichever dataset I choose to load from my GCS bucket or local system. This will be useful later when I want to quickly load in data without the messy, long-winded code to go along with it.

In [0]:
#@title Text Pre-Processing Options
HASHTAG_SEGMENTATION = True #@param {type:"boolean"}
EMOJI_REPLACEMENT = "Replace_Emoji_v1" #@param ["None", "Replace_Emoji_v1", "Replace_Emoji_v2"]
LEMMATIZE = False #@param {type:"boolean"}
REMOVE_STOPWORDS = False #@param {type:"boolean"}
REMOVE_PUNCTUATION = False #@param {type:"boolean"}

In [0]:
!git clone https://github.com/fionnmcconville/Automatic-Detection-of-Hate-Speech-Online-Using-BERT.git
%cd Automatic-Detection-of-Hate-Speech-Online-Using-BERT 
#!ls
import preprocessing as pre
#Return to original workspace
%cd ..

#Function caller can optionally load two dataframes and combine them
def loadData(data1, data2 = None):

  if data2 is not None:
    frames = [data1,data2]
    data = pd.concat(frames)
  else:
    data = data1
  
  #Replace emoji must be done before basic preprocess otherwise unicode will be wiped out
  #And this function will be ineffective
  if EMOJI_REPLACEMENT == 'Replace_Emoji_v1':
    data['tweet'] = data['tweet'].apply(pre.emojiReplace)

  if EMOJI_REPLACEMENT == 'Replace_Emoji_v2':
    data['tweet'] = data['tweet'].apply(pre.emojiReplace_v2)

  #Must be performed after emoji translation
  data['tweet'] = data['tweet'].apply(pre.preprocess)

  if HASHTAG_SEGMENTATION == True:
    data['tweet'] = data['tweet'].apply(pre.hashtagSegment)

  if REMOVE_PUNCTUATION == True:
    data['tweet'] = data['tweet'].apply(lambda x: pre.remove_punct(x))

  if REMOVE_STOPWORDS == True:
    data['tweet'] = data['tweet'].apply(lambda x: pre.remove_stopwords(x))

  if LEMMATIZE == True:
    data['tweet'] = data['tweet'].apply(lambda x: pre.lemmatizing(x))

  #Remove small sequences that could skew model
  #data = data[data['tweet'].apply(lambda x: len(x) > 10)]
  data.dropna(inplace = True)
  data.reset_index(drop = True, inplace = True) 
  if DATASET == "AnalyticsVidhya"  and len(data.index) < 20000:
    return data
  else:
    #We don't shuffle data when it is the analytics vidhya test set
    data = data.sample(frac = 1, random_state=SEED) # Shuffle data 
  return data

#Testing function
train = loadData(rawTrain)

Cloning into 'Automatic-Detection-of-Hate-Speech-Online-Using-BERT'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 115 (delta 36), reused 53 (delta 19), pack-reused 40[K
Receiving objects: 100% (115/115), 87.26 MiB | 35.78 MiB/s, done.
Resolving deltas: 100% (40/40), done.
/content/Automatic-Detection-of-Hate-Speech-Online-Using-BERT
[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 0.13 seconds)
[33mWriting emoji data to /root/.demoji/codes.json ...[0m
[92m... OK[0m
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
/content


**Cleaned tweet text dataset**

In [0]:
train[:30]

Unnamed: 0,id,tweet,label
479,680,"how come allah is not helping you it is up to christian countries to protect you feed you ,the countries hit by violence from islam take refugees in feed them etcplease no more explaining about your hard times we are doing our best for uyes there is good and bad every where",0
3794,3995,"with todays jalalabad attack and other vicious attacks claimed by isis, i smell a spillover of refugees in pak again. this time we should not open borders for them. we cant afford terrorists taking undue advantage. let americans deal with it.#afghanistan jalalabad",1
2512,2713,"greatest threat to austrian security, says top military figure.eu and europe bitterly dividedmajor confrontations between the two.nothing more counterproductive than centers on european territory or euro bribes for migrants.#visegrad v4",0
7308,7509,when all your friends are out hoe'in and you're stuck at home in a shitty relationship,0
6183,6384,i wonder if rick will make another deal with those crazy ass women thinking face and if that crazy ass nigga will actually hoe daryl again neutral face,1
427,628,worker charged with sexually molesting eight children at immigrant shelter via real donald trump do something about this disgrace and stop separating children from their parents.,0
3040,3241,un seeks new funding pledges for palestinian refugees...,0
7419,7620,"if you really wanna know what someone you're fucking thinks about you, make them show you how you're stored in their phone...",0
4332,4533,going to make du'a at the shrine of imam reza(as) for the refugees in athens.,0
3554,3755,"poor kid. someone wise must have told him, ""when the world gives you lemons, make lemonade."" he listened. his lemonade should now be offered with ice in abundance. build the wall send them back",0


Loading in dev data and specifying global variables

In [0]:
dev = loadData(rawDev)

DATA_COLUMN = 'tweet'
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

print("Size of training data", len(train.index))
print("Size of development data", len(dev.index), '\n')

Size of training data 9000
Size of development data 1000 



# Setting Up BERT Training 

##Custom BERT Repositry For Custom Functionality

Rather than using the official BERT setup, I have instead forked the BERT rpoe and customised it to allow for a more tailor-made approach when evaluating and fine-tuning the model

The`create_model` function in my BERT repo has been edited to allow for multiple <b>Fine-Tuning</b> strategies. Normally, the default function from BERT simply fine-tunes a single layer that will be trained on top of BERT to adapt it to our classification problem. This strategy of using a pre-trained model, then fine-tuning it is called <b>Transfer Learning</b>.

Also the `model_fn` method in my BERT repo provides far more detailed metrics than just accuracy and loss - which is all the default repo provides. It has metrics such as F-Score, AUC, precision and recall; so I can better analyse the performance of different models

In [0]:
!git clone https://github.com/fionnmcconville/bert.git
%cd bert
import run_classifier
import optimization
import tokenization
import modeling
#Return to original workspace
%cd ..

Cloning into 'bert'...
remote: Enumerating objects: 360, done.[K
Receiving objects:   0% (1/360)   Receiving objects:   1% (4/360)   Receiving objects:   2% (8/360)   Receiving objects:   3% (11/360)   Receiving objects:   4% (15/360)   Receiving objects:   5% (18/360)   Receiving objects:   6% (22/360)   Receiving objects:   7% (26/360)   Receiving objects:   8% (29/360)   Receiving objects:   9% (33/360)   Receiving objects:  10% (36/360)   Receiving objects:  11% (40/360)   Receiving objects:  12% (44/360)   Receiving objects:  13% (47/360)   Receiving objects:  14% (51/360)   Receiving objects:  15% (54/360)   Receiving objects:  16% (58/360)   Receiving objects:  17% (62/360)   Receiving objects:  18% (65/360)   Receiving objects:  19% (69/360)   Receiving objects:  20% (72/360)   Receiving objects:  21% (76/360)   Receiving objects:  22% (80/360)   Receiving objects:  23% (83/360)   Receiving objects:  24% (87/360)   Receiving objects:  25% (90/360)   R

## BERT Preprocessing and Setup
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `tweet` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. HS, Not HS

In [0]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for book-keeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

dev_InputExamples = dev.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

Next, we need to preprocess our data so that it matches the data BERT was trained on.


1. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
2. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
3. Map our words to indexes using a vocab file that BERT provides
4. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
5. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

Happily, we don't have to worry about most of these details. It's automated with the below inbuilt functions




Below is a way to retrieve desired BERT parameters, such as it's pre-trained checkpoints and it's vocab file, from my google storage bucket where I've downloaded the uncased LARGE version of bert.

In [0]:
bucket_dir = 'gs://csc3002'
bert_ckpt_dir = os.path.join(bucket_dir, bert_model_name)

#For further pretrained model
if FURTHER_PRETRAINED:
  further_pretrained_model = os.path.join(bert_model_name, 'further_pretrained_model')
  further_pretrained_model = os.path.join(bucket_dir, further_pretrained_model)
  bert_ckpt_file = tf.train.latest_checkpoint(further_pretrained_model)
  print("\nUsing BERT checkpoint from directory:", os.path.join(further_pretrained_model))

else:
  bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
  print("\nUsing BERT checkpoint from directory:", bert_ckpt_dir)

print("\nBERT checkpoint file is:", bert_ckpt_file)

#Setting up BERT config, vocab file and tokenizer - all default from the BERT repo
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
vocab_file = os.path.join(bert_ckpt_dir, "vocab1.txt")
  
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file)


print("\nMake sure that the function loads a checkpoint, if it doesn't an error will be thrown here")
assert bert_ckpt_file is not None, "No BERT checkpoint file loaded"

print("\nUsing vocab file:", vocab_file)
print("\nBelow is an example of the BERT tokenizer in action:")
tokenizer.tokenize("This here's an example of using the BERT tokenizer")


Using BERT checkpoint from directory: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model

BERT checkpoint file is: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/model.ckpt-80000


Make sure that the function loads a checkpoint, if it doesn't an error will be thrown here

Using vocab file: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/vocab1.txt

Below is an example of the BERT tokenizer in action:


['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

In [0]:
# BERT is limited to 512 tokens in length
MAX_SEQ_LENGTH = 256 #@param {type:"slider", min:128, max:512, step:32}

In [0]:
# Convert our train and dev features to InputFeatures that BERT understands.
train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
dev_features = run_classifier.convert_examples_to_features(dev_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


INFO:tensorflow:Writing example 0 of 9000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] how come allah is not helping you it is up to christian countries to protect you feed you , the countries hit by violence from islam take refugees in feed them etc ##ple ##ase no more explaining about your hard times we are doing our best for u ##yes there is good and bad every where [SEP]
INFO:tensorflow:input_ids: 101 2129 2272 16455 2003 2025 5094 2017 2009 2003 2039 2000 3017 3032 2000 4047 2017 5438 2017 1010 1996 3032 2718 2011 4808 2013 7025 2202 8711 1999 5438 2068 4385 10814 11022 2053 2062 9990 2055 2115 2524 2335 2057 2024 2725 2256 2190 2005 1057 23147 2045 2003 2204 1998 2919 2296 2073 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

The Run config will be the same across all evaluation options below for running BERT. In it we define the amount of summary steps, as well as how often we should checkpoint the model

In [0]:
SAVE_CHECKPOINTS_STEPS = 10000 #@param {type:"number"}
SUMMARY_STEPS = 200 #@param {type:"number"}

In [0]:
run_config = tf.compat.v1.estimator.tpu.RunConfig(  
    #I think the output file must be a sub-directory of the main BERT file
    model_dir=OUTPUT_DIR, 
    tf_random_seed=SEED,
    cluster=cluster_resolver,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=SUMMARY_STEPS,    #Shows us summary metrics every 100 steps
        num_shards=8,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
print(run_config.session_config)

allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.38.114.34:8470"
    }
  }
}
isolate_session_state: true



## Fine-Tuning Model

In [0]:
#@title Fine-Tuning Options
FT_MODEL = "BiLTSM" #@param ['Default', 'Multi-Layer Perceptron', 'BiLTSM']
LOSS_FN = "Default" #@param  ['focal_loss','binary_cross_entropy','kld','squared_hinge','hinge', 'Default']
TRAIN_BATCH_SIZE = 32 #@param {type:"slider", min:16, max:32, step:16}

#Must be set to 8 because on a TPU, model will truncate last few entries if they don't fit in the specified batch size
EVAL_BATCH_SIZE = 8 
PREDICT_BATCH_SIZE = 8 

LEARNING_RATE = 0.00002  #@param {type:"slider", min:1e-5, max:5e-5, step:1e-6}
NUM_TRAIN_STEPS = 800 #@param {type:"slider", min:0, max:10000, step:50}
#@markdown The parameters below are not relevant if the FT_MODEL is set to 'Default
FT_LAYERS = 2 #@param {type:"slider", min:2, max:4, step:1}
HIDDEN_SIZE = 256 #@param {type:"slider", min:32, max:382, step:4}
FT_PARAMS = [FT_MODEL, LOSS_FN, FT_LAYERS, HIDDEN_SIZE]


In [0]:
# Maybe have a params dict with model params
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 
                 labels, num_labels, FT_PARAMS):
  
  """ Create a classification model based on BERT """
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=True,  # True if use TPU
  )

  model_type = FT_PARAMS[0]
  loss_type = FT_PARAMS[1]
  num_extra_layers = FT_PARAMS[2]
  h_size = FT_PARAMS[3]


  if model_type == "BiLTSM":
    tf.logging.info("Using Bi-Directional LTSM for Fine-Tuning. %d extra layer" % (num_extra_layers)) 
    #output layer must be rank 3 for biltsm
    output_layer = model.get_sequence_output()
    #Output shape of 4, 246, 1024 which is [batch_size, seq_len, input_size]
    #seq len parameter corresponds to max_time param in bi_dynamic_rnn function

    for layer in range(num_extra_layers):

      #Using different variable scopes, if you want though you can just make the name
      #"hidden" each time and the weights will be shared across layers
      with tf.variable_scope('hidden_{}'.format(layer),reuse=tf.AUTO_REUSE):
        
        #num units in LTSMCell for fw and bw must match
        cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=h_size)
        cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob = 0.9)

        cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=h_size)
        cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob = 0.9)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw, cell_bw=cell_bw, inputs=output_layer, dtype=tf.float32)

        output_layer = tf.concat(outputs,2) #Could be this too

    output_layer = output_layer[:,-1,:] #Flatten output logits to [batch_size, hidden_size]

  elif model_type == "MLP":
    tf.logging.info("Using Multi-Layer Perceptron for Fine-Tuning. %d extra layer" % (num_extra_layers))  
    final_hidden = model.get_pooled_output()
    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=2)
    batch_size = final_hidden_shape[0]
    hidden_size = final_hidden_shape[1]
    
    for layer in range(num_extra_layers):
      with tf.variable_scope('hidden_{}'.format(layer),reuse=tf.AUTO_REUSE):
        h_weights = tf.get_variable("h{}_weights".format(layer), [h_size, hidden_size],
                                      initializer=tf.truncated_normal_initializer(stddev=0.02))
        h_bias = tf.get_variable("h{}_bias".format(layer), [h_size], initializer=tf.zeros_initializer())

        #final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size])
        h_logits = tf.nn.bias_add(tf.matmul(final_hidden, h_weights, transpose_b=True), h_bias)
        h_logits = tf.nn.relu(h_logits)
        #Dropout after activation
        if is_training:       
          h_logits = tf.nn.dropout(h_logits, rate=0.1) #not sure if this is needed
        
        #Reset values to reflect current last layer
        final_hidden = h_logits
        hidden_size = h_logits.shape[-1].value
    
    output_layer = final_hidden # Output layer for MLP

  # Revert to default BERT Fine-Tuning
  else:
    tf.logging.info("\nUsing original BERT Model for Fine-Tuning\n") 
    output_layer = model.get_pooled_output() #Output layer for default BERT

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02)) 

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("output"):
    if is_training and model_type != "BiLTSM": #We already do dropout in BiLTSM loop
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, rate=0.1) 

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1 )
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32 )

    if loss_type == 'focal_loss':
      # Focal loss (Set default focal loss gamma to 2)
      per_example_loss = -one_hot_labels * ((1 - probabilities) ** 2) * log_probs
      per_example_loss = tf.reduce_sum(per_example_loss, axis=1)

    elif loss_type == 'binary_cross_entropy':
        per_example_loss = tf.keras.losses.binary_crossentropy(y_true=one_hot_labels,
                                                                          y_pred=probabilities)
    elif loss_type == 'kld':
        per_example_loss = tf.keras.metrics.KLDivergence(y_true=one_hot_labels,
                                                    y_pred=probabilities)
    elif loss_type == 'squared_hinge':
        per_example_loss = tf.keras.losses.squared_hinge(y_true=one_hot_labels,
                                                              y_pred=probabilities)
    elif loss_type == 'hinge':
        per_example_loss = tf.keras.metrics.hinge(y_true=one_hot_labels,
                                                      y_pred=probabilities)
    else:   # Fallback to cross-entropy
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)

    loss = tf.reduce_mean(per_example_loss)
  return loss, per_example_loss, log_probs, probabilities

In [0]:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu, ft_params):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    is_real_example = None
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    loss_types = ['focal_loss','binary_cross_entropy','kld','squared_hinge','hinge', 'cross-entropy']
    model_types = ['Default', 'Multi-Layer Perceptron', 'BiLTSM']
    loss_type = ft_params[1]
    tf.logging.info("\nUsing loss type:%s" % (loss_type)) 

    (total_loss, per_example_loss, log_probs, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels,
        FT_PARAMS)
    
    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:

      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(per_example_loss, label_ids, log_probs, is_real_example):
        predictions = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

        accuracy = tf.compat.v1.metrics.accuracy(labels=label_ids, predictions=predictions, weights=is_real_example)
        loss = tf.compat.v1.metrics.mean(values=per_example_loss, weights=is_real_example)
        f1_score = tf.contrib.metrics.f1_score(label_ids, predictions)
        auc = tf.compat.v1.metrics.auc(label_ids, predictions)
        recall = tf.compat.v1.metrics.recall(label_ids, predictions)
        precision = tf.compat.v1.metrics.precision(label_ids, predictions)
        true_pos = tf.compat.v1.metrics.true_positives(label_ids, predictions)
        true_neg = tf.compat.v1.metrics.true_negatives(label_ids, predictions)
        false_pos = tf.compat.v1.metrics.false_positives(label_ids, predictions)  
        false_neg = tf.compat.v1.metrics.false_negatives(label_ids, predictions)
        return {
            "eval_accuracy": accuracy,
            "eval_loss": loss,
            "F1_Score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

      eval_metrics = (metric_fn, [per_example_loss, label_ids, log_probs, is_real_example])

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"probabilities": probabilities},
          scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn

In [0]:
# Warmup is a period of time where the learning rate 
#is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
num_warmup_steps = int(NUM_TRAIN_STEPS * WARMUP_PROPORTION)

print("The model will stop training when it reaches", NUM_TRAIN_STEPS, "as a checkpoint")
print("\nThe bert checkpoint directory is", bert_ckpt_dir)
print("\nThe output directory is", OUTPUT_DIR, '\n')

#This is the model function, which feeds in the bert configurations, the pretrained model itself and the parameters for the fine tuning of the model
"""model_fn = run_classifier.model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=NUM_TRAIN_STEPS,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  )"""

model_fn = model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=NUM_TRAIN_STEPS,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  ft_params = FT_PARAMS)


#We use Tensorflow estimators to train, evaluate and test our model
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

The model will stop training when it reaches 800 as a checkpoint

The bert checkpoint directory is gs://csc3002/wwm_uncased_L-24_H-1024_A-16

The output directory is gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output1 

INFO:tensorflow:Using config: {'_model_dir': 'gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output1', '_tf_random_seed': 3060, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.38.114.34:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, 

Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator.

This is a pretty standard design pattern for working with Tensorflow Estimators

In [0]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True,)

# Input function for dev data, we feed in our previously created dev_features for this
test_input_fn = run_classifier.input_fn_builder(
    features=dev_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)

# Train Model
<b>Now we train our BERT fine-tuned model

In [0]:
print("\nThe model will stop training when it reaches", num_train_steps, "as a checkpoint")
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=NUM_TRAIN_STEPS)
train_time = datetime.now() - current_time
print("Training took time ", train_time)


The model will stop training when it reaches 843 as a checkpoint
Beginning Training!
INFO:tensorflow:Querying Tensorflow master (grpc://10.46.146.242:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 14212402838772463947)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 2645688729414921122)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 4430974932734288524)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 1719957632343019041)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 17179869184,

KeyboardInterrupt: ignored

# Evaluate Model

We now evaluate the performance of our model on the development data

In [0]:
#You need to provide number of steps for a TPU
eval_steps = int(len(dev_features) / EVAL_BATCH_SIZE)

#Eval will be slightly WRONG on the TPU because it will drop the last batch (drop_remainder = True).
estimator.evaluate(input_fn=test_input_fn, steps=eval_steps)

INFO:tensorflow:Querying Tensorflow master (grpc://10.47.247.18:8470) for TPU system metadata.


INFO:tensorflow:Querying Tensorflow master (grpc://10.47.247.18:8470) for TPU system metadata.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 16348832557802236257)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 16348832557802236257)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 9442498558503587591)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 9442498558503587591)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 1819677390197866551)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 1819677390197866551)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 9397544896479164317)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 9397544896479164317)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 2302517781714340699)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 2302517781714340699)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 11410651471969234772)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 11410651471969234772)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 15075635063155519615)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 15075635063155519615)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 14212200204431715904)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 14212200204431715904)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 17179869184, 5287456729480160148)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 17179869184, 5287456729480160148)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 8589934592, 3443115717158911620)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 8589934592, 3443115717158911620)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 13313383819007549821)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 13313383819007549821)


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:*** Features ***


INFO:tensorflow:*** Features ***


INFO:tensorflow:  name = input_ids, shape = (1, 256)


INFO:tensorflow:  name = input_ids, shape = (1, 256)


INFO:tensorflow:  name = input_mask, shape = (1, 256)


INFO:tensorflow:  name = input_mask, shape = (1, 256)


INFO:tensorflow:  name = label_ids, shape = (1,)


INFO:tensorflow:  name = label_ids, shape = (1,)


INFO:tensorflow:  name = segment_ids, shape = (1, 256)


INFO:tensorflow:  name = segment_ids, shape = (1, 256)


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2020-03-25T15:48:19Z


INFO:tensorflow:Starting evaluation at 2020-03-25T15:48:19Z


INFO:tensorflow:TPU job name worker


INFO:tensorflow:TPU job name worker


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output/model.ckpt-843


INFO:tensorflow:Restoring parameters from gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output/model.ckpt-843


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Init TPU system


INFO:tensorflow:Init TPU system


INFO:tensorflow:Initialized TPU in 9 seconds


INFO:tensorflow:Initialized TPU in 9 seconds


INFO:tensorflow:Starting infeed thread controller.


INFO:tensorflow:Starting infeed thread controller.


INFO:tensorflow:Starting outfeed thread controller.


INFO:tensorflow:Starting outfeed thread controller.


INFO:tensorflow:Initialized dataset iterators in 0 seconds


INFO:tensorflow:Initialized dataset iterators in 0 seconds


INFO:tensorflow:Enqueue next (125) batch(es) of data to infeed.


INFO:tensorflow:Enqueue next (125) batch(es) of data to infeed.


INFO:tensorflow:Dequeue next (125) batch(es) of data from outfeed.


INFO:tensorflow:Dequeue next (125) batch(es) of data from outfeed.


INFO:tensorflow:Outfeed finished for iteration (0, 0)


INFO:tensorflow:Outfeed finished for iteration (0, 0)


INFO:tensorflow:Evaluation [125/125]


INFO:tensorflow:Evaluation [125/125]


INFO:tensorflow:Stop infeed thread controller


INFO:tensorflow:Stop infeed thread controller


INFO:tensorflow:Shutting down InfeedController thread.


INFO:tensorflow:Shutting down InfeedController thread.


INFO:tensorflow:InfeedController received shutdown signal, stopping.


INFO:tensorflow:InfeedController received shutdown signal, stopping.


INFO:tensorflow:Infeed thread finished, shutting down.


INFO:tensorflow:Infeed thread finished, shutting down.


INFO:tensorflow:infeed marked as finished


INFO:tensorflow:infeed marked as finished


INFO:tensorflow:Stop output thread controller


INFO:tensorflow:Stop output thread controller


INFO:tensorflow:Shutting down OutfeedController thread.


INFO:tensorflow:Shutting down OutfeedController thread.


INFO:tensorflow:OutfeedController received shutdown signal, stopping.


INFO:tensorflow:OutfeedController received shutdown signal, stopping.


INFO:tensorflow:Outfeed thread finished, shutting down.


INFO:tensorflow:Outfeed thread finished, shutting down.


INFO:tensorflow:outfeed marked as finished


INFO:tensorflow:outfeed marked as finished


INFO:tensorflow:Shutdown TPU system.


INFO:tensorflow:Shutdown TPU system.


INFO:tensorflow:Finished evaluation at 2020-03-25-15:49:59


INFO:tensorflow:Finished evaluation at 2020-03-25-15:49:59


INFO:tensorflow:Saving dict for global step 843: F1_Score = 0.7973856, auc = 0.8194964, eval_accuracy = 0.814, eval_loss = 0.86818445, false_negatives = 61.0, false_positives = 125.0, global_step = 843, loss = 0.5969863, precision = 0.74541754, recall = 0.85714287, true_negatives = 448.0, true_positives = 366.0


INFO:tensorflow:Saving dict for global step 843: F1_Score = 0.7973856, auc = 0.8194964, eval_accuracy = 0.814, eval_loss = 0.86818445, false_negatives = 61.0, false_positives = 125.0, global_step = 843, loss = 0.5969863, precision = 0.74541754, recall = 0.85714287, true_negatives = 448.0, true_positives = 366.0


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 843: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output/model.ckpt-843


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 843: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output/model.ckpt-843


INFO:tensorflow:evaluation_loop marked as finished


INFO:tensorflow:evaluation_loop marked as finished


{'F1_Score': 0.7973856,
 'auc': 0.8194964,
 'eval_accuracy': 0.814,
 'eval_loss': 0.86818445,
 'false_negatives': 61.0,
 'false_positives': 125.0,
 'global_step': 843,
 'loss': 0.5969863,
 'precision': 0.74541754,
 'recall': 0.85714287,
 'true_negatives': 448.0,
 'true_positives': 366.0}

### Running Evaluation Whilst Training

Below is a custom function to run evaluation on the fine-tuned BERT model whilst training. The `tf.train_and_evaluate` inbuilt function for tensorflow doesn't work as it hasn't been made compatible with a distributed TPU strategy. 

Thus, instead of being able to evaluate the model in memory, we must save the model graph and metadata to a checkpoint and reload it every n steps we want to run an evaluation.

The function finds the optimum number of steps the fine-tuning should run for based upon F1 Score by testing the trained model to that point against the dev set


In [0]:
#We'll set a large value for train steps because we want to make this model run
#for as long as possible before it finds the optimimum model
hparams = {'train_steps': 3000, 
            'train_batch_size': 32,
            'eval_batch_size': 8,
            'use_tpu': True,
            'num_train_features': len(train_features),
            'num_eval_features': len(dev_features),
           'learning_rate': 2e-5 
            }
if DATASET == 'AnalyticsVidhya':
  hparams['train_steps'] = 12000
          
def load_global_step_from_checkpoint_dir(checkpoint_dir):
  try:
    checkpoint_reader = tf.train.NewCheckpointReader(
        tf.train.latest_checkpoint(checkpoint_dir))
    return checkpoint_reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
  except:  
    return 0

def train_and_evaluate(out_dir, hparams, steps_per_eval):

#Delete prior model graph, checkpoints and eval files to enable consecutive runs, rather than resetting runtime
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
  # Doesn't matter if the directory didn't exist
    pass
  tf.gfile.MakeDirs(OUTPUT_DIR)

  max_steps = hparams['train_steps']
  train_batch_size = hparams['train_batch_size']
  eval_batch_size = hparams['eval_batch_size']
  print('\ntrain_batch_size={:d}  eval_batch_size={:d}  max_steps={:d}'.format(
                  train_batch_size,
                  eval_batch_size,
                  max_steps))

  config = tf.contrib.tpu.RunConfig(
    cluster=cluster_resolver,
    model_dir=out_dir,
    save_checkpoints_steps=steps_per_eval,
    tpu_config=tf.contrib.tpu.TPUConfig(
      iterations_per_loop=steps_per_eval,
      per_host_input_for_training=True))

  model_fn = run_classifier.model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=hparams['learning_rate'],
  num_train_steps=NUM_TRAIN_STEPS,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)

  estimator = tf.contrib.tpu.TPUEstimator(  # TPU change 4
    model_fn=model_fn,
    config=config,
    params=hparams,
    model_dir=out_dir,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    use_tpu=True
  )
 # load last checkpoint and start from there
  current_step = load_global_step_from_checkpoint_dir(out_dir)
  steps_per_epoch = hparams['num_train_features'] // train_batch_size
  print('\nTraining for {:d} steps ({:2f} epochs in total). Current'
                  ' step {:d}.'.format(
                  max_steps,
                  max_steps / steps_per_epoch,
                  current_step))

  start_timestamp = time.time()  # This time will include compilation time
  best_score = 0
  best_model = 0
  while current_step < max_steps:
    # Train for up to steps_per_eval number of steps.
    # At the end of training, a checkpoint will be written to --model_dir.
    next_checkpoint = min(current_step + steps_per_eval, max_steps)
    estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
    current_step = next_checkpoint
    print('\nFinished training up to step {:d}. Elapsed seconds {:d}.\n'.format(
                    next_checkpoint, int(time.time() - start_timestamp)))

    print('\nStarting to evaluate at step {:d} \n'.format(next_checkpoint))
    eval_results = estimator.evaluate(
      input_fn=test_input_fn,
      steps=hparams['num_eval_features'] // eval_batch_size)
    print('\nEval results at step {:d}: \n'.format(next_checkpoint), eval_results)
    
    current_score = eval_results['F1_Score']
    if current_score > best_score:
      best_score = current_score 
      best_model = current_step
      score_buffer = [] #Reset buffer
    else:
      score_buffer.append(current_score)
    #If 3 times in a row evaluation results haven't improved; we stop training
    if len(score_buffer) == 3:
      elapsed_time = int(time.time() - start_timestamp)
      
      print('\nFinished training at step {:d} as there has been no improvement on the previous 3 iterations'.format(current_step),
      '\nElapsed seconds {:d}. \n'.format(elapsed_time), 
      "\nBest model is at step {:d} with the best F-score {:d}".format(best_model, best_score),
      "\nNow edit the protocol buffer file and set the most recent step to", best_model,
            "so this model checkpoint can be loaded using the tf.train.latest_checkpoint function")
      
      return best_model
    

  elapsed_time = int(time.time() - start_timestamp)
  print('\nFinished training up to step {:d}. Elapsed seconds {:d}. \n'.format(max_steps, elapsed_time))
  return best_model
  

Now run the train_and_evaluate function. We can toggle the steps_pereval in the params to control how often we checkpoint and evaluate

In [0]:
best_step = train_and_evaluate(OUTPUT_DIR, hparams, steps_per_eval=1000) # Will return the optimum step for the BERT model
print("\nBest step for model is at", best_step)


train_batch_size=32  eval_batch_size=8  max_steps=12000
INFO:tensorflow:Using config: {'_model_dir': 'gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.103.143.2:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8a7f9e16a0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.103

### Cross Validation Evaluation

Does not provide in depth tensorflow logging but it does provide evaluation at the end. As mentioned above, we combine the provided training and dev files



In [0]:
def bertCV(data, train_batch_size = 32, learn_rate = 2e-5,\
           num_train_steps = 850, folds = 5):

  #Filter out all log messages so console isn't consumed with memory
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

  #Dataframe where grid search results will be stored. Empty to begin with
  eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'] )
  
  k = 1 # Fold counter

  #Stratified K fold ensures the folds are made by preserving the percentage of samples for each class.
  cv = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

  # Sticking within the training dataset for evaluation. Data is the combination of the provided train and dev sets
  for train_index, dev_index in cv.split(data.tweet, data.label): 
    
    #Shuffling again because otherwise the StratifiedKFold function groups a lot of 0's at the start
    training  = data.iloc[train_index]
    training = training.sample(frac = 1, random_state=SEED)
    develop = data.iloc[dev_index]
    develop = develop.sample(frac = 1, random_state=SEED)
    
    """Unlike before where I only one test set and one training set, this time I have K different sets of training and testing.
    Therefore, in each fold I need to get a new set of data and convert it to features each time."""
    
    # Use the InputExample class from BERT's run_classifier code to create examples from the data

    train_InputExamples = training.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                      text_a = x[DATA_COLUMN], 
                                                                      text_b = None, 
                                                                      label = x[LABEL_COLUMN]), axis = 1)

    dev_InputExamples = develop.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                      text_a = x[DATA_COLUMN], 
                                                                      text_b = None, 
                                                                      label = x[LABEL_COLUMN]), axis = 1)
    
    #Convert these examples to features that BERT can interpret
    train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
    dev_features = run_classifier.convert_examples_to_features(dev_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

    #Delete prior model graph, checkpoints and eval files to make room for new model each loop
    try:
      tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except:
    # Doesn't matter if the directory didn't exist
      pass
    tf.gfile.MakeDirs(OUTPUT_DIR)

    num_warmup_steps = int(NUM_TRAIN_STEPS * WARMUP_PROPORTION)

    # Model configs
    """
    model_fn = run_classifier.model_fn_builder(
    bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
    num_labels=len(label_list),
    init_checkpoint=bert_ckpt_file,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    use_one_hot_embeddings=True)
    """

    model_fn = model_fn_builder(
    bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
    num_labels=len(label_list),
    init_checkpoint=bert_ckpt_file,
    learning_rate=LEARNING_RATE,
    num_train_steps=NUM_TRAIN_STEPS,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    ft_params = FT_PARAMS)
    
    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=TRAIN_BATCH_SIZE,
      eval_batch_size=EVAL_BATCH_SIZE,
      predict_batch_size=PREDICT_BATCH_SIZE)
    
    # Create an input function for training. drop_remainder = True for using TPUs.
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)

    #input function for dev data, we feed in our previously created dev_features for this
    dev_input_fn = run_classifier.input_fn_builder(
        features=dev_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=True)
   
    
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps )
    train_time = datetime.now() - current_time
    

    #You need to provide number of steps for a TPU
    eval_steps = int(len(dev_InputExamples) / EVAL_BATCH_SIZE)

    #Eval may be slightly WRONG on the TPU because it will truncate the last batch.
    eval_results = estimator.evaluate(input_fn=dev_input_fn, steps=eval_steps)

    row = pd.Series({'F1 Score': eval_results['F1_Score'], 'auc': eval_results['auc'], 'Accuracy': eval_results['eval_accuracy'],'Precision': eval_results['precision'],'Recall': eval_results['recall'],\
                                    'False Negatives': eval_results['false_negatives'],'False Positives': eval_results['false_positives'],\
                    'True Negatives':eval_results['true_negatives'] ,'True Positives': eval_results['true_positives'], 'Training Time': train_time })
    #row = get_metrics(OUTPUT_DIR, train_time, k)
    row = pd.Series(row, name = 'Fold ' + str(k))

    """Below statement controls for whenever we get a bad fold which results in a model predicting only one class.
    This isn't truly representative of normal performance and can bring down CV score, so we omit model evaluation
    if the below statement is true"""
    if eval_results['false_negatives'] < 1 or eval_results['false_positives'] < 1: 
      print("Classifier predicts one class. Thus not recording this metric as it will skew CV\n")
      #k = k + 1
      continue

    eval_df = eval_df.append(row)
    print("Fold " + str(k) + ":\tF-Score:", eval_df["F1 Score"][k-1])
    print("Training took time ", train_time)
    print('---------------------------------------------------------------------------------------------------------\n')
    k = k + 1 #Increment on fold counter

  row = eval_df.mean(axis = 0)
  row = pd.Series(row, name = 'CV Average')
  eval_df = eval_df.append(row)
  print("\nTraining Batch Size: ", train_batch_size, "\tLearn Rate: ", learn_rate, "\tNumSteps: ", NUM_TRAIN_STEPS)
  display(eval_df)

  return row # Also return row of CV-Average


Basic cross-validation can be performed here

In [0]:
data = loadData(rawTrain, rawDev, params)

CV_Av = bertCV(data, learn_rate = 2e-5, num_train_steps = 850)

TypeError: ignored

#### Cross-Validation of Cross-Validation

Tensorflow 1.x is non-deterministic, which has resulted in the variability between each run to be greater than the difference in performance gained between introductions of different configurations and parameters. This makes it difficult to determine what is the best pre-training, text preprocessing and fine-tuning pipeline to undertake.

To better ensure the reliability of experiments my solution is to have a 5 fold cross-validation of a cross-validated sample of my data which should reduce the variance run to run significantly.

In [0]:
data = loadData(rawTrain, rawDev)

#Stratified K fold ensures the folds are made by preserving the percentage of samples for each class.
folds = 5
cv = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)
eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'])

#This will be a 5-fold CV so the sample each time will be a fifth of the data
i = 1
for __, data_index in cv.split(data.tweet, data.label):
  dat = data.iloc[data_index]
  CV_Av = bertCV(dat, learn_rate = 2e-5, num_train_steps=900)
  CV_Av = pd.Series(CV_Av, name = 'CV Average' + str(i))
  eval_df = eval_df.append(CV_Av)

row = eval_df.mean(axis = 0)
row = pd.Series(row, name = 'BiLTSM')
eval_df1 = pd.read_csv('gs://csc3002/hateval2019/models_eval_df.csv', sep=',',  index_col = 0, encoding = 'utf-8')

eval_df1 = eval_df1.append(row)
eval_df1.to_csv('gs://csc3002/hateval2019/models_eval_df.csv', sep=',',  index = True, encoding = 'utf-8')
eval_df1


Fold 1:	F-Score: 0.7241378426551819
Training took time  0:12:03.859953
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.7256637215614319
Training took time  0:09:59.304479
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.7317072153091431
Training took time  0:10:04.926114
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7586206197738647
Training took time  0:10:01.333893
---------------------------------------------------------------------------------------------------------

Fold 5:	F-Score: 0.8011527061462402
Training took time  0:10:15.034749
---------------------------------------------------------------------------------------------------------


Training Batch Size:  32 	Learn Rate:  2e-05 	NumSteps:  800


Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives
Fold 1,0.724138,0.758621,0.76,42.0,54.0,0.7,0.75,00:12:03.859953,178.0,126.0
Fold 2,0.725664,0.762623,0.7675,45.0,48.0,0.719298,0.732143,00:09:59.304479,184.0,123.0
Fold 3,0.731707,0.770936,0.78,48.0,40.0,0.75,0.714286,00:10:04.926114,192.0,120.0
Fold 4,0.758621,0.788801,0.79,37.0,47.0,0.73743,0.781065,00:10:01.333893,184.0,132.0
Fold 5,0.801153,0.826827,0.8275,30.0,39.0,0.780899,0.822485,00:10:15.034749,192.0,139.0
CV Average,0.748256,0.781562,0.785,40.4,45.6,0.737525,0.759996,00:10:28.891837,186.0,128.0


Fold 1:	F-Score: 0.7398843765258789
Training took time  0:10:57.391297
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.7547169327735901
Training took time  0:10:17.240653
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.7155963182449341
Training took time  0:10:15.866988
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7514792084693909
Training took time  0:10:17.786046
---------------------------------------------------------------------------------------------------------

Fold 5:	F-Score: 0.7283581495285034
Training took time  0:10:22.062096
---------------------------------------------------------------------------------------------------------


Training Batch Size:  32 	Learn Rate:  2e-05 	NumSteps:  800


Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives
Fold 1,0.739884,0.773194,0.775,40.0,50.0,0.719101,0.761905,00:10:57.391297,182.0,128.0
Fold 2,0.754717,0.792488,0.805,48.0,30.0,0.8,0.714286,00:10:17.240653,202.0,120.0
Fold 3,0.715596,0.757697,0.7675,51.0,42.0,0.735849,0.696429,00:10:15.866988,190.0,117.0
Fold 4,0.751479,0.784831,0.79,42.0,42.0,0.751479,0.751479,00:10:17.786046,189.0,127.0
Fold 5,0.728358,0.765709,0.7725,47.0,44.0,0.73494,0.721893,00:10:22.062096,187.0,122.0
CV Average,0.738007,0.774784,0.782,45.6,41.6,0.748274,0.729198,00:10:26.069416,190.0,122.8


Fold 1:	F-Score: 0.7621950507164001
Training took time  0:10:16.814623
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.777108371257782
Training took time  0:10:21.201833
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.782608687877655
Training took time  0:10:21.547542
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7816091179847717
Training took time  0:10:21.263959
---------------------------------------------------------------------------------------------------------

Fold 5:	F-Score: 0.7090908288955688
Training took time  0:10:35.990588
---------------------------------------------------------------------------------------------------------


Training Batch Size:  32 	Learn Rate:  2e-05 	NumSteps:  800


Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives
Fold 1,0.762195,0.796593,0.805,43.0,35.0,0.78125,0.744048,00:10:16.814623,197.0,125.0
Fold 2,0.777108,0.808497,0.815,39.0,35.0,0.786585,0.767857,00:10:21.201833,197.0,129.0
Fold 3,0.782609,0.811269,0.8125,33.0,42.0,0.762712,0.803571,00:10:21.547542,190.0,135.0
Fold 4,0.781609,0.809293,0.81,33.0,43.0,0.759777,0.804734,00:10:21.263959,188.0,136.0
Fold 5,0.709091,0.750916,0.76,52.0,44.0,0.726708,0.692308,00:10:35.990588,187.0,117.0
CV Average,0.762522,0.795314,0.8005,40.0,39.8,0.763406,0.762504,00:10:23.363709,191.8,128.4


Fold 1:	F-Score: 0.7869821190834045
Training took time  0:10:23.567323
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.7803467512130737
Training took time  0:10:42.339975
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.7818180918693542
Training took time  0:10:29.001128
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7714285254478455
Training took time  0:10:29.696910
---------------------------------------------------------------------------------------------------------

Fold 5:	F-Score: 0.7861270904541016
Training took time  0:10:29.761172
---------------------------------------------------------------------------------------------------------


Training Batch Size:  32 	Learn Rate:  2e-05 	NumSteps:  800


Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives
Fold 1,0.786982,0.815569,0.82,36.0,36.0,0.786982,0.786982,00:10:23.567323,195.0,133.0
Fold 2,0.780347,0.808499,0.81,34.0,42.0,0.762712,0.798817,00:10:42.339975,189.0,135.0
Fold 3,0.781818,0.812808,0.82,39.0,33.0,0.796296,0.767857,00:10:29.001128,199.0,129.0
Fold 4,0.771429,0.800493,0.8,33.0,47.0,0.741758,0.803571,00:10:29.696910,185.0,135.0
Fold 5,0.786127,0.814245,0.815,32.0,42.0,0.764045,0.809524,00:10:29.761172,190.0,136.0
CV Average,0.781341,0.810323,0.813,34.8,40.0,0.770359,0.79335,00:10:30.873301,191.6,133.6


Fold 1:	F-Score: 0.7820895314216614
Training took time  0:10:37.546774
---------------------------------------------------------------------------------------------------------



# Testing Model

## Adding in Back-Translated Hate Speech Tweets as Extra Data

We have very few instances of hate speech labelled in this dataset. To remedy this I performed back_translation augmentation on this training set.

Below I load in in the extra hate speech tweets I created via back-translation augmentation I performed in another colab notebook and I append it to the existing dataframe

In [0]:
"""dat = '/content/drive/My Drive/hateval2019/backtranslated_hatEval.txt' 
dat = pd.read_csv(dat, sep = '\t', names = ['tweet'], header = None, encoding = 'utf-8')
pd.set_option('display.max_colwidth', -1)
dat = dat.astype(str)
dat.head(50)"""

"dat = '/content/drive/My Drive/hateval2019/backtranslated_hatEval.txt' \ndat = pd.read_csv(dat, sep = '\t', names = ['tweet'], header = None, encoding = 'utf-8')\npd.set_option('display.max_colwidth', -1)\ndat = dat.astype(str)\ndat.head(50)"

**See how the english is a little off?** 

That's because these are the hate speech tweets in the training set translated to french, then translated back again. This creates a whole new, yet similar set of hate speech tweets to train on. (Slightly augmented text)

In [0]:
"""print("There are", len(dat.index), "tweets")
dat = dat[dat['tweet'].apply(lambda x: len(x) > 10)]
print("There are now", len(dat.index), "tweets")
dat.head()"""

'print("There are", len(dat.index), "tweets")\ndat = dat[dat[\'tweet\'].apply(lambda x: len(x) > 10)]\nprint("There are now", len(dat.index), "tweets")\ndat.head()'

<b>Rather than creating 3768 extra tweets, 19630 extra have been created. The tweets have been incorrectly parsed. Removing some tweets with a smaller length may mitigate this effect somewhat by removing tweets that were cut in half</b>

Let's see if it helps by adding it to the original training set and testing it against our dev data

In [0]:
"""dat['label'] = 1
dat['id'] = 80000
frames = [dat,data]
data = pd.concat(frames)
print(data.info())
data.head()"""

"dat['label'] = 1\ndat['id'] = 80000\nframes = [dat,data]\ndata = pd.concat(frames)\nprint(data.info())\ndata.head()"

We'll shuffle the dataframe to make sure there's no funny business with the training of the model and we'll then reset the id field to make it unique and sequential for each row

In [0]:
"""data = data.sample(frac=1, random_state = SEED)
data.reset_index(drop = True, inplace = True)

data['id'] = data.reset_index().index + 1
print(data.label.value_counts(), "\n")
print(data.info())
length = len(data.index)
print("\nNow there are", length , "tweets total in this database")
data.tail(10)"""

'data = data.sample(frac=1)\ndata.reset_index(drop = True, inplace = True)\n\ndata[\'id\'] = data.reset_index().index + 1\nprint(data.label.value_counts(), "\n")\nprint(data.info())\nlength = len(data.index)\nprint("\nNow there are", length , "tweets total in this database")\ndata.tail(10)'

<b>Training with both dev and training set. Then Testing with the holdout test set</b>

Loading in train and test data...

In [0]:
train = loadData(rawTrain, rawDev)

test = loadData(rawTest)
test.head()

Unnamed: 0,id,tweet,label
2317,31035,anyone whoever doubted louis & said he couldnt sing go listen to back to you & if u dont change ur mind ur just a bitter bitch,0
2867,34114,bitch i was fuckn up till 4:30 but your hoe ass didn't text me back. pouting face,1
1378,34255,this is nancynancy called my pay raise crumbs nancy doesn't want to fund the military nancy puts illegal aliens rights before citizen rights.nancy wants to house the illegals before our homeless veterans don't be a nancy. trump wednesday wisdom maga,0
2281,34280,well bitch tell me how you download viss,0
1481,33582,"tx: man arrested trying to get into house with knife - heriberto coronado, 28, is alleged to have held a knife to a female victim's throat at one point as well. ""he was also named in a detainer on an immigration charge."" deport them all",0


Function to get predictions on test data

In [0]:
def getPrediction(in_sentences):
  #Makes output less verbose
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

  labels = [0, 1]
  input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)
  predictions = list(estimator.predict(predict_input_fn))

  #Initialise empty predicted labels array
  predicted_classes = [None] * len(predictions)

  #Use a for loop to iterate through probabilities and for each prediction assign a label
  #corresponding to which label has the highest probability
  for i in range(0, len(predictions)):
    if predictions[i]['probabilities'][0] > predictions[i]['probabilities'][1]:
      predicted_classes[i] = 0
    else:
      predicted_classes[i] = 1
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) # Reset tensorflow verboisty to normal

  return predicted_classes

<b> Converting to features, setting run and model configs.

Then training on train and dev set and predicting on unseen test set </b>

In [0]:
if DATASET == "HatEval":

  SAVE_CHECKPOINTS_STEPS = 1000
  run_config = tf.compat.v1.estimator.tpu.RunConfig(
      #I think the output file must be a sub-directory of the main BERT file
      model_dir=OUTPUT_DIR,
      tf_random_seed=SEED, 
      cluster=cluster_resolver,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=200,
          num_shards=8,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                        text_a = x[DATA_COLUMN], 
                                                                        text_b = None, 
                                                                        label = x[LABEL_COLUMN]), axis = 1)

  train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


  #Delete prior model graph, checkpoints and eval files to make room for new model each loop
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
  # Doesn't matter if the directory didn't exist
    pass
  tf.gfile.MakeDirs(OUTPUT_DIR)

  # Compute # warmup steps
  num_warmup_steps = int(NUM_TRAIN_STEPS * WARMUP_PROPORTION)

  # Model configs
  
  """
  model_fn = run_classifier.model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)
  """
  model_fn = model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=NUM_TRAIN_STEPS,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  ft_params = FT_PARAMS)
    
  estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=8,
    predict_batch_size=8)

  # Create an input function for training. drop_remainder = True for using TPUs.
  train_input_fn = run_classifier.input_fn_builder(
      features=train_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

  print("\nThe model will stop training when it reaches", num_train_steps, "as a checkpoint")

  print(f'Beginning Training!')
  current_time = datetime.now()
  estimator.train(input_fn=train_input_fn, max_steps=NUM_TRAIN_STEPS)
  train_time = datetime.now() - current_time
  print("Training took time ", train_time)

  predictions = getPrediction(test.tweet)
  test['predictions'] = predictions

  test.to_csv('gs://csc3002/hateval2019/predictions.csv', sep=',',  index = True, encoding = 'utf-8')
  print("\n\nF1 Score:", metrics.f1_score(test.label, test.predictions))
  print("Accuracy", metrics.accuracy_score(test.label, test.predictions))

elif DATASET == "AnalyticsVidhya":
  
  SAVE_CHECKPOINTS_STEPS = 10000
  run_config = tf.compat.v1.estimator.tpu.RunConfig(
      #I think the output file must be a sub-directory of the main BERT file
      model_dir=OUTPUT_DIR,
      tf_random_seed=SEED, 
      cluster=cluster_resolver,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=200,
          num_shards=8,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                        text_a = x[DATA_COLUMN], 
                                                                        text_b = None, 
                                                                        label = x[LABEL_COLUMN]), axis = 1)

  train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


  #Delete prior model graph, checkpoints and eval files to make room for new model each loop
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
  # Doesn't matter if the directory didn't exist
    pass
  tf.gfile.MakeDirs(OUTPUT_DIR)

  # Compute # warmup steps
  num_warmup_steps = int(NUM_TRAIN_STEPS * WARMUP_PROPORTION)

  """ Model Configs"""

  model_fn = model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=NUM_TRAIN_STEPS,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  ft_params = FT_PARAMS)

  estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=8,
    predict_batch_size=8)

  # Create an input function for training. drop_remainder = True for using TPUs.
  train_input_fn = run_classifier.input_fn_builder(
      features=train_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

  print("\nThe model will stop training when it reaches", NUM_TRAIN_STEPS, "as a checkpoint")

  print(f'Beginning Training!')
  current_time = datetime.now()
  estimator.train(input_fn=train_input_fn, max_steps=NUM_TRAIN_STEPS)
  train_time = datetime.now() - current_time
  print("Training took time ", train_time)

  predictions = getPrediction(test.tweet)
  test['label'] = predictions
  print(test.label.value_counts())
  print(predictions[0:20])
  test.drop(columns = ['tweet'], axis = 1,inplace = True)
  test.to_csv('gs://csc3002/trial/submission.csv', sep=',', index = False)


INFO:tensorflow:Writing example 0 of 31962
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] bi ##h day fl ##or . . . the best master mixer in the world . . . . . . x ##ox ##o fe ##li ##z cum ##ple ao ##s fl ##or . . . . el . . . [SEP]
INFO:tensorflow:input_ids: 101 12170 2232 2154 13109 2953 1012 1012 1012 1996 2190 3040 23228 1999 1996 2088 1012 1012 1012 1012 1012 1012 1060 11636 2080 10768 3669 2480 13988 10814 20118 2015 13109 2953 1012 1012 1012 1012 3449 1012 1012 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 

AttributeError: ignored

## Error Analysis

In [0]:
test.head(40)

Unnamed: 0,id,tweet,label,predictions
0,0,anyone whoever doubted louis & said he couldnt sing go listen to back to you & if u dont change ur mind ur just a bitter bitch,0,1
1,1,bitch i was fuckn up till 4:30 but your hoe ass didn't text me back. pouting face,1,1
2,2,this is nancynancy called my pay raise crumbs nancy doesn't want to fund the military nancy puts illegal aliens rights before citizen rights.nancy wants to house the illegals before our homeless veterans don't be a nancy. trump wednesday wisdom maga,0,1
3,3,well bitch tell me how you download viss,0,1
4,4,"tx: man arrested trying to get into house with knife - heriberto coronado, 28, is alleged to have held a knife to a female victim's throat at one point as well. ""he was also named in a detainer on an immigration charge."" deport them all",0,0
5,5,whatever it takes. take the money out of welfare payments to the lazy and use that. take it from aid to mexico and south and central america. just build that wall .,1,1
6,6,this bitch got one time to fuck wit me or my dad n ima let her whore ass know,1,1
7,7,ban rape fugees ban islam in the us ten-year-old german boy raped by migrant classmates on school trip via,1,1
8,8,when hoe's talk shit about me i started to get mad like hell bitch i can make your life like a living hell don't play with me!!,1,1
9,9,"and anyone who says otherwise is either a grade-a asshole or has no idea what they are talking about. being depressed is a bitch, but...",0,0


<b>Our test dataset is supervised for HatEval so perhaps we can do some analysis here by peeking at where we went wrong</b>

In [0]:
if DATASET == "HatEval":
  def get_missed_HS(df):
    df = df.loc[(df["label"] == 1) & (df["predictions"] == 0)]
    return df

  def get_misclassified_HS(df):
    df = df.loc[(df["label"] == 0) & (df["predictions"] == 1)]
    return df
  #Perhaps having a separate function for both is a good idea

  df = get_missed_HS(test)
  df.to_csv('gs://csc3002/hateval2019/falsenegatives.csv', sep=',',  index = True, encoding = 'utf-8')
df.head(50)

Unnamed: 0,id,tweet,label,predictions
17,17,if its gets you upset - hes the best man for the job.#gohome strong borders send them back,1,0
276,276,stephen miller the public charge law has been on the books for 100 years it prevents immigrants from burdening the u.s taxpayer if they cant take care of themselvers or their kids send them back trump maga red nation rising,1,0
282,282,""" it may have to do with plain honest speech. so many politicians have the """"political speech"""" bug. theyre accustomed to stating what people want to hear & not what they need to . like saying it is fucktard ed to believe the mass refugee rape fuge e immigration was good is idiodic""",1,0
285,285,morning. twitter is being anal and trying to be a safe space for terminally offended. don't say cunt fuck bitch wanker ... cunts,1,0
367,367,"""everyone's condemning miller with uncles article, yet no one mentioning in 1st para """"he set foot on ellis island on january 7, 1903, """" he came & sent for family legally! big differences than people overstaying visas, lying to get in, and crashing borders!#sendthemback""",1,0
438,438,"""#navy rescued manchester bomber salman abedi from war-torn libya three years before deadly terror attack via west is """"rescuing"""" people that will kill them.#sendthemhome bombing attack have some dignity and throw them out""",1,0
490,490,there are no innocent people in detention centres send them back,1,0
543,543,"""please don't call it """"rescue"""" - it's human trafficking ports closed send them back benefit seekers",1,0
741,741,"pouting face ragnar, the polygamous bitch ass cunt",1,0
879,879,1. never said you were taught. 2. you called me bitch first. 3. i don't fuckin,1,0


In [0]:
if DATASET == "HatEval":
  df = get_misclassified_HS(test)
  df.to_csv('gs://csc3002/hateval2019/falsepositives.csv', sep=',',  index = True, encoding = 'utf-8')
df.head(50)

Unnamed: 0,id,tweet,label,predictions
0,0,anyone whoever doubted louis & said he couldnt sing go listen to back to you & if u dont change ur mind ur just a bitter bitch,0,1
2,2,this is nancynancy called my pay raise crumbs nancy doesn't want to fund the military nancy puts illegal aliens rights before citizen rights.nancy wants to house the illegals before our homeless veterans don't be a nancy. trump wednesday wisdom maga,0,1
3,3,well bitch tell me how you download viss,0,1
10,10,"your shoes are ugly and anarcho primitivism sucks, bitch",0,1
11,11,this is a pretty sight build that wall,0,1
21,21,"what is the thinking behind veterans against trump ?#potus has reformed & improved the va, while hundreds died during obama reign waiting for treatment.they can think what they want, but let's be rational.#maga kag potus trump news vote red no daca walk away red wave",0,1
22,22,"y'all: ""you a hoe"" hoe me: ""that's hoe culture to you bitch""",0,1
25,25,i have an idea. build that wall detention centers would be reduced dramatically if not eliminated. problem solved.,0,1
28,28,bitch you the one that said you wanted to play,0,1
30,30,"my nail girl was hyping me up the whole time ""i love your hair"" ""i love your eyelashes"" ""i love your outfit"" like yass bitch, here's ya tip",0,1


In [0]:
df

Unnamed: 0,id,tweet,label,predictions
0,0,anyone whoever doubted louis & said he couldnt sing go listen to back to you & if u dont change ur mind ur just a bitter bitch,0,1
2,2,this is nancynancy called my pay raise crumbs nancy doesn't want to fund the military nancy puts illegal aliens rights before citizen rights.nancy wants to house the illegals before our homeless veterans don't be a nancy. trump wednesday wisdom maga,0,1
3,3,well bitch tell me how you download viss,0,1
10,10,"your shoes are ugly and anarcho primitivism sucks, bitch",0,1
11,11,this is a pretty sight build that wall,0,1
...,...,...,...,...
2986,2986,i hope you are not offering them a choice. to hell with the tories send them home,0,1
2988,2988,illegal alien is the correct term. there are no illegal immigrants. there are only legal immigrants. if you snuck into the usa then you are a criminal illegal alien. illegal aliens illegal immigration build the wall,0,1
2990,2990,"amen!finally, we have a puts americans first & our veterans first not a few really rich kneelers!#buildthatwall boycott nike trump2020 trump train trump army trump ville america first",0,1
2991,2991,trump: pentagon will build wall!trump says he could use the military to build his wall if congress won't fund it through homeland security's budget build that wall build the wall,0,1


# Using Tensorboard to Get Deeper Insight

In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip   #Downloads file to google drive

--2020-02-12 18:59:42--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 34.233.35.85, 3.229.196.117, 34.193.139.214, ...
Connecting to bin.equinox.io (bin.equinox.io)|34.233.35.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13773305 (13M) [application/octet-stream]
Saving to: ‚Äòngrok-stable-linux-amd64.zip.1‚Äô


2020-02-12 18:59:43 (34.5 MB/s) - ‚Äòngrok-stable-linux-amd64.zip.1‚Äô saved [13773305/13773305]

Archive:  ngrok-stable-linux-amd64.zip
replace ngrok? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ngrok                   


In [0]:
def get_tensorboard(path_to_event_file = OUTPUT_DIR):
  get_ipython().system_raw('tensorboard --logdir {} --host 0.0.0.0 --port 6006 --reload_multifile=true &'
.format(path_to_event_file))
  
  get_ipython().system_raw('./ngrok http 6006 &')

  !curl -s http://localhost:4040/api/tunnels | python3 -c \
      "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

get_tensorboard(OUTPUT_DIR)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/lib/python3.6/json/__init__.py", line 299, in load
    parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
  File "/usr/lib/python3.6/json/__init__.py", line 354, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python3.6/json/decoder.py", line 339, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib/python3.6/json/decoder.py", line 357, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
