In [0]:
# Copyright 2019 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<a href="https://colab.research.google.com/github/kpe/bert-for-tf2/blob/master/examples/tpu_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Detecting Hate speech Tweets with BERT

We are using bert-tensorflow for this classification task. At the moment I'm making sure it's tensorflow version 1.x because tensorflow version 2 gives issues with Bert at the moment. I believe Tensorflow hopes to have this issue resolved in tensorflow v 2.1

We are using a TPU as a GPU does not have the required memory for Large BERT models- it can only cope with the base model. We'll see if there a TPU detected and we'll set it to a global environment variable so it can be accessed by our BERT functions later.

In [2]:
!pip install gcsfs 
import pandas as pd
import numpy as np

#Make sure to use tensorflow version 1.x, version 2 doesn't work with bert
%tensorflow_version 1.x
import tensorflow as tf
import os

#For cross-validation and grid search
from itertools import product
from tensorflow.python.summary.summary_iterator import summary_iterator
from google.cloud import storage
import ipywidgets as widgets
from IPython.display import display

import sklearn
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import metrics


import html
import re
import json
import pprint
import random
import string
import nltk
from datetime import datetime
import time


assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

#Below we give ourselves as well as the TPU access to our private GCS bucket
from google.colab import auth
auth.authenticate_user()
tf.reset_default_graph()  
with tf.Session(TPU_ADDRESS) as session:
  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

USE_TPU=True
try:
  #tf.config.experimental_connect_to_host(TPU_ADDRESS)
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_ADDRESS)
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
except Exception as ex:
  print(ex)
  USE_TPU=False

print("        USE_TPU:", USE_TPU)
print("Eager Execution:", tf.executing_eagerly())

assert not tf.executing_eagerly(), "Eager execution on TPUs have issues currently"

Collecting gcsfs
  Downloading https://files.pythonhosted.org/packages/3e/9f/864a9ff497ed4ba12502c4037db8c66fde0049d9dd0388bd55b67e5c4249/gcsfs-0.6.0-py2.py3-none-any.whl
Installing collected packages: gcsfs
Successfully installed gcsfs-0.6.0
TPU address is grpc://10.1.124.234:8470
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Initializing the TPU system: 10.1.124.234:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Querying Tensorflow master (grpc://10.1.124.234:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores P

Setting a random seed for reproducability of results and checking version of tensorflow

In [3]:
# Setting the graph-level random seed for the default graph. Different than operation level seed
SEED = 3060
tf.reset_default_graph()
os.environ['PYTHONHASHSEED'] = str(SEED)
tf.set_random_seed(SEED) 
random.seed(SEED)
np.random.seed(SEED)
print("Tensorflow Version:", tf.__version__)

Tensorflow Version: 1.15.0


Below we will set the directory where we will store our output model. To ensure the right variables are loaded in our run config function later, our ouput directory must be in the same directory as our pre-trained bert model directory.

Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

In [4]:
#Large whole word masking BERt pre-trained weights
bert_model_name = 'wwm_uncased_L-24_H-1024_A-16' 

#Where we output the fine tuned model
output_dir = os.path.join(bert_model_name, 'output1')

DATASET = 'HatEval' #@param ["HatEval", "AnalyticsVidhya"]

#@markdown Whether or not to use the further pretrained model
FURTHER_PRETRAINED = True #@param {type:"boolean"}
if FURTHER_PRETRAINED == True:

  further_pretrained_model = os.path.join(bert_model_name, 'further_pretrained_model1')
  output_dir = os.path.join(further_pretrained_model, 'output')

#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'csc3002' #@param {type:"string"}
os.environ["GCLOUD_PROJECT"] = "csc3002"

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, output_dir)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/output *****


<b> If you're not connected to a TPU environment but still want to access GCS bucket - run below: </b>

In [5]:
"""from google.colab import drive
drive.mount('/content/drive')
!gcloud auth activate-service-account --key-file '/content/drive/My Drive/storageCreds.json'
"""

"from google.colab import drive\ndrive.mount('/content/drive')\n!gcloud auth activate-service-account --key-file '/content/drive/My Drive/storageCreds.json'\n"

#### Setting up Data Based Upon Choice of DATASET


In [0]:
if DATASET == 'HatEval':
  dirc = 'gs://csc3002/hateval2019'

  rawTrain = pd.read_csv(os.path.join(dirc, 'hateval2019_en_train.csv'), sep=',',  index_col = False, encoding = 'utf-8')
  rawTrain.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)
  rawTrain.drop(['TR', 'AG'], inplace = True, axis = 1)

  rawDev = pd.read_csv(os.path.join(dirc, 'hateval2019_en_dev.csv'), sep=',',  index_col = False, encoding = 'utf-8')
  rawDev.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)
  rawDev.drop(['TR', 'AG'], inplace = True, axis = 1)

  rawTest = pd.read_csv(os.path.join(dirc, 'hateval2019_en_test.csv'), sep=',',  index_col = False, encoding = 'utf-8')
  rawTest.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)
  rawTest.drop(['TR', 'AG'], inplace = True, axis = 1)

  imbalanced = False

elif DATASET == "AnalyticsVidhya":
  dirc = 'gs://csc3002/trial'

  rawTrain= pd.read_csv(os.path.join(dirc, 'train_E6oV3lV.csv'),  sep=',',  index_col = False, encoding = 'utf-8')
  
  rawTrain, rawDev = train_test_split(rawTrain, test_size=0.20, random_state=SEED)
  
  rawTest = pd.read_csv(os.path.join(dirc, 'test_tweets_anuFYb8.csv'), sep=',',  index_col = False, encoding = 'utf-8')
  
  imbalanced = True

else:
  raise ValueError('No Valid DATASET chosen')

#Training Data
I've stored all of the data, (train, dev and test),  in my google bucket for ease of access, authentication will have to be provided

In [7]:
!gcloud config set project 'my-project-csc3002'

train = rawTrain.sample(frac=1, random_state = SEED) #Shuffling really helps model performance
train.reset_index(drop = True, inplace = True)
train.id = train.index
pd.set_option('display.max_colwidth', -1)
print("Out of {} tweets in this database, {} are not hate, {} are hate".format(len(train.index), 
                                                      len(train[train['label']==0]),
                                                      len(train[train['label']==1])))

Updated property [core/project].
Out of 9000 tweets in this database, 5217 are not hate, 3783 are hate


<b>Original Dataset </b>

In [8]:
train.head(30)

Unnamed: 0,id,tweet,label
0,0,"How come Allah is not helping you it is up to Christian countries to protect you feed you ,The countries hit by violence from islam take refugees in feed them etcPlease no more explaining about your hard times we are doing our best for uYes there is good and bad every where",0
1,1,"With todays #JalalabadAttack &amp; other vicious attacks claimed by ISIS, I smell a spillover of refugees in Pak again. This time we should not open borders for them. We cant afford terrorists taking undue advantage. Let Americans deal with it.#Afghanistan #Jalalabad",1
2,2,"https://t.co/i9LJDjtGz7Migration greatest threat’ to Austrian security, says top military figure.EU and Europe bitterly dividedðŸ‘‰major confrontations between the two.Nothing more counterproductive than “centers” on European territory or euro bribes for migrants.#Visegrad #V4 https://t.co/VnPCTe7opC",0
3,3,When all your friends are out hoe'in and you're stuck at home in a shitty relationship https://t.co/X9oz1Tx7TC,0
4,4,I wonder if rick will make another deal with those crazy ass women 🤔 and if that crazy ass nigga will actually hoe Daryl again 😐,1
5,5,Worker Charged With Sexually Molesting Eight Children at Immigrant Shelter https://t.co/D6HcH03nGL via @CitizenTruth_ #realDonaldTrump do something about this disgrace and stop separating children from their parents.,0
6,6,UN seeks new funding pledges for Palestinian refugees... https://t.co/SNJhD1PWxT https://t.co/DlHQ8fc5N6,0
7,7,"If you really wanna know what someone you're fucking thinks about you, make them show you how you're stored in their phone...",0
8,8,Going to make Du'a at the shrine of Imam Reza(AS) for the refugees in Athens.,0
9,9,"Poor kid. Someone wise must have told him, ""When the world gives you lemons, make lemonade."" He listened. His lemonade should now be offered with ICE in abundance. #BuildTheWall #SendThemBack https://t.co/8AM7fgo9ph",0


# Text Preprocessing

The text pre-processing for this project is detailed in the notebook `Text_Preprocessing.ipynb` in the github repo. Below is an import of the repo into the google colab workspace so I can retrieve and use these functions at convenience

Also below is a function which loads whichever dataset I choose to load from my GCS bucket or local system. This will be useful later when I want to quickly load in data without the messy, long-winded code to go along with it.

<i>We'll put in the option for the function to load and combine two datasets, as later we'll use this when we combine training sets and dev sets for the cross-validation sets.</i>

In [9]:
!git clone https://github.com/fionnmcconville/Automatic-Detection-of-Hate-Speech-Online-Using-BERT.git
%cd Automatic-Detection-of-Hate-Speech-Online-Using-BERT 
#!ls
import preprocessing as pre
#Return to original workspace
%cd ..


params = {'replaceEmoji_v2': False, 'replaceEmoji': True, 'segmentHashtag': True,
             'remove_stop': False, 'lemmatize': False, 'remove_punct': False}

#Function caller can optionally load two dataframes and combine them
def loadData(data1, data2 = None, params_dict = params):

  if data2 is not None:
    frames = [data1,data2]
    data = pd.concat(frames)
  else:
    data = data1
  

  #Don't have both replaceEmoji functions as true. Otherwise they'll cancel each other out.
  #We throw an error here if this is the case
  assert not (params_dict['replaceEmoji'] == True and params_dict['replaceEmoji_v2'] == True), "You can't have two emojiReplace functions selected at the same time"

  #Replace emoji must be done before basic preprocess otherwise unicode will be wiped out
  #And this function will be ineffective
  if params_dict['replaceEmoji'] == True:
    data['tweet'] = data['tweet'].apply(pre.emojiReplace)

  if params_dict['replaceEmoji_v2'] == True:
    data['tweet'] = data['tweet'].apply(pre.emojiReplace_v2)

  #Must be performed after emoji translation
  data['tweet'] = data['tweet'].apply(pre.preprocess)

  if params_dict['segmentHashtag'] == True:
    data['tweet'] = data['tweet'].apply(pre.hashtagSegment)

  if params_dict['remove_punct'] == True:
    data['tweet'] = data['tweet'].apply(lambda x: pre.remove_punct(x))

  if params_dict['remove_stop'] == True:
    data['tweet'] = data['tweet'].apply(lambda x: pre.remove_stopwords(x))

  if params_dict['lemmatize'] == True:
    data['tweet'] = data['tweet'].apply(lambda x: pre.lemmatizing(x))


  #data = data[data['tweet'].apply(lambda x: len(x) > 10)] 
  data = data.sample(frac = 1, random_state=SEED)
  data.dropna(inplace = True)
  data.reset_index(drop = True, inplace = True)

  data.id = data.index
  return data

#Testing function
train = loadData(rawTrain, params_dict = params)

Cloning into 'Automatic-Detection-of-Hate-Speech-Online-Using-BERT'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 101 (delta 28), reused 46 (delta 15), pack-reused 40[K
Receiving objects: 100% (101/101), 86.99 MiB | 33.46 MiB/s, done.
Resolving deltas: 100% (32/32), done.
Checking out files: 100% (49/49), done.
/content/Automatic-Detection-of-Hate-Speech-Online-Using-BERT
[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 0.11 seconds)
[33mWriting emoji data to /root/.demoji/codes.json ...[0m
[92m... OK[0m
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
/content


**Cleaned tweet text dataset**

In [10]:
train[:30]

Unnamed: 0,id,tweet,label
0,0,"how come allah is not helping you it is up to christian countries to protect you feed you ,the countries hit by violence from islam take refugees in feed them etcplease no more explaining about your hard times we are doing our best for uyes there is good and bad every where",0
1,1,"with todays jalalabad attack and other vicious attacks claimed by isis, i smell a spillover of refugees in pak again. this time we should not open borders for them. we cant afford terrorists taking undue advantage. let americans deal with it.#afghanistan jalalabad",1
2,2,"greatest threat to austrian security, says top military figure.eu and europe bitterly dividedmajor confrontations between the two.nothing more counterproductive than centers on european territory or euro bribes for migrants.#visegrad v4",0
3,3,when all your friends are out hoe'in and you're stuck at home in a shitty relationship,0
4,4,i wonder if rick will make another deal with those crazy ass women thinking face and if that crazy ass nigga will actually hoe daryl again neutral face,1
5,5,worker charged with sexually molesting eight children at immigrant shelter via real donald trump do something about this disgrace and stop separating children from their parents.,0
6,6,un seeks new funding pledges for palestinian refugees...,0
7,7,"if you really wanna know what someone you're fucking thinks about you, make them show you how you're stored in their phone...",0
8,8,going to make du'a at the shrine of imam reza(as) for the refugees in athens.,0
9,9,"poor kid. someone wise must have told him, ""when the world gives you lemons, make lemonade."" he listened. his lemonade should now be offered with ice in abundance. build the wall send them back",0


# Loading in development data. Also specifying label and text columns

We store the name of the Data column containing the text we wish to classify and the name of the corresponding label column in global variables for ease of access down line and also so this code is generalizable.

Label list is just a 0 or a 1 because the version of BERT we've created below only deals in binary classifcation and labels must be ints

In [11]:
dev = loadData(rawDev)

DATA_COLUMN = 'tweet'
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

print("Size of training data", len(train.index))
print("Size of development data", len(dev.index), '\n')

Size of training data 9000
Size of development data 1000 



## Import Custom BERT Repositry

Instead of importing functions from the official BERT repo I have opted to import these functions from my custom, forked BERT repositry.

This allows for a more custom approach when building the BERT model and fine-tuning it. It also is a much more clean way of organising this project, rather than copying entire functions from the BERT repo in this notebook and modifying them here

The`create_model` function in my BERT repo will be edited later, the default function from BERT simply creates a single layer that will be trained to adapt BERT to our task (i.e. classifying whether a tweet is hate speech or not). This strategy of using a mostly trained model is called <i>fine-tuning</i>.

My hope is that later I will test more complex methods in fine-tuning, such as CNNs, RNNs and large scale LTSMs.

Also the `model_fn` method in my BERT repo is less verbose than the default one, as well as providing far more detailed metrics than just accuracy and loss, such as F - Score, AUC, precision and recall - so I can better analyse the performance of different models

In [12]:
!git clone https://github.com/fionnmcconville/bert.git
%cd bert
import run_classifier
import optimization
import tokenization
import modeling
#Return to original workspace
%cd ..

Cloning into 'bert'...
remote: Enumerating objects: 360, done.[K
Receiving objects:   0% (1/360)   Receiving objects:   1% (4/360)   Receiving objects:   2% (8/360)   Receiving objects:   3% (11/360)   Receiving objects:   4% (15/360)   Receiving objects:   5% (18/360)   Receiving objects:   6% (22/360)   Receiving objects:   7% (26/360)   Receiving objects:   8% (29/360)   Receiving objects:   9% (33/360)   Receiving objects:  10% (36/360)   Receiving objects:  11% (40/360)   Receiving objects:  12% (44/360)   Receiving objects:  13% (47/360)   Receiving objects:  14% (51/360)   Receiving objects:  15% (54/360)   Receiving objects:  16% (58/360)   Receiving objects:  17% (62/360)   Receiving objects:  18% (65/360)   Receiving objects:  19% (69/360)   Receiving objects:  20% (72/360)   Receiving objects:  21% (76/360)   Receiving objects:  22% (80/360)   Receiving objects:  23% (83/360)   Receiving objects:  24% (87/360)   Receiving objects:  25% (90/360)   R

#Data Preprocessing
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `tweet` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. True, False

In [0]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for book-keeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

dev_InputExamples = dev.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):


1. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
2. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
3. Map our words to indexes using a vocab file that BERT provides
4. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
5. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

Happily, we don't have to worry about most of these details. It's automated with the below inbuilt functions




Below is a way to retrieve desired BERT parameters, such as it's pre-trained checkpoints and it's vocab file, from my google storage bucket where I've downloaded the uncased LARGE version of bert.

In [14]:
bucket_dir = 'gs://csc3002'
bert_ckpt_dir = os.path.join(bucket_dir, bert_model_name)

#For further pretrained model
if FURTHER_PRETRAINED:
  further_pretrained_model = os.path.join(bert_model_name, 'further_pretrained_model1')
  further_pretrained_model = os.path.join(bucket_dir, further_pretrained_model)
  bert_ckpt_file = tf.train.latest_checkpoint(further_pretrained_model)
  print("\nUsing BERT checkpoint from directory:", os.path.join(further_pretrained_model))

else:
  bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
  print("\nUsing BERT checkpoint from directory:", bert_ckpt_dir)

print("\nBERT checkpoint file is:", bert_ckpt_file)

#Setting up BERT config, vocab file and tokenizer - all default from the BERT repo
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
vocab_file = os.path.join(bert_ckpt_dir, "vocab.txt")

if params['replaceEmoji_v2'] == True:
  vocab_file = os.path.join(bert_ckpt_dir, "vocab1.txt")
  
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file)


print("\nMake sure that the function loads a checkpoint, if it doesn't an error will be thrown here")
assert bert_ckpt_file is not None, "No BERT checkpoint file loaded"

print("\nUsing vocab file\n", vocab_file)
print("\nBelow is an example of the BERT tokenizer in action")
tokenizer.tokenize("This here's an example of using the BERT tokenizer")


Using BERT checkpoint from directory: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1

BERT checkpoint file is: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/model.ckpt-40000


Make sure that the function loads a checkpoint, if it doesn't an error will be thrown here

Using vocab file
 gs://csc3002/wwm_uncased_L-24_H-1024_A-16/vocab.txt

Below is an example of the BERT tokenizer in action


['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

In [15]:
# BERT is limited to 512 tokens in length
MAX_SEQ_LENGTH = 256
# Convert our train and dev features to InputFeatures that BERT understands.
train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
dev_features = run_classifier.convert_examples_to_features(dev_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


INFO:tensorflow:Writing example 0 of 9000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] how come allah is not helping you it is up to christian countries to protect you feed you , the countries hit by violence from islam take refugees in feed them etc ##ple ##ase no more explaining about your hard times we are doing our best for u ##yes there is good and bad every where [SEP]
INFO:tensorflow:input_ids: 101 2129 2272 16455 2003 2025 5094 2017 2009 2003 2039 2000 3017 3032 2000 4047 2017 5438 2017 1010 1996 3032 2718 2011 4808 2013 7025 2202 8711 1999 5438 2068 4385 10814 11022 2053 2062 9990 2055 2115 2524 2335 2057 2024 2725 2256 2190 2005 1057 23147 2045 2003 2204 1998 2919 2296 2073 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

The Run config will be the same across all evaluation options below for running BERT. In it we define the amount of summary steps, as well as how often we should checkpoint the model

In [16]:
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
run_config = tf.compat.v1.estimator.tpu.RunConfig(  
    #I think the output file must be a sub-directory of the main BERT file
    model_dir=OUTPUT_DIR, 
    tf_random_seed=SEED,
    cluster=cluster_resolver,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=100,    #Shows us summary metrics every 100 steps
        num_shards=8,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
print(run_config.session_config)

allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.1.124.234:8470"
    }
  }
}
isolate_session_state: true



### Setting up fine tuning model configurations and parameters

In [17]:
# Compute train and warmup steps from batch size
TRAIN_BATCH_SIZE = 32 #recommended 16 or 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5 # Recommended 5e-5, 3e-5 or 2e-5
NUM_TRAIN_EPOCHS = 3.0 # Recommended 2, 3 or 4
MAX_SEQ_LENGTH = 256
# Warmup is a period of time where the learning rate 
#is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1

# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

print("The model will stop training when it reaches", num_train_steps, "as a checkpoint")
print("\nThe bert checkpoint directory is", bert_ckpt_dir)
print("\nThe output directory is", OUTPUT_DIR, '\n')

#This is the model function, which feeds in the bert configurations, the pretrained model itself and the parameters for the fine tuning of the model
model_fn = run_classifier.model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)

#We use Tensorflow estimators to train, evaluate and test our model
#estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

The model will stop training when it reaches 843 as a checkpoint

The bert checkpoint directory is gs://csc3002/wwm_uncased_L-24_H-1024_A-16

The output directory is gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/output 

INFO:tensorflow:Using config: {'_model_dir': 'gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/output', '_tf_random_seed': 3060, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.1.124.234:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '

Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator.

This is a pretty standard design pattern for working with Tensorflow Estimators

In [0]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True,)

# Input function for dev data, we feed in our previously created dev_features for this
test_input_fn = run_classifier.input_fn_builder(
    features=dev_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)

## Simple Train and then Evaluate
<b>Now we train our BERT fine-tuned model

In [0]:
print("\nThe model will stop training when it reaches", num_train_steps, "as a checkpoint")
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)#, hooks=[evaluator])
train_time = datetime.now() - current_time
print("Training took time ", train_time)


The model will stop training when it reaches 840 as a checkpoint
Beginning Training!
INFO:tensorflow:Querying Tensorflow master (grpc://10.8.208.162:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 15396571222743673123)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 3033160054236891001)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 16028321910086008709)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 10458696249324074875)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 17179869184

<b>And now we evaluate the performance of our model on the development data<b>

In [0]:
#You need to provide number of steps for a TPU
eval_steps = int(len(dev_features) / EVAL_BATCH_SIZE)

#Eval will be slightly WRONG on the TPU because it will drop the last batch (drop_remainder = True).
estimator.evaluate(input_fn=test_input_fn, steps=eval_steps)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (1, 256)
INFO:tensorflow:  name = input_mask, shape = (1, 256)
INFO:tensorflow:  name = label_ids, shape = (1,)
INFO:tensorflow:  name = segment_ids, shape = (1, 256)
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.









INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-26T13:31:03Z
INFO:tensorflow:TPU job name worker
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from gs://csc3002/wwm_uncased_L-24_H-1024_A-16/output/model.ckpt-840
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Init TPU system
INFO:tensorflow:Initialized TPU in 9 seconds
INFO:tensorflow:Starting infeed thread controller.
INFO:tensorflow:Starting outfeed thread controller.
INFO:tensorflow:Initialized dataset iterators in 0 seconds
INFO:tensorflow:Enqueue next (124) 

{'F1_Score': 0.77377045,
 'auc': 0.7965411,
 'eval_accuracy': 0.79133064,
 'eval_loss': 1.067598,
 'false_negatives': 71.0,
 'false_positives': 136.0,
 'global_step': 840,
 'loss': 1.1221331,
 'precision': 0.722449,
 'recall': 0.8329412,
 'true_negatives': 431.0,
 'true_positives': 354.0}

## `tf.train_and_evaluate` function for tensorflow

### Setting the Run Config for Tensorflow train_and evaluate

Setting the TF_CONFIG environment variable so we can use the `train_and_evaluate` function. We need to set this because we need to explicitly define the roles of each node in our TPU cluster for this function so training and evaluation can run in concurrence. Otherwise the function will never evaluate the model as it will be too busy using all of the TPU resources to train the model unless told otherwise


In [0]:
#cluster_resolver.cluster_spec().as_dict()  - shows the json for the current cluster as defined by the cluster resolver
"""def _cluster():
    return {'worker': [TPU_ADDRESS,TPU_ADDRESS, TPU_ADDRESS],
             'ps': [TPU_ADDRESS, TPU_ADDRESS],
             'chief': [TPU_ADDRESS]}

def _set_tf_config():
    tf_config = {
            'cluster': _cluster(),
            'task': {'type': 'worker', 'index': 0}}
    os.environ['TF_CONFIG'] = json.dumps(tf_config)

_set_tf_config()
print(os.environ['TF_CONFIG'])

# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
run_config = tf.compat.v1.estimator.tpu.RunConfig(  
    #I think the output file must be a sub-directory of the main BERT file
    model_dir=OUTPUT_DIR, 
    tf_random_seed=SEED,
    cluster=cluster_resolver,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=100,    #Shows us summary metrics every 100 steps
        num_shards=8,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
print(run_config.session_config)

"""

#### Evaluate whilst training classifier and export best performing model

Requires a factory reset runtime after run. The hope is to create a function which exports only the best model. I believe that we need two machines to run training and evaluation in concurrence. I only have one TPU worker in the colab environment.

The function may possibly be finagled into working, howevere many sources on the internet dispute it can work at all, including this medium article https://medium.com/tensorflow/how-to-write-a-custom-estimator-model-for-the-cloud-tpu-7d8bd9068c26

Besides, the hard-coded train_and_evaluate function I have works, it my be slower but it is quicker to just use it, rather than try and find a solution for the the `tf.train_and_evaluate` function - which is not readily available at this time

In [0]:
"""#Below is the serving Input function which is to be used with the BestExporter function of the estimator.
#In the function we define what sort of inputs are expected when we predict the model
def serving_input_fn():
    with tf.compat.v1.variable_scope("foo"):
      feature_spec = {
          "input_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
          "input_mask": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
          "segment_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
          "label_ids": tf.FixedLenFeature([], tf.int64),
        }
      serialized_tf_example = tf.placeholder(dtype=tf.int64,
                                             shape=[1, MAX_SEQ_LENGTH])
      receiver_tensors = {'input_ids': serialized_tf_example}
      features = tf.parse_example(serialized_tf_example, feature_spec)
      return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)

def compare_fn(best_eval_result, current_eval_result):

  #F-Score isn't available, besides AUC is a good metric too.
  default_key = metric_keys.MetricKeys.AUC

  if not best_eval_result or default_key not in best_eval_result:
    raise ValueError(
        'best_eval_result cannot be empty or no loss is found in it.')

  if not current_eval_result or default_key not in current_eval_result:
    raise ValueError(
        'current_eval_result cannot be empty or no loss is found in it.')

  return best_eval_result[default_key] > current_eval_result[default_key]
  

train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=1000)
estimator._export_to_tpu = False
best_exporter = tf.estimator.BestExporter(serving_input_receiver_fn=serving_input_fn, compare_fn = compare_fn, exports_to_keep=5)
exporters = [best_exporter]
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn, start_delay_secs = 10, throttle_secs = 30, exporters=exporters)
#Can only be called once per run as not all gRPC servers from the first call have been closed

#tf.estimator.train_and_evaluate does not seem to work properly with TPUs. 
#It cannot handle the distributed TPU strategy seemingly, or perhaps there are other problems
model = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)"""

'#Below is the serving Input function which is to be used with the BestExporter function of the estimator.\n#In the function we define what sort of inputs are expected when we predict the model\ndef serving_input_fn():\n    with tf.compat.v1.variable_scope("foo"):\n      feature_spec = {\n          "input_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),\n          "input_mask": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),\n          "segment_ids": tf.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),\n          "label_ids": tf.FixedLenFeature([], tf.int64),\n        }\n      serialized_tf_example = tf.placeholder(dtype=tf.int64,\n                                             shape=[1, MAX_SEQ_LENGTH])\n      receiver_tensors = {\'input_ids\': serialized_tf_example}\n      features = tf.parse_example(serialized_tf_example, feature_spec)\n      return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)\n\ndef compare_fn(best_eval_result, current_eval_result):\n\n  #F-Sc

## Running Evaluation Whilst Training - Hard-coded function

Below is a custom function to run evaluation on the fine-tuned BERT model whilst training. A grid search is quite inefficient for obtaining the best model as there are so many different values of steps one can attempt. 

The function below solves this by identifying the optimum number of steps the fine-tuning should run for based upon evaluation metrics by testing the trained model to that point against the dev set

Presently the problem is that every time we want to run an evaluation within training, tensorflow restores the parameters from the most recent checkpoint which has a lot of overhead as BERT large is a huge model. There is a way to overcome this in GPU training with the session hook InMemoryEvaluationHook in conjunction with estimator.train - however this does not work with TPUs. As the amount of data we train on is not that much, this is not a big problem

In [0]:
#We'll set a large value for train steps because we want to make this model run
#for as long as possible before it finds the optimimum model
hparams = {'train_steps': 3000, 
            'train_batch_size': 32,
            'eval_batch_size': 8,
            'use_tpu': True,
            'num_train_features': len(train_features),
            'num_eval_features': len(dev_features),
           'learning_rate': 2e-5 
            }
if DATASET == 'AnalyticsVidhya':
  hparams['train_steps'] = 12000
          
def load_global_step_from_checkpoint_dir(checkpoint_dir):
  try:
    checkpoint_reader = tf.train.NewCheckpointReader(
        tf.train.latest_checkpoint(checkpoint_dir))
    return checkpoint_reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
  except:  
    return 0

def train_and_evaluate(out_dir, hparams, steps_per_eval):

#Delete prior model graph, checkpoints and eval files to enable consecutive runs, rather than resetting runtime
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
  # Doesn't matter if the directory didn't exist
    pass
  tf.gfile.MakeDirs(OUTPUT_DIR)

  max_steps = hparams['train_steps']
  train_batch_size = hparams['train_batch_size']
  eval_batch_size = hparams['eval_batch_size']
  print('\ntrain_batch_size={:d}  eval_batch_size={:d}  max_steps={:d}'.format(
                  train_batch_size,
                  eval_batch_size,
                  max_steps))

  config = tf.contrib.tpu.RunConfig(
    cluster=cluster_resolver,
    model_dir=out_dir,
    save_checkpoints_steps=steps_per_eval,
    tpu_config=tf.contrib.tpu.TPUConfig(
      iterations_per_loop=steps_per_eval,
      per_host_input_for_training=True))

  model_fn = run_classifier.model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=hparams['learning_rate'],
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)

  estimator = tf.contrib.tpu.TPUEstimator(  # TPU change 4
    model_fn=model_fn,
    config=config,
    params=hparams,
    model_dir=out_dir,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    use_tpu=True
  )
 # load last checkpoint and start from there
  current_step = load_global_step_from_checkpoint_dir(out_dir)
  steps_per_epoch = hparams['num_train_features'] // train_batch_size
  print('\nTraining for {:d} steps ({:2f} epochs in total). Current'
                  ' step {:d}.'.format(
                  max_steps,
                  max_steps / steps_per_epoch,
                  current_step))

  start_timestamp = time.time()  # This time will include compilation time
  best_score = 0
  best_model = 0
  while current_step < max_steps:
    # Train for up to steps_per_eval number of steps.
    # At the end of training, a checkpoint will be written to --model_dir.
    next_checkpoint = min(current_step + steps_per_eval, max_steps)
    estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
    current_step = next_checkpoint
    print('\nFinished training up to step {:d}. Elapsed seconds {:d}.\n'.format(
                    next_checkpoint, int(time.time() - start_timestamp)))

    print('\nStarting to evaluate at step {:d} \n'.format(next_checkpoint))
    eval_results = estimator.evaluate(
      input_fn=test_input_fn,
      steps=hparams['num_eval_features'] // eval_batch_size)
    print('\nEval results at step {:d}: \n'.format(next_checkpoint), eval_results)
    
    current_score = eval_results['F1_Score']
    if current_score > best_score:
      best_score = current_score 
      best_model = current_step
      score_buffer = [] #Reset buffer
    else:
      score_buffer.append(current_score)
    #If 3 times in a row evaluation results haven't improved; we stop training
    if len(score_buffer) == 3:
      elapsed_time = int(time.time() - start_timestamp)
      
      print('\nFinished training at step {:d} as there has been no improvement on the previous 3 iterations'.format(current_step),
      '\nElapsed seconds {:d}. \n'.format(elapsed_time), 
      "\nBest model is at step {:d} with the best F-score {:d}".format(best_model, best_score),
      "\nNow edit the protocol buffer file and set the most recent step to", best_model,
            "so this model checkpoint can be loaded using the tf.train.latest_checkpoint function")
      
      return best_model
    

  elapsed_time = int(time.time() - start_timestamp)
  print('\nFinished training up to step {:d}. Elapsed seconds {:d}. \n'.format(max_steps, elapsed_time))
  return best_model
  

Now run the train_and_evaluate function. We can toggle the steps_pereval in the params to control how often we checkpoint and evaluate

In [0]:
best_step = train_and_evaluate(OUTPUT_DIR, hparams, steps_per_eval=1000) # Will return the optimum step for the BERT model
print("\nBest step for model is at", best_step)


train_batch_size=32  eval_batch_size=8  max_steps=12000
INFO:tensorflow:Using config: {'_model_dir': 'gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.103.143.2:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8a7f9e16a0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.103

# Cross Validation evaluation

Does not provide in depth tensorflow logging but it does provide evaluation at the end. As mentioned above, we combine the provided training and dev files



In [0]:
def bertCV(data, train_batch_size = 32, learn_rate = 2e-5,\
           num_epochs =3.0, folds = 5):

  #Filter out all log messages so console isn't consumed with memory
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

  """FIXED MODEL PARAMS"""
  EVAL_BATCH_SIZE = 8
  PREDICT_BATCH_SIZE = 8
  MAX_SEQ_LENGTH = 256

  # Warmup is a period of time where the learning rate 
  #is small and gradually increases - usually helps training.
  WARMUP_PROPORTION = 0.1

  #Dataframe where grid search results will be stored. Empty to begin with
  eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'] )
  
  k = 1 # Fold counter

  #Stratified K fold ensures the folds are made by preserving the percentage of samples for each class.
  cv = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

  # Sticking within the training dataset for evaluation. Data is the combination of the provided train and dev sets
  for train_index, dev_index in cv.split(data.tweet, data.label): 
    
    #Shuffling again because otherwise the StratifiedKFold function groups a lot of 0's at the start
    training  = data.iloc[train_index]
    training = training.sample(frac = 1, random_state=SEED)
    develop = data.iloc[dev_index]
    develop = develop.sample(frac = 1, random_state=SEED)
    
    """Unlike before where I only one test set and one training set, this time I have K different sets of training and testing.
    Therefore, in each fold I need to get a new set of data and convert it to features each time."""
    
    # Use the InputExample class from BERT's run_classifier code to create examples from the data

    train_InputExamples = training.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                      text_a = x[DATA_COLUMN], 
                                                                      text_b = None, 
                                                                      label = x[LABEL_COLUMN]), axis = 1)

    dev_InputExamples = develop.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                      text_a = x[DATA_COLUMN], 
                                                                      text_b = None, 
                                                                      label = x[LABEL_COLUMN]), axis = 1)
    
    #Convert these examples to features that BERT can interpret
    train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
    dev_features = run_classifier.convert_examples_to_features(dev_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

    #Delete prior model graph, checkpoints and eval files to make room for new model each loop
    try:
      tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except:
    # Doesn't matter if the directory didn't exist
      pass
    tf.gfile.MakeDirs(OUTPUT_DIR)

    # Compute # train and warmup steps from batch size
    num_train_steps = int(len(train_features) / train_batch_size * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    # Model configs
    model_fn = run_classifier.model_fn_builder(
    bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
    num_labels=len(label_list),
    init_checkpoint=bert_ckpt_file,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    use_one_hot_embeddings=True)

    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=TRAIN_BATCH_SIZE,
      eval_batch_size=EVAL_BATCH_SIZE,
      predict_batch_size=PREDICT_BATCH_SIZE)
    
    # Create an input function for training. drop_remainder = True for using TPUs.
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)

    #input function for dev data, we feed in our previously created dev_features for this
    dev_input_fn = run_classifier.input_fn_builder(
        features=dev_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=True)
    
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps )
    train_time = datetime.now() - current_time
    

    #You need to provide number of steps for a TPU
    eval_steps = int(len(dev_InputExamples) / EVAL_BATCH_SIZE)

    #Eval may be slightly WRONG on the TPU because it will truncate the last batch.
    eval_results = estimator.evaluate(input_fn=dev_input_fn, steps=eval_steps)

    row = pd.Series({'F1 Score': eval_results['F1_Score'], 'auc': eval_results['auc'], 'Accuracy': eval_results['eval_accuracy'],'Precision': eval_results['precision'],'Recall': eval_results['recall'],\
                                    'False Negatives': eval_results['false_negatives'],'False Positives': eval_results['false_positives'],\
                    'True Negatives':eval_results['true_negatives'] ,'True Positives': eval_results['true_positives'], 'Training Time': train_time })
    #row = get_metrics(OUTPUT_DIR, train_time, k)
    row = pd.Series(row, name = 'Fold ' + str(k))

    """Below statement controls for whenever we get a bad fold which results in a model predicting only one class.
    This isn't truly representative of normal performance and can bring down CV score, so we omit model evaluation
    if the below statement is true"""
    if eval_results['false_negatives'] < 1 or eval_results['false_positives'] < 1: 
      print("Classifier predicts one class. Thus not recording this metric as it will skew CV\n")
      #k = k + 1
      continue

    eval_df = eval_df.append(row)
    print("Fold " + str(k) + ":\tF-Score:", eval_df["F1 Score"][k-1])
    print("Training took time ", train_time)
    print('---------------------------------------------------------------------------------------------------------\n')
    k = k + 1 #Increment on fold counter

  row = eval_df.mean(axis = 0)
  row = pd.Series(row, name = 'CV Average')
  eval_df = eval_df.append(row)
  print("\nTraining Batch Size: ", train_batch_size, "\tLearn Rate: ", learn_rate, "\tNum Epochs: ", num_epochs)
  display(eval_df)

  return row # Also return row of CV-Average

### Cross-Validation
Basic cross-validation can be performed here

In [0]:
data = loadData(rawTrain, rawDev, params)

CV_Av = bertCV(data, learn_rate = 2e-5, num_epochs=4.0)

TypeError: ignored

### Cross-Validation of cross-validation

Tensorflow 1.x is non-deterministic, which has resulted in the variability between each run to be greater than the difference in performance gained between introductions of different configurations and parameters. This makes it difficult to determine what is the best pre-trainig and text preprocessing pipeline to undertake.

To better ensure the reliability of experiments my solution is to have a 5 fold cross-validation of a cross-validated sample of my data. My hope is that by doing this I can better identify what pre-processing pipeline works best in my program and also which further pre-trained checkpoint is best (i.e. after 40,000 steps or 120,000 steps)

In [0]:
data = loadData(rawTrain, rawDev, params)

#Stratified K fold ensures the folds are made by preserving the percentage of samples for each class.
folds = 5
cv = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)
eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'])

#This will be a 5-fold CV so the sample each time will be a fifth of the data
i = 1
for __, data_index in cv.split(data.tweet, data.label):
  dat = data.iloc[data_index]
  CV_Av = bertCV(dat, learn_rate = 2e-5, num_epochs=4.0)
  CV_Av = pd.Series(CV_Av, name = 'CV Average' + str(i))
  eval_df = eval_df.append(CV_Av)

row = eval_df.mean(axis = 0)
row = pd.Series(row, name = '40,000')
eval_df1 = pd.read_csv('gs://csc3002/hateval2019/pretraining_eval_df1.csv', sep=',',  index_col = 0, encoding = 'utf-8')
eval_df1 = eval_df1.append(row)
eval_df1.to_csv('gs://csc3002/hateval2019/pretraining_eval_df1.csv', sep=',',  index = True, encoding = 'utf-8')
eval_df1


Fold 1:	F-Score: 0.722741425037384
Training took time  0:08:53.766756
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.703601062297821
Training took time  0:05:09.879504
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.7641791105270386
Training took time  0:05:17.234892
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7848836779594421
Training took time  0:05:19.039668
---------------------------------------------------------------------------------------------------------

Fold 5:	F-Score: 0.7942028045654297
Training took time  0:05:22.761278
---------------------------------------------------------------------------------------------------------


Training Batch Size:  32 	Learn Rate:  2e-05 	Num Epochs:  4.0


Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives
Fold 1,0.722741,0.765497,0.7775,52.0,37.0,0.75817,0.690476,00:08:53.766756,195.0,116.0
Fold 2,0.703601,0.735735,0.7325,41.0,66.0,0.658031,0.755952,00:05:09.879504,166.0,127.0
Fold 3,0.764179,0.796901,0.8025,40.0,39.0,0.766467,0.761905,00:05:17.234892,193.0,128.0
Fold 4,0.784884,0.812828,0.815,34.0,40.0,0.771429,0.798817,00:05:19.039668,191.0,135.0
Fold 5,0.794203,0.82091,0.8225,32.0,39.0,0.778409,0.810651,00:05:22.761278,192.0,137.0
CV Average,0.753922,0.786374,0.79,39.8,44.2,0.746501,0.76356,00:06:00.536419,187.4,128.6


Fold 1:	F-Score: 0.7830985188484192
Training took time  0:05:22.738472
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.7797618508338928
Training took time  0:05:27.449404
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.7048192024230957
Training took time  0:05:20.619050
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7485713362693787
Training took time  0:05:35.600859
---------------------------------------------------------------------------------------------------------

Fold 5:	F-Score: 0.7323076128959656
Training took time  0:05:31.675044
---------------------------------------------------------------------------------------------------------


Training Batch Size:  32 	Learn Rate:  2e-05 	Num Epochs:  4.0


Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives
Fold 1,0.783099,0.810242,0.8075,29.0,48.0,0.743316,0.827381,00:05:22.738472,184.0,139.0
Fold 2,0.779762,0.81014,0.815,37.0,37.0,0.779762,0.779762,00:05:27.449404,195.0,131.0
Fold 3,0.704819,0.746921,0.755,51.0,47.0,0.713415,0.696429,00:05:20.619050,185.0,117.0
Fold 4,0.748571,0.779349,0.78,38.0,50.0,0.723757,0.775148,00:05:35.600859,181.0,131.0
Fold 5,0.732308,0.771984,0.7825,50.0,37.0,0.762821,0.704142,00:05:31.675044,194.0,119.0
CV Average,0.749712,0.783727,0.788,41.0,43.8,0.744614,0.756572,00:05:27.616565,187.8,127.4


Fold 1:	F-Score: 0.7607361078262329
Training took time  0:05:26.349457
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.786786675453186
Training took time  0:05:23.930032
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.768817126750946
Training took time  0:05:37.883686
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7603304982185364
Training took time  0:05:31.956526
---------------------------------------------------------------------------------------------------------

Fold 5:	F-Score: 0.6624203324317932
Training took time  0:05:42.495866
---------------------------------------------------------------------------------------------------------


Training Batch Size:  32 	Learn Rate:  2e-05 	Num Epochs:  4.0


Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives
Fold 1,0.760736,0.795772,0.805,44.0,34.0,0.78481,0.738095,00:05:26.349457,198.0,124.0
Fold 2,0.786787,0.816605,0.8225,37.0,34.0,0.793939,0.779762,00:05:23.930032,198.0,131.0
Fold 3,0.768817,0.79413,0.785,25.0,61.0,0.70098,0.85119,00:05:37.883686,171.0,143.0
Fold 4,0.76033,0.787072,0.7825,31.0,56.0,0.71134,0.816568,00:05:31.956526,175.0,138.0
Fold 5,0.66242,0.718948,0.735,65.0,41.0,0.717241,0.615385,00:05:42.495866,190.0,104.0
CV Average,0.747818,0.782505,0.786,40.4,45.2,0.741662,0.7602,00:05:32.523113,186.4,128.0


Classifier predicts one class. Thus not recording this metric as it will skew CV

Fold 1:	F-Score: 0.766570508480072
Training took time  0:05:31.500574
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.8192089200019836
Training took time  0:05:40.513690
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.7683284878730774
Training took time  0:05:43.482192
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7546011805534363
Training took time  0:05:42.643961
---------------------------------------------------------------------------------------------------------


Training Batch Size:  32 	Learn Rate:  2e-05 	Num Epochs:  4.0


Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives
Fold 1,0.766571,0.796089,0.7975,36.0,45.0,0.747191,0.786982,00:05:31.500574,186.0,133.0
Fold 2,0.819209,0.843186,0.84,23.0,41.0,0.77957,0.863095,00:05:40.513690,191.0,145.0
Fold 3,0.768328,0.799364,0.8025,37.0,42.0,0.757225,0.779762,00:05:43.482192,190.0,131.0
Fold 4,0.754601,0.79064,0.8,45.0,35.0,0.778481,0.732143,00:05:42.643961,197.0,123.0
CV Average,0.777177,0.80732,0.81,35.25,40.75,0.765617,0.790496,00:05:39.535104,191.0,133.0


Fold 1:	F-Score: 0.792022705078125
Training took time  0:05:41.983810
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.765714168548584
Training took time  0:05:44.589089
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.7272726893424988
Training took time  0:05:37.342505
---------------------------------------------------------------------------------------------------------



<b> Collecting and saving results </b>

In [0]:
emoji = CV_Av
print(emoji)
emoji = pd.Series(emoji, name = 'Emoji Replacement')

F1 Score           0.786213              
auc                0.814614              
Accuracy           0.81757               
False Negatives    171.6                 
False Positives    191.8                 
Precision          0.777174              
Recall             0.795718              
Training Time      0 days 00:08:17.360357
True Negatives     960.2                 
True Positives     668.4                 
Name: CV Average, dtype: object


In [0]:
eval_df = pd.read_csv('gs://csc3002/hateval2019/preprocess_eval_df.csv', sep=',',  index_col = 0, encoding = 'utf-8')
eval_df

Unnamed: 0,F1 Score,precision,false_positives,Training Time,auc,eval_accuracy,false_negatives,recall,true_negatives,true_positives,Accuracy,False Negatives,False Positives,Precision,Recall,True Negatives,True Positives
Basic,0.796373,0.785083,185.8,0 days 00:09:16.609999,0.822897,0.825302,160.8,0.808148,959.8,677.6,,,,,,,
Contractions,0.796084,0.788785,180.4,0 days 00:09:13.619132400,0.823061,0.826109,164.6,0.803677,965.2,673.8,,,,,,,
Hashtag Segmentation,0.792463,0.778531,192.4,0 days 00:09:21.422566,0.819521,0.821573,161.6,0.807077,953.2,676.8,,,,,,,
Emoji Replacement,0.793692,0.79395,172.8,0 days 00:09:15.707040,0.821786,0.826205,173.4,0.793561,978.8,667.0,,,,,,,
All Preprocessing,0.796205,0.785145,185.6,0 days 00:09:23.425535,0.823387,0.825803,161.4,0.807766,966.0,679.0,,,,,,,
Removing Stopwords,0.729251,,,0 days 00:09:22.530510,0.737039,,,,,,0.757831,317.6,164.8,0.608553,0.617838,987.2,522.4
Basic,0.791682,,,0 days 00:08:05.076683,0.818791,,,,,,0.820565,161.4,194.6,0.77698,0.807362,951.6,676.4


In [0]:
#eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'] ) # Instantise
eval_df = eval_df.append(emoji, ignore_index=False)
eval_df

Unnamed: 0,F1 Score,precision,false_positives,Training Time,auc,eval_accuracy,false_negatives,recall,true_negatives,true_positives,Accuracy,False Negatives,False Positives,Precision,Recall,True Negatives,True Positives
Basic,0.796373,0.785083,185.8,0 days 00:09:16.609999,0.822897,0.825302,160.8,0.808148,959.8,677.6,,,,,,,
Contractions,0.796084,0.788785,180.4,0 days 00:09:13.619132400,0.823061,0.826109,164.6,0.803677,965.2,673.8,,,,,,,
Hashtag Segmentation,0.792463,0.778531,192.4,0 days 00:09:21.422566,0.819521,0.821573,161.6,0.807077,953.2,676.8,,,,,,,
Emoji Replacement,0.793692,0.79395,172.8,0 days 00:09:15.707040,0.821786,0.826205,173.4,0.793561,978.8,667.0,,,,,,,
All Preprocessing,0.796205,0.785145,185.6,0 days 00:09:23.425535,0.823387,0.825803,161.4,0.807766,966.0,679.0,,,,,,,
Removing Stopwords,0.729251,,,0 days 00:09:22.530510,0.737039,,,,,,0.757831,317.6,164.8,0.608553,0.617838,987.2,522.4
Basic,0.791682,,,0 days 00:08:05.076683,0.818791,,,,,,0.820565,161.4,194.6,0.77698,0.807362,951.6,676.4
Emoji Replacement,0.786213,,,0 days 00:08:17.360357,0.814614,,,,,,0.81757,171.6,191.8,0.777174,0.795718,960.2,668.4


In [0]:
eval_df.to_csv('gs://csc3002/hateval2019/preprocess_eval_df.csv', sep=',',  index = True, encoding = 'utf-8')


## Adding in augmented back-translated hate speech tweets as extra data

We have very few instances of hate speech labelled in this dataset. To remedy this I performed back_translation augmentation on this training set.

Below I load in in the extra hate speech tweets I created via back-translation augmentation I performed in another colab notebook and I append it to the existing dataframe

In [0]:
"""dat = '/content/drive/My Drive/hateval2019/backtranslated_hatEval.txt' 
dat = pd.read_csv(dat, sep = '\t', names = ['tweet'], header = None, encoding = 'utf-8')
pd.set_option('display.max_colwidth', -1)
dat = dat.astype(str)
dat.head(50)"""

"dat = '/content/drive/My Drive/hateval2019/backtranslated_hatEval.txt' \ndat = pd.read_csv(dat, sep = '\t', names = ['tweet'], header = None, encoding = 'utf-8')\npd.set_option('display.max_colwidth', -1)\ndat = dat.astype(str)\ndat.head(50)"

**See how the english is a little off?** 

That's because these are the hate speech tweets in the training set translated to french, then translated back again. This creates a whole new, yet similar set of hate speech tweets to train on. (Slightly augmented text)

In [0]:
"""print("There are", len(dat.index), "tweets")
dat = dat[dat['tweet'].apply(lambda x: len(x) > 10)]
print("There are now", len(dat.index), "tweets")
dat.head()"""

'print("There are", len(dat.index), "tweets")\ndat = dat[dat[\'tweet\'].apply(lambda x: len(x) > 10)]\nprint("There are now", len(dat.index), "tweets")\ndat.head()'

<b>Rather than creating 3768 extra tweets, 19630 extra have been created. The tweets have been incorrectly parsed. Removing some tweets with a smaller length may mitigate this effect somewhat by removing tweets that were cut in half</b>

Let's see if it helps by adding it to the original training set and testing it against our dev data

In [0]:
"""dat['label'] = 1
dat['id'] = 80000
frames = [dat,data]
data = pd.concat(frames)
print(data.info())
data.head()"""

"dat['label'] = 1\ndat['id'] = 80000\nframes = [dat,data]\ndata = pd.concat(frames)\nprint(data.info())\ndata.head()"

We'll shuffle the dataframe to make sure there's no funny business with the training of the model and we'll then reset the id field to make it unique and sequential for each row

In [0]:
"""data = data.sample(frac=1, random_state = SEED)
data.reset_index(drop = True, inplace = True)

data['id'] = data.reset_index().index + 1
print(data.label.value_counts(), "\n")
print(data.info())
length = len(data.index)
print("\nNow there are", length , "tweets total in this database")
data.tail(10)"""

'data = data.sample(frac=1)\ndata.reset_index(drop = True, inplace = True)\n\ndata[\'id\'] = data.reset_index().index + 1\nprint(data.label.value_counts(), "\n")\nprint(data.info())\nlength = len(data.index)\nprint("\nNow there are", length , "tweets total in this database")\ndata.tail(10)'

# Training with both dev and training set. Then Testing with the holdout test set
<b>Loading in train and test data...

In [0]:
train = loadData(rawTrain, rawDev, params)


test = loadData(rawTest, params_dict = params)
test.head()

Unnamed: 0,id,tweet
0,0,catch us on and tonight at 5pm nervous recovery
1,1,fathers day dubai uae my dubai father
2,2,can't wait to see tonight
3,3,i am awesome. i am positive affirmation
4,4,care free. stress free. happy. {#carefree stress free bernese mountain dog berner


<b>Function to get predictions on test data </b>

In [0]:
def getPrediction(in_sentences):
  #Makes output less verbose
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

  labels = [0, 1]
  input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)
  predictions = list(estimator.predict(predict_input_fn))

  #Initialise empty predicted labels array
  predicted_classes = [None] * len(predictions)

  #Use a for loop to iterate through probabilities and for each prediction assign a label
  #corresponding to which label has the highest probability
  for i in range(0, len(predictions)):
    if predictions[i]['probabilities'][0] > predictions[i]['probabilities'][1]:
      predicted_classes[i] = 0
    else:
      predicted_classes[i] = 1
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) # Reset tensorflow verboisty to normal

  return predicted_classes

<b> Converting to features, setting run and model configs.

Then training on train and dev set and predicting on unseen test set </b>

In [0]:
LEARNING_RATE = 2e-5
num_train_steps = 6000 # Recommend 1000 for HatEval, 10000? for AnalyticsVidhya
train_batch_size = 32
if DATASET == "HatEval":

  SAVE_CHECKPOINTS_STEPS = 1000
  run_config = tf.compat.v1.estimator.tpu.RunConfig(
      #I think the output file must be a sub-directory of the main BERT file
      model_dir=OUTPUT_DIR,
      tf_random_seed=SEED, 
      cluster=cluster_resolver,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=200,
          num_shards=8,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                        text_a = x[DATA_COLUMN], 
                                                                        text_b = None, 
                                                                        label = x[LABEL_COLUMN]), axis = 1)

  train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


  #Delete prior model graph, checkpoints and eval files to make room for new model each loop
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
  # Doesn't matter if the directory didn't exist
    pass
  tf.gfile.MakeDirs(OUTPUT_DIR)

  # Compute # warmup steps
  num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

  # Model configs
  model_fn = run_classifier.model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)

  estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=train_batch_size,
    eval_batch_size=8,
    predict_batch_size=8)

  # Create an input function for training. drop_remainder = True for using TPUs.
  train_input_fn = run_classifier.input_fn_builder(
      features=train_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

  print("\nThe model will stop training when it reaches", num_train_steps, "as a checkpoint")

  print(f'Beginning Training!')
  current_time = datetime.now()
  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  train_time = datetime.now() - current_time
  print("Training took time ", train_time)

  predictions = getPrediction(test.tweet)
  test['predictions'] = predictions

  test.to_csv('gs://csc3002/hateval2019/predictions.csv', sep=',',  index = True, encoding = 'utf-8')
  print("\n\nF1 Score:", metrics.f1_score(test.label, test.predictions))
  print("Accuracy", metrics.accuracy_score(test.label, test.predictions))

elif DATASET == "AnalyticsVidhya":
  
  SAVE_CHECKPOINTS_STEPS = 10000
  run_config = tf.compat.v1.estimator.tpu.RunConfig(
      #I think the output file must be a sub-directory of the main BERT file
      model_dir=OUTPUT_DIR,
      tf_random_seed=SEED, 
      cluster=cluster_resolver,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=200,
          num_shards=8,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                        text_a = x[DATA_COLUMN], 
                                                                        text_b = None, 
                                                                        label = x[LABEL_COLUMN]), axis = 1)

  train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


  #Delete prior model graph, checkpoints and eval files to make room for new model each loop
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
  # Doesn't matter if the directory didn't exist
    pass
  tf.gfile.MakeDirs(OUTPUT_DIR)

  # Compute # warmup steps
  num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

  # Model configs
  model_fn = run_classifier.model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)

  estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=train_batch_size,
    eval_batch_size=8,
    predict_batch_size=8)

  # Create an input function for training. drop_remainder = True for using TPUs.
  train_input_fn = run_classifier.input_fn_builder(
      features=train_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

  print("\nThe model will stop training when it reaches", num_train_steps, "as a checkpoint")

  print(f'Beginning Training!')
  current_time = datetime.now()
  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  train_time = datetime.now() - current_time
  print("Training took time ", train_time)

  predictions = getPrediction(test.tweet)
  test['label'] = predictions
  print(test.label.value_counts())
  print(predictions[0:20])
  test.to_csv('gs://csc3002/trial/submission.csv', sep=',', index = False)


INFO:tensorflow:Writing example 0 of 31814
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] new jersey beautiful sunny calm peaceful ##l flowers grass life in nyc [SEP]
INFO:tensorflow:input_ids: 101 2047 3933 3376 11559 5475 9379 2140 4870 5568 2166 1999 16392 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## Error Analysis

In [0]:
test.to_csv('gs://csc3002/trial/submission.csv', sep=',', index = False)
test.head(40)

Unnamed: 0,id,tweet,label
0,0,catch us on and tonight at 5pm nervous recovery,0
1,1,fathers day dubai uae my dubai father,0
2,2,can't wait to see tonight,0
3,3,i am awesome. i am positive affirmation,0
4,4,care free. stress free. happy. {#carefree stress free bernese mountain dog berner,0
5,5,live now girlfriend: webcam female webcam model babe pussy,0
6,6,live your dream of ice skating in nigeria visit the rink i promise you would be glad you came.,0
7,7,ricky know what gets me so upset the president gives his condolence to victims that's not good enough all talk no action,0
8,8,nothing makes dad happier than a tasty meal! fathers day,0
9,9,12 hours to go. tonight is going to epic culture clash,0


<b>Our test dataset is supervised for HatEval so perhaps we can do some analysis here by peeking at where we went wrong</b>

In [0]:
if DATASET == "HatEval":
  def get_missed_HS(df):
    df = df.loc[(df["label"] == 1) & (df["predictions"] == 0)]
    return df

  def get_misclassified_HS(df):
    df = df.loc[(df["label"] == 0) & (df["predictions"] == 1)]
    return df
  #Perhaps having a separate function for both is a good idea

  df = get_missed_HS(test)
  df.to_csv('gs://csc3002/hateval2019/falsenegatives.csv', sep=',',  index = True, encoding = 'utf-8')
  df.head(50)

In [0]:
if DATASET == "HatEval":
  df = get_misclassified_HS(test)
  df.to_csv('gs://csc3002/hateval2019/falsepositives.csv', sep=',',  index = True, encoding = 'utf-8')
  df.head(50)

Unnamed: 0,id,tweet,label,predictions
2,2,"y'all: ""you a hoe"" hoe me: ""that's hoe culture to you bitch""",0,1
4,4,"the usa has a major debt problem and thus cannot afford big tax cuts for the wealthy, big globalist corporations, rinos, & wall street. usa americans america pj net end daca no amnesty congress",0,1
6,6,theres nothing more i hate than joffery. cunt lookin ass bitch,0,1
7,7,"ben, you are shit on my shoe still bitch.",0,1
11,11,thank you president trump. keeping his promise to farmers. confirm judge kavanaugh keep maga build that wall thanks to all that support trump and his supporters.,0,1
12,12,pocohantas speaks with forked tongue. liberalism is a mental disorder maga america first term limits drain the swamp build the wall send them back,0,1
17,17,you got me with the bitch ass hoe comment!! face with tears of joy face with tears of joy loudly crying face skull,0,1
18,18,you can rent a hoe but you cant buy a bitch,0,1
21,21,build that wall ted cruz has a plan on how to help republicans win big in november!,0,1
24,24,"this is way scary, someone stop this evil person build the wall end daca boycott nfl boycott nike lock them all up",0,1


# Using Tensorboard to get deeper insight

In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip   #Downloads file to google drive

--2020-02-12 18:59:42--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 34.233.35.85, 3.229.196.117, 34.193.139.214, ...
Connecting to bin.equinox.io (bin.equinox.io)|34.233.35.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13773305 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip.1’


2020-02-12 18:59:43 (34.5 MB/s) - ‘ngrok-stable-linux-amd64.zip.1’ saved [13773305/13773305]

Archive:  ngrok-stable-linux-amd64.zip
replace ngrok? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ngrok                   


In [0]:

def get_tensorboard(path_to_event_file = OUTPUT_DIR):
  get_ipython().system_raw('tensorboard --logdir {} --host 0.0.0.0 --port 6006 --reload_multifile=true &'
.format(path_to_event_file))
  
  get_ipython().system_raw('./ngrok http 6006 &')

  !curl -s http://localhost:4040/api/tunnels | python3 -c \
      "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

get_tensorboard(OUTPUT_DIR)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/lib/python3.6/json/__init__.py", line 299, in load
    parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
  File "/usr/lib/python3.6/json/__init__.py", line 354, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python3.6/json/decoder.py", line 339, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib/python3.6/json/decoder.py", line 357, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
