In [0]:
# Copyright 2019 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<b>Open in colab for full functionality (if not already open in colab) </b>

<a href="https://drive.google.com/file/d/1PUEEOCedG7BHVYfxG2t5cwwf0cUNij93/view?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> 

# Detecting Hate Speech Tweets With BERT

We are using bert-tensorflow for this classification task. The notebook pins TensorFlow 1.x because TensorFlow 2 currently has compatibility issues with BERT. I believe TensorFlow hopes to have this resolved in TensorFlow v2.1.

We are using a TPU because a GPU does not have the memory required for the Large BERT models - it can only cope with the base model. We'll check whether a TPU is detected and store its address in a global variable so it can be accessed by our BERT functions later.

In [2]:
!pip install gcsfs 
import pandas as pd
import numpy as np

#Make sure to use tensorflow version 1.x, version 2 doesn't work with bert
%tensorflow_version 1.x
import tensorflow as tf
#!pip install gast==0.2.2
import os

#For cross-validation and grid search
from itertools import product
from google.cloud import storage
from IPython.display import display

import sklearn
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import metrics


import html
import re
import json
import pprint
import random
import string
import nltk
from datetime import datetime
import time

# Fail fast when the Colab runtime has no TPU attached; everything below needs its address.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

#Below we give ourselves as well as the TPU access to our private GCS bucket
from google.colab import auth
auth.authenticate_user()
tf.reset_default_graph()
with tf.Session(TPU_ADDRESS) as session:
  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

USE_TPU=True
# Start from a known state so a failed connection can never leave these names
# undefined, or pointing at objects created by an earlier run of this cell.
cluster_resolver = None
tpu_strategy = None
try:
  #tf.config.experimental_connect_to_host(TPU_ADDRESS)
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_ADDRESS)
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
except Exception as ex:
  # Best effort: report the problem and continue without a TPU.
  print(ex)
  USE_TPU=False
  # Drop any partially initialised objects so they agree with USE_TPU.
  cluster_resolver = None
  tpu_strategy = None

print("        USE_TPU:", USE_TPU)
print("Eager Execution:", tf.executing_eagerly())

# The rest of the notebook relies on TF1 graph mode.
assert not tf.executing_eagerly(), "Eager execution on TPUs have issues currently"

Collecting gcsfs
  Downloading https://files.pythonhosted.org/packages/18/3b/454be7c97d05e15eb20a0099f425f0ed6b7552e352c77adb923c3872ba14/gcsfs-0.6.1-py2.py3-none-any.whl
Installing collected packages: gcsfs
Successfully installed gcsfs-0.6.1
TensorFlow 1.x selected.
TPU address is grpc://10.24.148.34:8470
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Initializing the TPU system: 10.24.148.34:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Querying Tensorflow master (grpc://10.24.148.34:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tenso

Setting a random seed so we can attempt reproducibility of results.

Also checking version of tensorflow

In [3]:
# Seed every random number source used in this notebook with the same value.
# Setting the graph-level random seed for the default graph. Different than operation level seed
SEED = 3060
# Reset first so the seed below is applied to a fresh default graph.
tf.reset_default_graph()
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
tf.set_random_seed(SEED) 
# Python's and NumPy's global generators.
random.seed(SEED)
np.random.seed(SEED)
print("Tensorflow Version:", tf.__version__)

Tensorflow Version: 1.15.2


Below we will set the directory where we will store our output model. To ensure the right variables are loaded in our run config function later, our output directory must be in the same directory as our pre-trained bert model directory.

Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

In [4]:
#Large whole word masking BERT pre-trained weights
bert_model_name = 'wwm_uncased_L-24_H-1024_A-16' 

#Where we output the fine tuned model
output_dir = os.path.join(bert_model_name, 'output1')

DATASET = "HatEval" #@param ["HatEval", "AnalyticsVidhya", "Custom_HS", "Custom_OFF"]

#@markdown Whether or not to use the further pretrained model
FURTHER_PRETRAINED = True #@param {type:"boolean"}
if FURTHER_PRETRAINED:
  # When starting from the further pre-trained weights, the fine-tuned output
  # is written beneath that model's directory instead.
  further_pretrained_model = os.path.join(bert_model_name, 'further_pretrained_model1')
  output_dir = os.path.join(further_pretrained_model, 'output1')#output1

#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'csc3002' #@param {type:"string"}
os.environ["GCLOUD_PROJECT"] = "csc3002"

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, output_dir)
  from google.colab import auth
  auth.authenticate_user()
else:
  # Without a bucket the output goes to the relative local path, so OUTPUT_DIR
  # is always defined for the statements below and for later cells.
  OUTPUT_DIR = output_dir

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except tf.errors.NotFoundError:
    # Doesn't matter if the directory didn't exist; any other failure
    # (permissions, authentication, ...) is allowed to surface.
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/output1 *****


<b> If you're not connected to a TPU environment but still want to access GCS bucket - run below: </b>

In [5]:
# Deliberately disabled: the commands are kept inside a string literal so nothing runs.
# To use them, paste the lines into a cell of their own without the surrounding quotes.
"""from google.colab import drive
drive.mount('/content/drive')
!gcloud auth activate-service-account --key-file '/content/drive/My Drive/storageCreds.json'
"""

"from google.colab import drive\ndrive.mount('/content/drive')\n!gcloud auth activate-service-account --key-file '/content/drive/My Drive/storageCreds.json'\n"

<b>Setting up Data Based Upon Choice of DATASET</b>


In [0]:
def _load_hateval_split(csv_name, data_dir='gs://csc3002/hateval2019'):
  """Read one HatEval 2019 CSV and normalise it to the notebook's tweet/label columns.

  The 'text' and 'HS' columns are renamed to 'tweet' and 'label', and the
  'TR' and 'AG' columns are dropped.
  """
  frame = pd.read_csv(os.path.join(data_dir, csv_name), sep=',', index_col=False, encoding='utf-8')
  frame = frame.rename(columns={'text': 'tweet', 'HS': 'label'})
  return frame.drop(['TR', 'AG'], axis=1)


def _split_train_dev_test(frame, holdout=0.20):
  """Return label-stratified (train, dev, test) parts of ``frame``.

  The test part is carved out first with the notebook-wide SEED, so it is the
  same held-out set a plain two-way split would produce; the dev part is then
  taken from what remains.
  """
  train_part, test_part = train_test_split(frame, test_size=holdout, random_state=SEED, stratify=frame.label)
  train_part, dev_part = train_test_split(train_part, test_size=holdout, random_state=SEED, stratify=train_part.label)
  return train_part, dev_part, test_part


if DATASET == 'HatEval':
  dirc = 'gs://csc3002/hateval2019'

  rawTrain = _load_hateval_split('hateval2019_en_train.csv', dirc)
  rawDev = _load_hateval_split('hateval2019_en_dev.csv', dirc)
  rawTest = _load_hateval_split('hateval2019_en_test.csv', dirc)

elif DATASET == "AnalyticsVidhya":
  dirc = 'gs://csc3002/trial'

  rawTrain = pd.read_csv(os.path.join(dirc, 'train_E6oV3lV.csv'), sep=',', index_col=False, encoding='utf-8')

  #Make sure it's a stratified sample so evaluation is truly representative
  rawTrain, rawDev = train_test_split(rawTrain, test_size=0.20, random_state=SEED, stratify=rawTrain.label)

  rawTest = pd.read_csv(os.path.join(dirc, 'test_tweets_anuFYb8.csv'), sep=',', index_col=False, encoding='utf-8')

#Classifying Hate Speech in Dataset constructed in the CSC3002-Hate_Speech_Detection_Assembling_and_Cleaning_the_Fine_Tuning_Data.ipynb notebook
elif DATASET == 'Custom_HS':

  rawTrain = pd.read_csv('gs://csc3002/Raw_Data/final.csv', sep=',', index_col=False, encoding='utf-8')
  rawTrain = rawTrain.rename(columns={'Tweet': 'tweet', 'Hate_Speech': 'label'})
  rawTrain['label'] = rawTrain['label'].astype(int)
  # A dev split is required by the later cells, which read rawDev for every dataset.
  rawTrain, rawDev, rawTest = _split_train_dev_test(rawTrain)

#Classifying Offensive Speech in Dataset constructed in the CSC3002-Hate_Speech_Detection_Assembling_and_Cleaning_the_Fine_Tuning_Data.ipynb notebook
elif DATASET == 'Custom_OFF':
  rawTrain = pd.read_csv('gs://csc3002/Raw_Data/final.csv', sep=',', index_col=False, encoding='utf-8')
  rawTrain = rawTrain.rename(columns={'Tweet': 'tweet', 'Offensive': 'label'})
  # Rows whose label is '-' are excluded; copy so the assignment below writes to
  # an independent frame rather than a view of the unfiltered data.
  rawTrain = rawTrain[rawTrain.label != '-'].copy()
  rawTrain['label'] = rawTrain['label'].astype(int)
  # A dev split is required by the later cells, which read rawDev for every dataset.
  rawTrain, rawDev, rawTest = _split_train_dev_test(rawTrain)

else:
  raise ValueError('No Valid DATASET chosen')

# Training Data
I've stored all of the data (train, dev and test) in my Google Cloud Storage bucket for ease of access; authentication will have to be provided.

In [7]:
!gcloud config set project 'my-project-csc3002'

# Shuffle the raw training rows once (this really helps model performance) and renumber them from 0
train = rawTrain.sample(frac=1, random_state=SEED).reset_index(drop=True)
# Show complete tweets instead of truncated cells when frames are displayed
pd.set_option('display.max_colwidth', None)

# Word used in the printed class summaries
classification_type = 'offensive' if DATASET == 'Custom_OFF' else 'hate'

print("Out of {} tweets in this database, {} are not {}, {} are {}".format(
    len(train.index),
    int((train['label'] == 0).sum()), classification_type,
    int((train['label'] == 1).sum()), classification_type))

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey

Out of 9000 tweets in this database, 5217 are not hate, 3783 are hate


<b>Original Dataset </b>

In [8]:
# Preview the first 30 rows of the shuffled, not yet cleaned training frame
train.head(30)

Unnamed: 0,id,tweet,label
0,680,"How come Allah is not helping you it is up to Christian countries to protect you feed you ,The countries hit by violence from islam take refugees in feed them etcPlease no more explaining about your hard times we are doing our best for uYes there is good and bad every where",0
1,3995,"With todays #JalalabadAttack &amp; other vicious attacks claimed by ISIS, I smell a spillover of refugees in Pak again. This time we should not open borders for them. We cant afford terrorists taking undue advantage. Let Americans deal with it.#Afghanistan #Jalalabad",1
2,2713,"https://t.co/i9LJDjtGz7Migration greatest threat’ to Austrian security, says top military figure.EU and Europe bitterly dividedðŸ‘‰major confrontations between the two.Nothing more counterproductive than “centers” on European territory or euro bribes for migrants.#Visegrad #V4 https://t.co/VnPCTe7opC",0
3,7509,When all your friends are out hoe'in and you're stuck at home in a shitty relationship https://t.co/X9oz1Tx7TC,0
4,6384,I wonder if rick will make another deal with those crazy ass women 🤔 and if that crazy ass nigga will actually hoe Daryl again 😐,1
5,628,Worker Charged With Sexually Molesting Eight Children at Immigrant Shelter https://t.co/D6HcH03nGL via @CitizenTruth_ #realDonaldTrump do something about this disgrace and stop separating children from their parents.,0
6,3241,UN seeks new funding pledges for Palestinian refugees... https://t.co/SNJhD1PWxT https://t.co/DlHQ8fc5N6,0
7,7620,"If you really wanna know what someone you're fucking thinks about you, make them show you how you're stored in their phone...",0
8,4533,Going to make Du'a at the shrine of Imam Reza(AS) for the refugees in Athens.,0
9,3755,"Poor kid. Someone wise must have told him, ""When the world gives you lemons, make lemonade."" He listened. His lemonade should now be offered with ICE in abundance. #BuildTheWall #SendThemBack https://t.co/8AM7fgo9ph",0


### Text Preprocessing

The text pre-processing for this project is detailed in the notebook `Text_Preprocessing.ipynb` in the github repo. Below is an import of the repo into the google colab workspace so I can retrieve and use these functions at convenience

Also below is a function which loads whichever dataset I choose to load from my GCS bucket or local system. This will be useful later when I want to quickly load in data without the messy, long-winded code to go along with it.

In [9]:
#@title Text Pre-Processing Options
HASHTAG_SEGMENTATION = True #@param {type:"boolean"}
EMOJI_REPLACEMENT = "Replace_Emoji_v1" #@param ["None", "Replace_Emoji_v1", "Replace_Emoji_v2"]
LEMMATIZE = False #@param {type:"boolean"}
REMOVE_STOPWORDS = False #@param {type:"boolean"}
REMOVE_PUNCTUATION = True #@param {type:"boolean"}

# Bundle the switches above into the single list handed to pre.loadData.
# Presumably loadData reads them by position, so keep this order - TODO confirm.
options = [HASHTAG_SEGMENTATION, EMOJI_REPLACEMENT, LEMMATIZE, REMOVE_STOPWORDS, REMOVE_PUNCTUATION]

# Fetch the project repository and import its preprocessing module from inside
# the Text_Preprocessing folder.
!git clone https://gitlab2.eeecs.qub.ac.uk/csc3002_fionn/csc3002_detecting_hate_speech.git
%cd csc3002_detecting_hate_speech
%cd Text_Preprocessing/
#!ls
import preprocessing as pre
#Return to original workspace
%cd ../..

# NOTE(review): `train` is overwritten with its cleaned version, so re-running this
# cell feeds already-cleaned text back into loadData - confirm that is harmless.
train = pre.loadData(train, options = options, dataset = DATASET)

Cloning into 'csc3002_detecting_hate_speech'...
remote: Enumerating objects: 473, done.[K
remote: Counting objects: 100% (473/473), done.[K
remote: Compressing objects: 100% (293/293), done.[K
remote: Total 473 (delta 258), reused 362 (delta 170)[K
Receiving objects: 100% (473/473), 1.66 GiB | 24.88 MiB/s, done.
Resolving deltas: 100% (258/258), done.
Checking out files: 100% (76/76), done.
/content/csc3002_detecting_hate_speech
/content/csc3002_detecting_hate_speech/Text_Preprocessing
/content


**Cleaned tweet text dataset**

In [10]:
# Preview the first 30 rows of the training frame returned by pre.loadData
train[:30]

Unnamed: 0,id,tweet,label
479,794,germany claims success with program to support migrant returnees,0
3794,2648,many people are deluded to the facts concerning the migrant crisis in europe people think that most are refugees fleeing war and destruction many think that most are syrian others believe that they are all skilled educated and civilised people just trying to find a better life,0
2512,8887,user unless cow tipping is classed as physical assault you should shut your fucking whore mouth i hope you never show your face outside of whatever hole you went to hide in because frankly i would love to drop kick you off a fucking balcony xoxoxo,1
7308,4459,germany moroccan migrant spits on a german woman and gets slapped in the face by a german grocery worker free speech time user israel,1
6183,4059,user democrats and liberals are lying about illegal immigration its time we take back our country send them back indepen ce day happy 4th of juluy fourth of july happy birthday america god bless america trump maga red nation rising,1
427,1862,boat carrying 160 migrants sinks off northern cyprus coast 16 dead,0
3040,782,illegal alien with long criminal record murdered man in north carolina via user build the wall enforce us laws end daca ban sanctuary cities keep americans safe,1
7419,4555,it weirds me out how obsessed right wingers are with pedophiles it seems like they spend a lot of their days trying to figure out how and where people are fucking kids hey guys maybe check out the immigrant concentration camps you might get some hits there if you cared,0
4332,8279,sex t from daddy my little toy which hole shall i stuff slut be right back just being carried away in a tsunami of cunt juice ocean,0
3554,1374,china invites all un countries to use its future space station cbc news united nations china space station,0


In [11]:
#@markdown Tick box if you wish to oversample the hate speech data to correct imbalance in the dataset
OVERSAMPLE = False #@param {type:"boolean"}

#@markdown You can oversample hate speech or non-hate speech data
oversampleLabel = "Hate" #@param ["Hate", "Not Hate"]
#@markdown Choose the number of times over you want the subsample you've selected to be multiplied.
multiplier = 1 #@param {type:"slider", min:1, max:10, step:1}
#@markdown If you just want to balance the class labelling of the set, set multiplier to 1.

#@markdown If the label is set to the majority class and multiplier = 1 then this will result in an undersample

def oversample(train, lab = "Hate", multiplier = 1, random_state = None):
    """Resample one class of ``train`` with replacement and return the shuffled result.

    Rows with ``label == 1`` (when ``lab == "Hate"``) or ``label == 0`` (any other
    ``lab``) are drawn with replacement; the rows of the opposite class are kept
    unchanged and concatenated with the draw.

    Args:
        train: DataFrame with a ``label`` column containing 0/1.
        lab: ``"Hate"`` resamples the positive class, anything else the negative class.
        multiplier: if greater than 1, draw ``len(selected class) * multiplier`` rows.
            Otherwise draw ``len(train) - len(selected class)`` rows, which matches the
            size of the remaining data (an undersample if the selected class is the
            majority).
        random_state: optional integer seed. When given, both the draw and the final
            shuffle use it, making the call self-contained. When omitted, the draw
            uses NumPy's global generator and the shuffle uses the notebook-level
            ``SEED``, exactly as before.

    Returns:
        The resampled, shuffled DataFrame (original index labels are preserved).

    Raises:
        ValueError: if rows must be drawn from a class that has no examples.
    """
    neg_train = train.loc[train['label'] == 0]
    pos_train = train.loc[train['label'] == 1]

    # Whether we're sampling from hate or not hate
    if lab == "Hate":
        aug_set, kept_set = pos_train, neg_train
    else:
        aug_set, kept_set = neg_train, pos_train

    if multiplier > 1:
        # Multiply the selected class by the chosen factor
        n_draws = len(aug_set) * multiplier
    else:
        # Match the size of everything that is not in the selected class
        n_draws = len(train) - len(aug_set)

    if len(aug_set) == 0 and n_draws > 0:
        raise ValueError(
            "Cannot resample label '{}': the data contains no rows of that class".format(lab))

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    ids = np.arange(len(aug_set))
    choices = rng.choice(ids, n_draws)
    aug_train = aug_set.iloc[choices]

    size = len(train)
    train = pd.concat([aug_train, kept_set], axis=0)

    print(len(train)-size , "added tweets\n")

    # Shuffle so the resampled rows are not grouped at the start
    shuffle_seed = SEED if random_state is None else random_state
    train = train.sample(frac = 1, random_state=shuffle_seed)
    return train

# Only resample when the form checkbox above is ticked
if OVERSAMPLE:
  print("OVERSAMPLING DATA")
  train = oversample(train, lab=oversampleLabel, multiplier=multiplier)

# Report the class balance of whatever training frame we ended up with
print("Out of {} tweets in this database, {} are not {}, {} are {}".format(
    len(train.index),
    int((train['label'] == 0).sum()), classification_type,
    int((train['label'] == 1).sum()), classification_type))

Out of 9000 tweets in this database, 5217 are not hate, 3783 are hate


Loading in dev data and specifying global variables

In [12]:
# Clean the development split with the same options used for the training data
dev = pre.loadData(rawDev, options = options, dataset = DATASET)

# Column names read when the frames are converted to BERT InputExamples
DATA_COLUMN = 'tweet'
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

print("Size of training data", len(train.index))
print("Size of development data", len(dev.index), '\n')

Size of training data 9000
Size of development data 1000 



# Setting Up BERT Training 

## Custom BERT Repository For Custom Functionality

Rather than using the official BERT setup, I have instead forked the BERT repo and customised it to allow for a more tailor-made approach when evaluating and fine-tuning the model

The `create_model` function in my BERT repo has been edited to allow for multiple <b>Fine-Tuning</b> strategies. By default, BERT simply trains a single layer on top of the pre-trained model to adapt it to our classification problem. This strategy of taking a pre-trained model and then fine-tuning it is called <b>Transfer Learning</b>.

Also the `model_fn` method in my BERT repo provides far more detailed metrics than just accuracy and loss - which is all the default repo provides. It has metrics such as F-Score, AUC, precision and recall; so I can better analyse the performance of different models

In [13]:
%cd csc3002_detecting_hate_speech
%cd bert
import run_classifier
import optimization
import tokenization
import modeling
#Return to original workspace
%cd ../..

/content/csc3002_detecting_hate_speech
/content/csc3002_detecting_hate_speech/bert

/content


## BERT Preprocessing and Setup
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `tweet` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. HS, Not HS

In [0]:
def _to_input_examples(frame):
  """Wrap every row of ``frame`` in a ``run_classifier.InputExample``.

  The tweet text goes in ``text_a``; ``text_b`` stays empty because this is a
  single-sentence task, and ``guid`` is unused here.
  """
  def _row_to_example(row):
    return run_classifier.InputExample(guid=None,
                                       text_a=row[DATA_COLUMN],
                                       text_b=None,
                                       label=row[LABEL_COLUMN])

  return frame.apply(_row_to_example, axis=1)


# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = _to_input_examples(train)
dev_InputExamples = _to_input_examples(dev)

Next, we need to preprocess our data so that it matches the data BERT was trained on.


1. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
2. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
3. Map our words to indexes using a vocab file that BERT provides
4. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
5. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

Happily, we don't have to worry about most of these details. It's automated with the below inbuilt functions




Below is a way to retrieve the desired BERT parameters, such as its pre-trained checkpoints and its vocab file, from my Google Cloud Storage bucket, where I've downloaded the uncased LARGE version of BERT.

In [15]:
bucket_dir = 'gs://csc3002'
bert_ckpt_dir = os.path.join(bucket_dir, bert_model_name)

# Pick the checkpoint to start from, remembering which directory it came from
if FURTHER_PRETRAINED:
  #For further pretrained model: use the most recent checkpoint in its directory
  further_pretrained_model = os.path.join(bucket_dir, bert_model_name, 'further_pretrained_model1')
  bert_ckpt_file = tf.train.latest_checkpoint(further_pretrained_model)
  _ckpt_source_dir = further_pretrained_model
else:
  bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
  _ckpt_source_dir = bert_ckpt_dir

print("\nUsing BERT checkpoint from directory:", _ckpt_source_dir)
print("\nBERT checkpoint file is:", bert_ckpt_file)

#Setting up BERT config, vocab file and tokenizer - all default from the BERT repo
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
vocab_file = os.path.join(bert_ckpt_dir, "vocab1.txt")

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file)

print("\nMake sure that the function loads a checkpoint, if it doesn't an error will be thrown here")
# latest_checkpoint returns None when no checkpoint is found
assert bert_ckpt_file is not None, "No BERT checkpoint file loaded"

print("\nUsing vocab file:", vocab_file)
print("\nBelow is an example of the BERT tokenizer in action:")
tokenizer.tokenize("This here's an example of using the BERT tokenizer")


Using BERT checkpoint from directory: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1

BERT checkpoint file is: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/model.ckpt-80000


Make sure that the function loads a checkpoint, if it doesn't an error will be thrown here

Using vocab file: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/vocab1.txt

Below is an example of the BERT tokenizer in action:


['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

In [0]:
# Sequence length passed to convert_examples_to_features and the input function builders.
# BERT is limited to 512 tokens in length
MAX_SEQ_LENGTH = 256 #@param {type:"slider", min:128, max:512, step:32}

In [17]:
# Convert our train and dev features to InputFeatures that BERT understands.
# The conversion logs sample examples (tokens, input_ids, input_mask) through TensorFlow logging.
train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
dev_features = run_classifier.convert_examples_to_features(dev_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


INFO:tensorflow:Writing example 0 of 9000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] germany claims success with program to support migrant return ##ees [SEP]
INFO:tensorflow:input_ids: 101 2762 4447 3112 2007 2565 2000 2490 20731 2709 10285 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

## Fine-Tuning Model

I'm using TensorFlow estimators as they're easier to use with TPU distributed training. The BERT tutorials are also demonstrated using estimators.

Below we load in our `RUN_CONFIG` for fine-tuning, where we define our distribution strategy, how often we get summary metrics and how often we checkpoint. We also define our parameters for BERT fine-tuning below, together with our model function, which is defined in our custom BERT repository.

In [0]:
#Set below to a high value if you do not wish to checkpoint the model while training. The train_and_evaluate function below will checkpoint at every evaluation regardless
SAVE_CHECKPOINTS_STEPS = 100000 
#@markdown Summary steps gives us an idea of how the model is performing by returning average loss every nth step. Changing this does not affect performance, however there is overhead if this value is too small.
SUMMARY_STEPS =  100#@param {type:"number"}

In [19]:
# TPU-specific settings are assembled first and then attached to the run configuration
_tpu_settings = tf.contrib.tpu.TPUConfig(
    iterations_per_loop=SUMMARY_STEPS,    #Shows us summary metrics every SUMMARY_STEPS steps
    num_shards=8,
    per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)

RUN_CONFIG = tf.compat.v1.estimator.tpu.RunConfig(
    #I think the output file must be a sub-directory of the main BERT file
    model_dir=OUTPUT_DIR,
    tf_random_seed=SEED,
    cluster=cluster_resolver,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=_tpu_settings)
print(RUN_CONFIG.session_config)

allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.24.148.34:8470"
    }
  }
}
isolate_session_state: true



In [0]:
#@title Fine-Tuning Options
#@markdown Toggle options for fine-tuning model below
FT_MODEL = "Default" #@param ['Default', 'Multi-Layer Perceptron', 'BiLTSM']
#@markdown If oversampling, weighted loss is not necessary. Weights can be class weights or custom, edit within cell to choose
LOSS_FN = "weighted_loss" #@param  ['Default','focal_loss','weighted_loss']
WEIGHTED_BIAS = False 
#@markdown Normalise the pooled [CLS] token for text classification?
NORMALISE_EMBEDDINGS = False #@param {type:"boolean"}
#@markdown Dropout probability of hidden neurons in the fine-tuning stage
DROPOUT = 0.1 #@param {type:"slider", min:0.1, max:0.6, step:0.05}
TRAIN_BATCH_SIZE = 32 #@param {type:"slider", min:16, max:32, step:16}

#Must be set to 8 because on a TPU, model will truncate last few entries in prediction/evaluation if they don't fit in the specified batch size
#As there are 8 TPU cores, this ensures each instance will be attended to
EVAL_BATCH_SIZE = 8 
PREDICT_BATCH_SIZE = 8 

LEARNING_RATE = 0.00002 #@param {type:"slider", min:1e-5, max:5e-5, step:1e-6}
#@markdown The parameters below are not relevant if the FT_MODEL is set to 'Default'
NUM_EXTRA_LAYERS = 2 #@param {type:"slider", min:1, max:3, step:1}
HIDDEN_SIZE = 256 #@param {type:"slider", min:32, max:384, step:4}
# pos / neg are forwarded in FT_PARAMS; presumably the per-class weights used by the
# weighted loss - TODO confirm against the custom BERT repo.
# Alternative kept for reference: derive them from the training class counts.
#pos = train.label.value_counts()[1]
#neg = train.label.value_counts()[0]
pos = 1
neg = 2
# NOTE(review): the element order is presumably what the custom model code expects - confirm before reordering.
FT_PARAMS = [FT_MODEL, LOSS_FN, NUM_EXTRA_LAYERS, HIDDEN_SIZE, pos, neg, WEIGHTED_BIAS, NORMALISE_EMBEDDINGS, DROPOUT]

Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator.

This is a pretty standard design pattern for working with Tensorflow Estimators

In [0]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True,)

# Input function for dev data, we feed in our previously created dev_features for this
test_input_fn = run_classifier.input_fn_builder(
    features=dev_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)

The estimator will not be defined globally, as the parameters we will feed into it will dynamically change depending on the function.

# Cross Validation Evaluation

Does not provide in depth tensorflow logging but it does provide evaluation at the end. We combine the  training and dev data created above and evaluate on a stratified fraction of the combined data for each fold.

This evaluation is used to test if a change in code or addition of a feature has benefited the overall system

In [0]:
#@markdown Cross-Validation Params:
NUM_TRAIN_STEPS = 750 #@param {type:"slider", min:0, max:10000, step:50}
# Number of stratified folds; used as the default `folds` argument of bertCV
FOLDS = 5 #@param {type:"slider", min:3, max:10, step:1}

In [0]:
def bertCV(data, folds = FOLDS):
  """Cross-validate the fine-tuned BERT classifier with stratified k-fold splits.

  For every fold a new TPUEstimator is built, trained for NUM_TRAIN_STEPS steps
  on the training split and evaluated on the held-out split. OUTPUT_DIR is
  deleted and recreated before each fold to make room for the new model, and
  TensorFlow logging is set to ERROR for the duration of the run.

  Args:
    data: DataFrame holding the combined train and dev data. It must provide
      `tweet` and `label` columns (used for stratification) as well as the
      DATA_COLUMN / LABEL_COLUMN columns the examples are built from.
    folds: Number of stratified folds to run.

  Returns:
    pandas Series named 'CV Average' with the column-wise mean of the metrics
    of every recorded fold. Folds in which the model predicted a single class
    are reported on the console but left out of the average.
  """
  #Filter out all log messages so console isn't consumed with memory
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

  #Dataframe where the per-fold results will be stored. Empty to begin with
  eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'] )

  WARMUP_PROPORTION = 0.1
  num_warmup_steps = int(NUM_TRAIN_STEPS * WARMUP_PROPORTION)

  def to_features(frame):
    # Use the InputExample class from BERT's run_classifier code to create
    # examples from the data, then convert them to features BERT can interpret.
    examples = frame.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                 text_a = x[DATA_COLUMN],
                                                                 text_b = None,
                                                                 label = x[LABEL_COLUMN]), axis = 1)
    return run_classifier.convert_examples_to_features(examples, label_list, MAX_SEQ_LENGTH, tokenizer)

  #Stratified K fold ensures the folds are made by preserving the percentage of samples for each class.
  cv = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

  # Sticking within the training dataset for evaluation. Data is the combination of the provided train and dev sets.
  # `k` is the real fold number, so a skipped fold does not shift the labels of later folds.
  for k, (train_index, dev_index) in enumerate(cv.split(data.tweet, data.label), start=1):

    #Shuffling again because otherwise the StratifiedKFold function groups a lot of 0's at the start
    training = data.iloc[train_index].sample(frac = 1, random_state=SEED)
    develop = data.iloc[dev_index].sample(frac = 1, random_state=SEED)

    # Every fold has its own train/dev split, so the features are rebuilt each time.
    train_features = to_features(training)
    dev_features = to_features(develop)

    #Delete prior model graph, checkpoints and eval files to make room for new model each loop.
    #Only a missing directory is tolerated; any other filesystem error is allowed to surface.
    if tf.gfile.Exists(OUTPUT_DIR):
      tf.gfile.DeleteRecursively(OUTPUT_DIR)
    tf.gfile.MakeDirs(OUTPUT_DIR)

    # Model configs
    model_fn = run_classifier.model_fn_builder(
      bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
      num_labels=len(label_list),
      init_checkpoint=bert_ckpt_file,
      learning_rate=LEARNING_RATE,
      num_train_steps=NUM_TRAIN_STEPS,
      num_warmup_steps=num_warmup_steps,
      use_tpu=True,
      ft_params = FT_PARAMS)

    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=RUN_CONFIG,
      train_batch_size=TRAIN_BATCH_SIZE,
      eval_batch_size=EVAL_BATCH_SIZE,
      predict_batch_size=PREDICT_BATCH_SIZE)

    # Create an input function for training. drop_remainder = True for using TPUs.
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)

    #Input function for the held-out split of this fold
    dev_input_fn = run_classifier.input_fn_builder(
        features=dev_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=True)

    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=NUM_TRAIN_STEPS )
    train_time = datetime.now() - current_time

    #You need to provide number of steps for a TPU
    eval_steps = len(develop) // EVAL_BATCH_SIZE

    #Eval may be slightly WRONG on the TPU because it will truncate the last batch.
    eval_results = estimator.evaluate(input_fn=dev_input_fn, steps=eval_steps)

    row = pd.Series({'F1 Score': eval_results['F1_Score'],
                     'auc': eval_results['auc'],
                     'Accuracy': eval_results['eval_accuracy'] * 100,
                     'Precision': eval_results['precision'],
                     'Recall': eval_results['recall'],
                     'False Negatives': eval_results['false_negatives'],
                     'False Positives': eval_results['false_positives'],
                     'True Negatives': eval_results['true_negatives'],
                     'True Positives': eval_results['true_positives'],
                     'Training Time': train_time},
                    name = 'Fold ' + str(k))

    # A bad fold can yield a model that predicts only one class. That is not
    # representative of normal performance and drags the CV score down, so such
    # folds are not recorded. The model predicts a single class exactly when it
    # emits no positive predictions (TP + FP == 0) or no negative predictions
    # (TN + FN == 0); merely having zero false positives or zero false
    # negatives is not enough to discard a fold.
    predicted_pos = eval_results['true_positives'] + eval_results['false_positives']
    predicted_neg = eval_results['true_negatives'] + eval_results['false_negatives']
    if predicted_pos < 1 or predicted_neg < 1:
      print("Classifier predicts one class. Thus not recording this metric as it will skew CV\n")
      continue

    eval_df = eval_df.append(row)
    print("Fold " + str(k) + ":\tF-Score:", row["F1 Score"])
    print("Training took time ", train_time)
    print('---------------------------------------------------------------------------------------------------------\n')

  row = eval_df.mean(axis = 0)
  row = pd.Series(row, name = 'CV Average')
  eval_df = eval_df.append(row)
  print("\nTraining Batch Size: ", TRAIN_BATCH_SIZE, "\tLearn Rate: ", LEARNING_RATE, "\tNumSteps: ", NUM_TRAIN_STEPS)
  display(eval_df)

  return row # Also return row of CV-Average

#### Cross-Validation of Cross-Validation

TensorFlow 1.x is non-deterministic, so the variability between runs has been greater than the difference in performance gained by introducing different configurations and parameters. This makes it difficult to determine the best pre-training, text preprocessing and fine-tuning pipeline.

To better ensure the reliability of experiments my solution is to have a 5 fold cross-validation of a cross-validated sample of my data which will reduce the variance run to run significantly.

In [0]:
data = pre.loadData(rawTrain, rawDev, options = options, dataset = DATASET)

#Stratified K fold ensures the folds are made by preserving the percentage of samples for each class.
folds = 5
cv = StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)
eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'])

#This will be a 5-fold CV so the sample each time will be a fifth of the data.
#`i` numbers the outer samples so every row gets its own 'CV Average<i>' label.
#The unused training indices are bound to a regular name rather than `__`,
#which IPython reserves for its output history.
for i, (_unused_index, data_index) in enumerate(cv.split(data.tweet, data.label), start=1):
  dat = data.iloc[data_index]
  CV_Av = bertCV(dat)
  CV_Av = pd.Series(CV_Av, name = 'CV Average' + str(i))
  eval_df = eval_df.append(CV_Av)

#Average the inner CV averages and add them to the shared results table under
#the label of the configuration being tested (edit the name for each experiment).
row = eval_df.mean(axis = 0)
row = pd.Series(row, name = '2MLP + 0.3 dropout')
eval_df1 = pd.read_csv('gs://csc3002/hateval2019/models_eval_df.csv', sep=',',  index_col = 0, encoding = 'utf-8')
eval_df1 = eval_df1.append(row)
eval_df1.to_csv('gs://csc3002/hateval2019/models_eval_df.csv', sep=',',  index = True, encoding = 'utf-8')
eval_df1



Fold 1:	F-Score: 0.7507330775260925
Training took time  0:08:09.590302
---------------------------------------------------------------------------------------------------------




# Running Evaluation Whilst Training - Early Stopping hooks

Below is a custom function to run evaluation on the fine-tuned BERT model whilst training. The `tf.estimator.train_and_evaluate` function for TensorFlow estimators doesn't support early stopping hooks with a distributed TPU strategy as of writing.

Thus, instead of being able to evaluate the model in memory, we must save the model graph and metadata to a checkpoint and reload it every n steps we want to run an evaluation.

The function finds the optimum number of steps the fine-tuning should run for based upon F1 Score by testing the trained model to that point against the previously created dev set.


<b>First, set up the params and the function that dynamically loads the global step from the checkpoint dir</b>

In [0]:
# `train_steps` is deliberately huge: the model should keep training until the
# early stop finds the optimum, which happens long before this step is reached.
# The feature counts come from the train/dev feature lists built earlier.
params = dict(
    train_steps=1500000,
    num_train_features=len(train_features),
    num_eval_features=len(dev_features),
)

          
def load_global_step_from_checkpoint_dir(checkpoint_dir):
  """Return the global step stored in the newest checkpoint of `checkpoint_dir`.

  Args:
    checkpoint_dir: Directory searched with tf.train.latest_checkpoint.

  Returns:
    The stored global step as an int, or 0 when the directory holds no
    checkpoint or the checkpoint cannot be read.
  """
  try:
    latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_ckpt is None:
      # Nothing has been saved yet, so training starts from step 0.
      return 0
    checkpoint_reader = tf.train.NewCheckpointReader(latest_ckpt)
    return int(checkpoint_reader.get_tensor(tf.GraphKeys.GLOBAL_STEP))
  except Exception:
    # Best effort: an unreadable checkpoint is treated like a missing one.
    # Unlike a bare `except:`, this lets KeyboardInterrupt / SystemExit through.
    return 0

<b>Train and evaluate function </b>

In [0]:
def _restore_best_checkpoint(current_step, best_model):
  """Point the remote `checkpoint` index file at the best step and return its path.

  The first occurrence of `current_step` in the index file is replaced by
  `best_model`, so tf.train.latest_checkpoint(OUTPUT_DIR) resolves to the best
  model instead of the last one written.

  Raises:
    AssertionError: if no checkpoint can be resolved after the edit.
  """
  # Remotely edit the protocol buffer file so best model step is loaded
  storage_client = storage.Client()
  bucket = storage_client.get_bucket(BUCKET)
  # NOTE(review): `output_dir` (lower case) is not a parameter here; it is
  # presumably a module-level, bucket-relative path defined elsewhere - confirm.
  blob = bucket.get_blob(os.path.join(output_dir, 'checkpoint'))
  temp_str = blob.download_as_string().decode()
  # NOTE(review): this replaces the first occurrence of the step digits anywhere
  # in the file, which assumes they first appear in the checkpoint name.
  temp_str = temp_str.replace(str(current_step), str(best_model), 1)
  blob.upload_from_string(temp_str.encode())
  new_ckpt_file = tf.train.latest_checkpoint(OUTPUT_DIR)

  assert new_ckpt_file is not None, "File was not edited correctly"
  return new_ckpt_file


def train_and_evaluate(out_dir, params, steps_per_eval, eval_after_step, stop_after_iter, metric):
  """Train on the TPU with periodic evaluation and stop once `metric` stalls.

  The model is trained up to `eval_after_step`, then evaluated every
  `steps_per_eval` steps with the global `test_input_fn`. Training ends when
  `stop_after_iter` consecutive evaluations fail to improve on the best score,
  or when params['train_steps'] is reached. OUTPUT_DIR is deleted and recreated
  first so consecutive runs do not need a runtime reset.

  Args:
    out_dir: Directory whose latest checkpoint supplies the starting step.
    params: Dict; 'train_steps' is the maximum number of steps and
      'num_eval_features' is used to derive the number of evaluation steps.
    steps_per_eval: Training steps between evaluations after `eval_after_step`.
    eval_after_step: Step at which the first evaluation takes place.
    stop_after_iter: Number of consecutive non-improving evaluations that
      triggers the early stop.
    metric: Key of the evaluation results to optimise. "eval_loss" is
      minimised; every other metric is maximised.

  Returns:
    Tuple (new_ckpt_file, best_model): the checkpoint path resolved after the
    index file was pointed at the best step, and that best step.
  """
  # Function-scope import: no module-level `import time` is visible in this
  # part of the notebook, and a repeated import is harmless.
  import time

  #Delete prior model graph, checkpoints and eval files to enable consecutive runs, rather than resetting runtime.
  #Only a missing directory is tolerated; any other filesystem error is allowed to surface.
  if tf.gfile.Exists(OUTPUT_DIR):
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  tf.gfile.MakeDirs(OUTPUT_DIR)

  #Cannot be dynamically set so we'll set warmup to these static values, should be around 8 - 12% of steps.
  #Compared case-insensitively because the dataset name is spelled "HatEval" elsewhere in the notebook.
  if DATASET.lower() == 'hateval':
    num_warmup_steps = 70
  else:
    num_warmup_steps = 300

  max_steps = params['train_steps']

  model_fn = run_classifier.model_fn_builder(
    bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
    num_labels=len(label_list),
    init_checkpoint=bert_ckpt_file,
    learning_rate=LEARNING_RATE,
    num_train_steps=NUM_TRAIN_STEPS,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    ft_params = FT_PARAMS)

  estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=RUN_CONFIG,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

  # load last checkpoint step and start from there
  current_step = load_global_step_from_checkpoint_dir(out_dir)

  print('\nMax number of training steps is {:d}. Current step {:d}.'\
        .format(max_steps , current_step))

  start_timestamp = time.time()  # This time will include compilation time
  minimise = metric == "eval_loss"  # a loss improves by getting smaller
  best_score = float('inf') if minimise else 0
  best_model = 0
  score_buffer = []  # scores of the evaluations since the last improvement
  while current_step < max_steps:
    # Train for up to steps_per_eval number of steps.
    # At the end of training, a checkpoint will be written to --model_dir.
    if current_step < eval_after_step:
      next_checkpoint = eval_after_step
    else:
      next_checkpoint = min(current_step + steps_per_eval, max_steps)
    estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
    current_step = next_checkpoint
    print('\nFinished training up to step {:d}. Elapsed seconds {:d}.\n'.format(
                    next_checkpoint, int(time.time() - start_timestamp)))

    print('\nStarting to evaluate at step {:d} \n'.format(next_checkpoint))
    eval_results = estimator.evaluate(
      input_fn=test_input_fn,
      steps=params['num_eval_features'] // EVAL_BATCH_SIZE)
    print('\nEval results at step {:d}: \n'.format(next_checkpoint), eval_results)
    print('\n')

    current_score = eval_results[metric]
    if minimise:
      improved = current_score < best_score
    else:
      improved = current_score > best_score

    if improved:
      best_score = current_score
      best_model = current_step
      score_buffer = [] #Reset buffer
    else:
      score_buffer.append(current_score)

    #If `stop_after_iter` evaluations in a row haven't improved; we stop training
    if len(score_buffer) >= stop_after_iter:
      elapsed_time = int(time.time() - start_timestamp)

      print('\nFinished training at step {:d} as there has been no improvement on the previous {:d} iterations'.format(current_step, stop_after_iter),
      '\nElapsed seconds {:d}. \n'.format(elapsed_time),
      "\nBest model is at step {:d} with the best {} {:f}".format(best_model, metric, best_score))

      return _restore_best_checkpoint(current_step, best_model), best_model

  elapsed_time = int(time.time() - start_timestamp)
  print('\nFinished training up to step {:d}. Elapsed seconds {:d}. \n'.format(max_steps, elapsed_time))
  return _restore_best_checkpoint(current_step, best_model), best_model

Now run the train_and_evaluate function. We can toggle the steps_per_eval in the params to control how often we checkpoint and evaluate

In [0]:
#@title `train_and_evaluate` Params:
# The four values below are passed to train_and_evaluate() in the next cell.
#@markdown The metric we evaluate on:
metric = "F1_Score" #@param ["F1_Score", "auc", "eval_loss", "eval_accuracy"]
#@markdown The step to begin evaluation at:
eval_after_step = 600 #@param {type:"slider", min:100, max:10000, step:100}
#@markdown After eval_after_step value, how often we evaluate model:
steps_per_eval = 100 #@param {type:"slider", min:50, max:1000, step:50}
#@markdown If no increase in metric after this many iterations, we stop function:
stop_after_iter = 2 #@param {type:"slider", min:1, max:10, step:1}


In [0]:
# Run the early-stopping training loop with the settings chosen in the form above.
new_ckpt_file, best_model = train_and_evaluate(
    out_dir=OUTPUT_DIR,
    params=params,
    steps_per_eval=steps_per_eval,
    eval_after_step=eval_after_step,
    stop_after_iter=stop_after_iter,
    metric=metric)
print("\nBest checkpoint for model is at", new_ckpt_file)
# The recommendation adds 20% to the best step found.
print("If training from scratch below - train with both train & dev set - recommended steps for training is",
      int(best_model + best_model * 0.2))


train_batch_size=32  eval_batch_size=8  max_steps=1500000
INFO:tensorflow:Using config: {'_model_dir': 'gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model1/output1', '_tf_random_seed': 3060, '_save_summary_steps': 100, '_save_checkpoints_steps': 100000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.93.26.106:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f746f5d86a0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://1

# Testing Model



In [0]:
#@title  Final Train + Test Params:
#@markdown - <b>JUST_PREDICT:</b> If you wish to use already trained model, which is the best checkpoint found in `train_and_evaluate()` if it has been executed.

#@markdown - <b>TRAIN_FROM_SCRATCH:</b> If you want to build new model from scratch - using data from both train and dev data for training - then set NUM_TRAIN_STEPS to recommended value printed above.
OPTION = "TRAIN_FROM_SCRATCH" #@param ["TRAIN_FROM_SCRATCH", "JUST_PREDICT"]

#@markdown Below params only necessary if training from scratch
# NOTE: these re-assign the LEARNING_RATE and NUM_TRAIN_STEPS globals set in earlier cells.
LEARNING_RATE = 0.00002 #@param {type:"slider", min:1e-5, max:5e-5, step:1e-6}
NUM_TRAIN_STEPS = 1050 #@param {type:"slider", min:0, max:10000, step:50}

#@markdown Tick box if you wish to oversample the hate speech data to correct imbalance in the dataset
OVERSAMPLE = False #@param {type:"boolean"}

#@markdown You can oversample hate speech or non-hate speech data
oversampleLabel = "Hate" #@param ["Hate", "Not Hate"]
#@markdown Choose the number of times over you want the subsample you've selected to be multiplied.
multiplier = 2 #@param {type:"slider", min:1, max:10, step:1}
#@markdown If you just want to balance the class labelling of the set, set multiplier to 1. If the label is set to the majority class and multiplier = 1 then this will result in an undersample

def oversample(train, lab = "Hate", multiplier = 1):
    """Resample one class of `train` with replacement and return the shuffled result.

    Args:
        train: DataFrame with a `label` column (1 = hate, 0 = not hate).
        lab: "Hate" resamples the label-1 rows; any other value resamples the
            label-0 rows.
        multiplier: When greater than 1, the chosen class is drawn
            `multiplier` times its own size. Otherwise it is drawn to the size
            of the rest of the data, which balances the two classes (and
            undersamples when the chosen class is the majority).

    Returns:
        DataFrame made of the resampled class plus the untouched other class,
        shuffled with SEED.
    """
    pos_train = train.loc[train['label'] == 1]
    neg_train = train.loc[train['label'] == 0]

    # Pick the class that is resampled and the class that is kept as-is.
    if lab == "Hate":
        aug_set, kept_set = pos_train, neg_train
    else:
        aug_set, kept_set = neg_train, pos_train

    positions = np.arange(len(aug_set))

    if multiplier > 1:
        # Multiply the chosen class by the requested factor.
        n_draws = len(aug_set) * multiplier
    else:
        # Match the size of everything that is not in the chosen class.
        n_draws = len(train) - len(aug_set)
    choices = np.random.choice(positions, n_draws)

    original_size = len(train)
    resampled = pd.concat([aug_set.iloc[choices], kept_set], axis=0)

    print(len(resampled) - original_size, "added tweets\n")
    # Shuffle so the resampled rows are not grouped together.
    return resampled.sample(frac = 1, random_state=SEED)

# Announce that the training data will be oversampled when it is loaded in the next cell.
if OVERSAMPLE == True:
  print("OVERSAMPLING DATA")


Loading in train and test data...

<i>N.B Edit words array within this cell if you'd like to oversample based on words</i>

In [33]:
# Training data: the raw train and dev sets loaded together, optionally oversampled.
train = pre.loadData(rawTrain, rawDev, options = options, dataset = DATASET)
if OVERSAMPLE == True:
  train = oversample(train, oversampleLabel, multiplier)

print("\nOut of {} tweets in the training database, {} are not {}, {} are {}".format(
    len(train.index),
    (train['label'] == 0).sum(), classification_type,
    (train['label'] == 1).sum(), classification_type))

# Test data.
test = pre.loadData(rawTest, options = options, dataset = DATASET)

# The dataset authors have identified some duplicates in the set, so drop repeated tweets.
test.drop_duplicates(subset = "tweet", inplace = True)

print("\nOut of {} tweets in the testing database, {} are not {}, {} are {}".format(
    len(test.index),
    (test['label'] == 0).sum(), classification_type,
    (test['label'] == 1).sum(), classification_type))
test.head()


Out of 10000 tweets in the training database, 5790 are not hate, 4210 are hate

Out of 2971 tweets in the testing database, 1714 are not hate, 1257 are hate


Unnamed: 0,id,tweet,label
2317,31035,anyone whoever doubted louis and said he couldnt sing go listen to back to you and if u dont change ur mind ur just a bitter bitch,0
2867,34114,user bitch i was fuckn up till 430 but your hoe ass didnt text me back rage,1
1378,34255,this is nancynancy called my pay raise crumbs nancy doesnt want to fund the military nancy puts illegal aliens rights before citizen rightsnancy wants to house the illegals before our homeless veterans dont be a nancyuser trump wednesday wisdom maga,0
2281,34280,user well bitch tell me how you download viss,0
1481,33582,tx man arrested trying to get into house with knife heriberto coronado 28 is alleged to have held a knife to a female victims throat at one point as well he was also named in a detainer on an immigration charge deport them all,0


Function to get predictions on test data

In [0]:
def getPrediction(in_sentences):
  """Predict a 0/1 label for every sentence with the global `estimator`.

  The input function is built with drop_remainder=True, which would silently
  discard a trailing partial batch and return fewer predictions than inputs.
  The sentences are therefore padded with empty strings up to a multiple of
  PREDICT_BATCH_SIZE and the predictions for the padding are thrown away.

  Args:
    in_sentences: Iterable of tweet strings (for example a pandas Series).

  Returns:
    List with one predicted label (0 or 1) per input sentence, in input order.
  """
  sentences = list(in_sentences)
  num_actual = len(sentences)
  if num_actual == 0:
    return []

  #Makes output less verbose
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
  try:
    # Pad so that every real sentence lands in a full batch.
    num_padding = (-num_actual) % PREDICT_BATCH_SIZE
    sentences.extend([""] * num_padding)

    # guid "" and label 0 are dummy values; only the text matters for prediction
    input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in sentences]
    input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)

    # Keep only the predictions that belong to real sentences.
    predictions = list(estimator.predict(predict_input_fn))[:num_actual]

    # Assign the label with the highest probability (ties go to label 1).
    return [0 if p['probabilities'][0] > p['probabilities'][1] else 1 for p in predictions]
  finally:
    # Reset tensorflow verbosity to normal, even if prediction failed
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

Converting to features, setting run and model configs.

Then training on train and dev set and predicting on unseen test set 

In [35]:
# `train_time` stays None when no training happens (OPTION == "JUST_PREDICT"),
# so the metrics summary further down can always reference it.
train_time = None

if OPTION == 'TRAIN_FROM_SCRATCH':

  # Initialise from the newest checkpoint found in the directory obtained by
  # stripping 'output1' from OUTPUT_DIR.
  bert_ckpt_file = tf.train.latest_checkpoint(OUTPUT_DIR.replace('output1', ''))
  train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None,
                                                                        text_a = x[DATA_COLUMN],
                                                                        text_b = None,
                                                                        label = x[LABEL_COLUMN]), axis = 1)

  train_features =  run_classifier.convert_examples_to_features(train_InputExamples, label_list, \
                                                                MAX_SEQ_LENGTH, tokenizer)

  #Delete prior model graph, checkpoints and eval files to make room for the new model.
  #Only a missing directory is tolerated; any other filesystem error is allowed to surface.
  if tf.gfile.Exists(OUTPUT_DIR):
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  tf.gfile.MakeDirs(OUTPUT_DIR)

elif OPTION == 'JUST_PREDICT':
  # Reuse the best checkpoint returned by train_and_evaluate()
  bert_ckpt_file = new_ckpt_file


# Compute # warmup steps
num_warmup_steps = int(NUM_TRAIN_STEPS * 0.1)

# Model configs

model_fn = run_classifier.model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=NUM_TRAIN_STEPS,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  ft_params = FT_PARAMS)

estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
  use_tpu=True,
  model_fn=model_fn,
  config=RUN_CONFIG,
  train_batch_size=TRAIN_BATCH_SIZE,
  eval_batch_size=EVAL_BATCH_SIZE,
  predict_batch_size=PREDICT_BATCH_SIZE)

# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)


if OPTION != "JUST_PREDICT":

  print("\nThe model will stop training when it reaches", NUM_TRAIN_STEPS, "as a checkpoint")
  print('Beginning Training!')
  current_time = datetime.now()
  estimator.train(input_fn=train_input_fn, max_steps=NUM_TRAIN_STEPS)
  train_time = datetime.now() - current_time
  print("Training took time ", train_time)

#Run testing
predictions = getPrediction(test.tweet)
if DATASET == "HatEval":

  # Labelled test set: store the predictions and report the metrics
  test['predictions'] = predictions
  test.to_csv('gs://csc3002/hateval2019/predictions.csv', sep=',',  index = True, encoding = 'utf-8')

  row = pd.Series({'F1 Score': metrics.f1_score(test.label, test.predictions),\
                   "Macro F1 Score": metrics.f1_score(test.label, test.predictions, average = 'macro'),\
                   'auc': metrics.roc_auc_score(test.label, test.predictions),\
                   'Accuracy': metrics.accuracy_score(test.label, test.predictions),\
                   'Precision': metrics.precision_score(test.label, test.predictions),\
                   'Recall': metrics.recall_score(test.label, test.predictions),\
                   'Training Time': train_time, 'steps':  NUM_TRAIN_STEPS})

  display(row)

elif DATASET == "AnalyticsVidhya":
  # The predictions become the `label` column of the submission file
  test['label'] = predictions
  print(test.label.value_counts())
  print(predictions[0:20])
  test.drop(columns = ['tweet'], axis = 1,inplace = True)
  test.to_csv('gs://csc3002/trial/submission.csv', sep=',', index = False)


INFO:tensorflow:Writing example 0 of 10000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] drake i love you we should date rihanna the savage [SEP]
INFO:tensorflow:input_ids: 101 7867 1045 2293 2017 2057 2323 3058 25439 1996 9576 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

F1 Score                0.648506
Macro F1 Score          0.516756
auc                     0.609023
Accuracy                0.552676
Precision               0.485737
Recall                  0.975338
Training Time     0:11:03.402173
steps                       1050
dtype: object

## Error Analysis

NOTE: Not available for the AnalyticsVidhya dataset, as its test set is unlabelled

In [36]:
# Peek at the first 40 test rows to compare `label` with `predictions` side by side.
test.head(40)

Unnamed: 0,id,tweet,label,predictions
2317,31035,anyone whoever doubted louis and said he couldnt sing go listen to back to you and if u dont change ur mind ur just a bitter bitch,0,0
2867,34114,user bitch i was fuckn up till 430 but your hoe ass didnt text me back rage,1,1
1378,34255,this is nancynancy called my pay raise crumbs nancy doesnt want to fund the military nancy puts illegal aliens rights before citizen rightsnancy wants to house the illegals before our homeless veterans dont be a nancyuser trump wednesday wisdom maga,0,1
2281,34280,user well bitch tell me how you download viss,0,1
1481,33582,tx man arrested trying to get into house with knife heriberto coronado 28 is alleged to have held a knife to a female victims throat at one point as well he was also named in a detainer on an immigration charge deport them all,0,0
408,30771,user whatever it takes take the money out of welfare payments to the lazy and use that take it from aid to mexico and south and central america just build that wall,1,1
2370,30691,this bitch got one time to fuck wit me or my dad n ima let her whore ass know,1,1
248,32096,ban rape fugees ban islam in the us ten year old german boy raped by migrant classmates on school trip via user,1,1
2861,31583,when hoes talk shit about me i started to get mad like hell bitch i can make your life like a living hell dont play with me,1,1
2200,34306,and anyone who says otherwise is either a grade a asshole or has no idea what they are talking about being depressed is a bitch but,0,0


### False Negatives

There are far fewer false negatives than false positives

In [0]:
def get_missed_HS(df):
  """Return the false negatives: rows labelled 1 that were predicted as 0."""
  is_hate = df["label"] == 1
  predicted_not_hate = df["predictions"] == 0
  return df.loc[is_hate & predicted_not_hate]

def get_misclassified_HS(df):
  """Return the false positives: rows labelled 0 that were predicted as 1."""
  is_not_hate = df["label"] == 0
  predicted_hate = df["predictions"] == 1
  return df.loc[is_not_hate & predicted_hate]
#Perhaps having a separate function for both is a good idea

# Save every false negative, then show a random sample of them.
df = get_missed_HS(test)
df.to_csv('gs://csc3002/hateval2019/falsenegatives.csv', sep=',',  index = True, encoding = 'utf-8')
# DataFrame.sample raises a ValueError when asked for more rows than exist,
# so the sample size is capped at the number of false negatives.
df.sample(min(50, len(df)))

Unnamed: 0,id,tweet,label,predictions
2580,31940,user another son of a dirty fucking korean whore bitch,1,0
57,30401,the other invasion according to homeland security stats more than 600k aliens overstayed us visas in 2017 deport ice send them home,1,0
54,30321,stephen miller the public charge law has been on the books for 100 years it prevents immigrants from burdening the us taxpayer if they cant take care of themselvers or their kids send them back trump maga red nation rising,1,0
152,33854,multuser it may have to do with plain honest speech so many politicians have the political speech bug theyre accustomed to stating what people want to hear not what they need to like saying it is fucktard ed to believe the mass refugee rape fuge e immigration was good is idiodic,1,0
2681,30599,morning twitter is being anal and trying to be a safe space for terminally offended dont say cunt fuck bitch wanker cunts,1,0
98,33193,everyones condemning miller with uncles article yet no one mentioning in 1st para he set foot on ellis island on january 7 1903 he came sent for family legally big differences than people overstaying visas lying to get in and crashing borders send them back,1,0
148,31745,user hes wanted for murder but who exactly was paying for their medical bills taxpayers yet funds to support tax payers is running out send them back,1,0
22,32754,navy rescued manchester bomber salman abedi from war torn libya three years before deadly terror attack via user west is rescuing people that will kill them send them home bombing attack have some dignity and throw them out,1,0
2740,30844,bitch whore son of bitch shit pig dinosaur elephant rhino monkey dog,1,0
261,30542,multuser there are no innocent people in detention centres send them back,1,0


### False Positives

By far the most misclassifications are false positives (over 90%)

In [0]:
# Save every false positive, then show a random sample of them.
df = get_misclassified_HS(test)
df.to_csv('gs://csc3002/hateval2019/falsepositives.csv', sep=',',  index = True, encoding = 'utf-8')
# DataFrame.sample raises a ValueError when asked for more rows than exist,
# so the sample size is capped at the number of false positives.
df.sample(min(50, len(df)))

Unnamed: 0,id,tweet,label,predictions
1378,34255,this is nancynancy called my pay raise crumbs nancy doesnt want to fund the military nancy puts illegal aliens rights before citizen rightsnancy wants to house the illegals before our homeless veterans dont be a nancyuser trump wednesday wisdom maga,0,1
2281,34280,user well bitch tell me how you download viss,0,1
2336,32255,user your shoes are ugly and anarcho primitivism sucks bitch,0,1
1160,30974,multuser this is a pretty sight build that wall,0,1
2173,30707,im tryna go ghost n come back a whole new bitch like bam,0,1
1790,31768,hate seeing articles about what men should or should not weardo to whoever comes up with this sht bitch mind your own business,0,1
1032,30385,what is the thinking behind veterans against trump potus has reformed improved the va while hundreds died during obama reign waiting for treatmentthey can think what they want but lets be rational maga kag potus trump news vote red no daca walk away red wave,0,1
1551,33424,yall you a hoe hoe me thats hoe culture to you bitch,0,1
1230,32576,user i have an idea build that wall detention centers would be reduced dramatically if not eliminated problem solved,0,1
1653,32425,user bitch you the one that said you wanted to play,0,1


 ### Inspecting textual content of datasets

In [0]:
# Split the training and test sets by class.
hatetrain = train.loc[train['label'] == 1]
nothatetrain = train.loc[train['label'] == 0]
hatetest = test.loc[test['label'] == 1]
nothatetest = test.loc[test['label'] == 0]

# Misclassified test rows.
falsepos = get_misclassified_HS(test)
falseneg = get_missed_HS(test)

# Order matters: containsWordPercent labels its rows by position in this list.
dfs = [train, hatetrain, nothatetrain, test, hatetest, nothatetest, falsepos, falseneg]

#### % of tweets containing a particular word/phrase

Use function below to get a percent of tweets within a dataset containing a certain term

In [0]:
def containsWordPercent(word, dfs, saveresult = False):
  """Display how many tweets in each dataset contain `word` (count and percentage).

  Args:
    word: Term to look for in the `tweet` column. It is handed unchanged to
      `Series.str.contains`, so it is interpreted as a regular expression and
      matches anywhere inside a tweet (substring match, not whole-word).
    dfs: List of DataFrames that have a `tweet` column. The first eight are
      labelled "Train", "Train HS", ..., "False Negatives" by position; any
      additional frames are labelled "Dataset <position>".
    saveresult: When truthy, the table is also written to
      gs://csc3002/hateval2019/<word>percent.csv.

  Returns:
    None. The summary table is rendered with `display`.
  """
  inds = ["Train", "Train HS", "Train ~HS", "Test", "Test HS", "Test ~HS", "False Positives", "False Negatives"]
  count_col = "No. of Rows Containing {}".format(word)
  perc_col = "% of Rows Containing {}".format(word)

  labels = []
  records = []
  for i, df in enumerate(dfs):
    total = len(df.index)
    wordnum = int(df.tweet.str.contains(word).sum())
    # An empty dataset has nothing to match: report 0% instead of dividing by zero.
    wordperc = wordnum / total * 100 if total else 0.0

    # Fall back to a generic label rather than raising IndexError when more
    # frames are passed than there are predefined names.
    labels.append(inds[i] if i < len(inds) else "Dataset {}".format(i + 1))
    records.append({count_col: wordnum, perc_col: wordperc})

  # Build the table once from the collected rows. Appending row by row copied
  # the whole frame on every iteration, and DataFrame.append is gone in pandas 2.x.
  finaldf = pd.DataFrame(records, index = labels, columns = [count_col, perc_col])
  finaldf[count_col] = finaldf[count_col].astype(int)

  display(finaldf)

  if saveresult:
    finaldf.to_csv('gs://csc3002/hateval2019/{}percent.csv'.format(word), sep=',',  index = True, encoding = 'utf-8')


# Count and percentage of tweets containing "maga" in each frame listed in `dfs`
containsWordPercent("maga", dfs)

Unnamed: 0,No. of Rows Containing maga,% of Rows Containing maga
Train,288,2.88
Train HS,249,5.914489
Train ~HS,39,0.673575
Test,374,12.466667
Test HS,107,8.492063
Test ~HS,267,15.344828
False Positives,267,18.619247
False Negatives,0,0.0


#### Most Common Words in Dataset
The function below shows the most common words, excluding stopwords.

If fewer than `n` words are returned, increase `maxval` in the function `n_mostCommonWords()`.


In [0]:
from collections import Counter

def n_mostCommonWords(n, df, maxval = 100):
  """Display the most frequent non-stopword tokens in `df["tweet"]`.

  The tweets are joined and split on whitespace, the `maxval` most frequent
  tokens are taken, and every token for which `pre.remove_stopwords` returns
  an empty string is dropped. At most `n` of the remaining tokens are shown.

  Args:
    n: Maximum number of words to show.
    df: DataFrame with a `tweet` column of strings.
    maxval: How many of the most frequent tokens to consider before stopword
      filtering. Raise it if fewer than `n` words come back.

  Returns:
    None. A table with the columns `words`, `count` (token occurrences) and
    `% of tweets containing word` is rendered with `display`. The percentage
    is a substring match, so e.g. "user" is also counted inside "multuser".
  """
  commonwords = Counter(" ".join(df["tweet"]).split()).most_common(maxval)

  # Keep each surviving word together with its own count so the two columns
  # cannot drift apart when remove_stopwords alters or drops a token.
  kept = []
  for token, count in commonwords:
    cleaned = pre.remove_stopwords(token)
    if cleaned != '':
      kept.append((cleaned, count))
  kept = kept[:n]

  finaldf = pd.DataFrame(columns = ['words', 'count'])
  finaldf['words'] = [w for w, _ in kept]
  finaldf['count'] = [c for _, c in kept]

  # Tokens come straight from the data, so match them literally: a token such
  # as "c++" or "(" is not a valid regular expression.
  total = len(df.index)
  percentlist = [(df.tweet.str.contains(w, regex=False).sum()/total) * 100 for w, _ in kept]
  finaldf["% of tweets containing word"] = percentlist

  display(finaldf)

# Show up to 20 of the most common non-stopword tokens among the false positives
n_mostCommonWords(20, falsepos)

Unnamed: 0,words,count,% of tweets containing word
0,user,640,54.588235
1,wall,567,36.54902
2,build,511,35.921569
3,bitch,489,37.411765
4,multuser,315,20.784314
5,maga,261,20.862745
6,daca,220,15.921569
7,trump,210,16.392157
8,america,191,15.843137
9,illegal,190,12.313725


# Using Tensorboard to Get Deeper Insight

In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip  

--2020-04-21 13:33:21--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.73.195.55, 34.192.215.160, 52.3.53.111, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.73.195.55|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13773305 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2020-04-21 13:33:21 (28.3 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13773305/13773305]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [0]:
def get_tensorboard(path_to_event_file = OUTPUT_DIR):
  """Start TensorBoard and an ngrok tunnel to it, then print the tunnel's public URL.

  TensorBoard is launched in the background on port 6006 reading from
  `path_to_event_file`, and `./ngrok http 6006` is launched in the background
  to expose that port. The public URL is read from ngrok's local API at
  http://localhost:4040/api/tunnels.

  Args:
    path_to_event_file: Directory passed to `tensorboard --logdir`.

  Returns:
    None. The public URL (or a hint if no tunnel came up) is printed.
  """
  import json
  import time
  import urllib.request

  get_ipython().system_raw(
      'tensorboard --logdir {} --host 0.0.0.0 --port 6006 --reload_multifile=true &'
      .format(path_to_event_file))

  get_ipython().system_raw('./ngrok http 6006 &')

  # ngrok is started in the background, so its local API may not be listening
  # yet (or may not list a tunnel yet). Poll for a short while instead of
  # querying exactly once and failing on the race.
  attempts = 20
  delay_seconds = 0.5
  for _ in range(attempts):
    try:
      with urllib.request.urlopen('http://localhost:4040/api/tunnels') as response:
        tunnels = json.load(response)['tunnels']
      if tunnels:
        print(tunnels[0]['public_url'])
        return
    except (OSError, ValueError, KeyError):
      # Connection refused / incomplete response: ngrok is still starting up.
      pass
    time.sleep(delay_seconds)

  print('No ngrok tunnel reported by http://localhost:4040/api/tunnels; check that ./ngrok is running.')

get_tensorboard(OUTPUT_DIR)

https://828d683a.ngrok.io
