In [0]:
# Copyright 2019 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<a href="https://colab.research.google.com/github/kpe/bert-for-tf2/blob/master/examples/tpu_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting Hate Speech Tweets with BERT

We are using bert-tensorflow for this classification task. For now I'm pinning TensorFlow to version 1.x because TensorFlow 2 currently gives issues with BERT. I believe TensorFlow hopes to have this issue resolved in v2.1.

We are using a TPU because a GPU does not have the memory required for the Large BERT models - it can only cope with the base model. We'll check that a TPU is detected and store its address in a global variable so it can be accessed by our BERT functions later.

In [42]:
%pip install bert-tensorflow
!pip install gcsfs
import pandas as pd
import numpy as np
from tensorflow.contrib.tpu.python.tpu import tpu_optimizer

#Make sure to use tensorflow version 1.x, version 2 doesn't work with bert
%tensorflow_version 1.x
import tensorflow as tf
import tensorflow_hub as hub
import os

import html
import re
import json
import pprint
import random
import string
from datetime import datetime

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling

assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)
USE_TPU=True
try:
  #tf.config.experimental_connect_to_host(TPU_WORKER)
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu=TPU_ADDRESS)
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
except Exception as ex:
  print(ex)
  USE_TPU=False

print("        USE_TPU:", USE_TPU)
print("Eager Execution:", tf.executing_eagerly())

assert not tf.executing_eagerly(), "Eager execution on TPUs have issues currently"


TPU address is grpc://10.73.230.250:8470
TPU address is grpc://10.73.230.250:8470
        USE_TPU: True
Eager Execution: False


Setting a random seed for reproducibility

In [43]:
# Seed every random number generator the notebook relies on.
# tf.set_random_seed only covers TensorFlow's graph-level randomness; pandas'
# DataFrame.sample() draws from NumPy's global generator when no random_state
# is given, so without the NumPy seed the shuffles below change on every run.
SEED = 3060
random.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)
print("Tensorflow Version:", tf.__version__)

Tensorflow Version: 1.15.0


Below we will set the directory where we will store our output model. To ensure the right variables are loaded in our run config function later, our output directory must be a sub-directory of our pre-trained BERT model directory.

Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

In [44]:
#Choose which model you'd like -
#MUST be in GCP bucket

#Bert uncased Large
#bert_model_name = 'uncased_L-24_H-1024_A-16'

#Large whole word masking
bert_model_name = 'wwm_uncased_L-24_H-1024_A-16'

#Adding further pretrained model (currently disabled)
#further_pretrained_model = \
#os.path.join(bert_model_name, 'further_pretrained_model')

#Where we output the final, fine tuned model
output_dir = os.path.join(bert_model_name, 'output')

#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'csc3002' #@param {type:"string"}

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, output_dir)
  from google.colab import auth
  auth.authenticate_user()
else:
  # Keep OUTPUT_DIR defined when the bucket is switched off; the relative
  # path is then used as-is.
  # NOTE(review): presumably a TPU run still needs a gs:// location - confirm.
  OUTPUT_DIR = output_dir

# Start from a clean directory when requested. Only "directory does not exist"
# is skipped; any other deletion failure is allowed to surface instead of being
# silently swallowed.
if DO_DELETE and tf.gfile.Exists(OUTPUT_DIR):
  tf.gfile.DeleteRecursively(OUTPUT_DIR)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/output *****


# Data
I've stored the dataset in my Google bucket for ease of access; authentication will have to be provided.

In [45]:
!gcloud config set project 'my-project-csc3002'

data = 'gs://csc3002/hateval2019/hateval2019_en_train.csv'
data = pd.read_csv(data, sep=',',  index_col = False, encoding = 'utf-8')
data1 = 'gs://csc3002/hateval2019/hateval2019_en_dev.csv'
data1 = pd.read_csv(data1, sep=',',  index_col = False, encoding = 'utf-8')
frames = [data,data1]
data = pd.concat(frames)
data.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)
cols = ['TR', 'AG']
data.drop(cols, inplace = True, axis = 1)
data = data.sample(frac=1)
data.reset_index(drop = True, inplace = True)
data.id = data.index
pd.set_option('display.max_colwidth', -1)
data.head()

Updated property [core/project].


Unnamed: 0,id,tweet,label
0,0,The woman who accused Nelly of rape last weekend is dropping her case. Should he get a public apology? https://t.co/iBw0SdxuPL,0
1,1,Hysterical Hypocrite Liberals going after @ICEgov and Trump over 10 year old girl where were they when Obama deported 2 Million Hispanics ??,0
2,2,Come work for @IRCEurope advocacting on behalf of refugees and migrants. Based in Tunis. DM for more details https://t.co/X3sEffRwf3,0
3,3,#Kakuma Refugee Camp benefits from renewable sources of energy | Business Today https://t.co/iRUrCSNzUH via @KTNNews @philkeits #WithRefugees @UNHCR_Kenya @refugeeaffairs @drckenya @NRC_HoA @tyrusmaina @imaana102,0
4,4,Understanding the California Mind - American Greatness https://t.co/ikZZPvG703#IllegalAlienInvasion#ProgressivePolicies #StopTheInvasion #UncheckedProgressivism,1


We store the name of the data column containing the text we wish to classify and the name of the corresponding label column in global variables, for ease of access later on and so that this code is generalizable. The label list is just 0 and 1 because the version of BERT we've created below only deals with binary classification, and labels must be ints.

In [46]:
# Class balance first, then the overall number of rows.
label_counts = data.label.value_counts()
print(label_counts, "\n")
print("\nThere are", data.shape[0], "tweets total in this database")

0    5790
1    4210
Name: label, dtype: int64 


There are 10000 tweets total in this database


# Adding in data from ICVSM 2017
Reliably annotated dataset, very useful because, like HatEval, it includes tweets with terms like n**ger and f-g that might be associated with hate speech but aren't used in a hate speech context. I believe this is where my model fails most: tweets like these are where its false positives come from.

<b>Although it's important to note that the HatEval dataset is hate speech targeting immigrants and women. ICVSM 2017 has a whole range of targets, not just women and immigrants, and so it may not be beneficial towards improving performance.</b>

In [47]:
#Disabled experiment: adding the ICVSM 2017 tweets harms performance for now,
#whether it's the whole set or just the HS tweets.
#The code is kept below as an inert string literal (nothing in it executes) so
#it can be tried again with another fine-tuning model.
"""sometweets = 'gs://csc3002/Raw_Data/ICVSM_2017.csv'
sometweets = pd.read_csv(sometweets, sep=',',  index_col = False, encoding = 'utf-8')
cols = ['Unnamed: 0', 'count','offensive_language','neither']
sometweets.drop(cols, inplace = True, axis = 1)
sometweets.reset_index(drop = True, inplace = True)

sometweets.loc[sometweets['class'] == 0, 'label'] = 1
sometweets.loc[sometweets['class'] != 0, 'label'] = 0
sometweets.drop(['class'], inplace = True, axis = 1)
print(sometweets.label.value_counts())

#Further refining this dataset. As I've demonstrated in the notebook where I inspected this data;
#there needs to be an agreeent of 3 or more annotators for it to qualify as hate speech to fit my description
sometweets = sometweets[sometweets.label == 1]
sometweets.label = sometweets.label.astype(int)
sometweets = sometweets[sometweets['hate_speech'] > 2]
print(sometweets.label.value_counts())
sometweets.head()"""

"sometweets = 'gs://csc3002/Raw_Data/ICVSM_2017.csv'\nsometweets = pd.read_csv(sometweets, sep=',',  index_col = False, encoding = 'utf-8')\ncols = ['Unnamed: 0', 'count','offensive_language','neither']\nsometweets.drop(cols, inplace = True, axis = 1)\nsometweets.reset_index(drop = True, inplace = True)\n\nsometweets.loc[sometweets['class'] == 0, 'label'] = 1\nsometweets.loc[sometweets['class'] != 0, 'label'] = 0\nsometweets.drop(['class'], inplace = True, axis = 1)\nprint(sometweets.label.value_counts())\n\n#Further refining this dataset. As I've demonstrated in the notebook where I inspected this data;\n#there needs to be an agreeent of 3 or more annotators for it to qualify as hate speech to fit my description\nsometweets = sometweets[sometweets.label == 1]\nsometweets.label = sometweets.label.astype(int)\nsometweets = sometweets[sometweets['hate_speech'] > 2]\nprint(sometweets.label.value_counts())\nsometweets.head()"

In [48]:
"""frames = [data,sometweets]
data = pd.concat(frames)
data = data.sample(frac=1)
data.reset_index(drop = True, inplace = True)
data.info()"""

'frames = [data,sometweets]\ndata = pd.concat(frames)\ndata = data.sample(frac=1)\ndata.reset_index(drop = True, inplace = True)\ndata.info()'

# Adding in augmented back-translated hate speech tweets as extra data

We have very few instances of hate speech labelled in this dataset. To remedy this I performed back_translation augmentation on this training set.

Below I load in the extra hate speech tweets that I created via back-translation augmentation in another Colab notebook, and I append them to the existing dataframe.

In [49]:
"""dat = '/content/drive/My Drive/hateval2019/backtranslated_hatEval.txt' 
dat = pd.read_csv(dat, sep = '\t', names = ['tweet'], header = None, encoding = 'utf-8')
pd.set_option('display.max_colwidth', -1)
dat = dat.astype(str)
dat.head(50)"""

"dat = '/content/drive/My Drive/hateval2019/backtranslated_hatEval.txt' \ndat = pd.read_csv(dat, sep = '\t', names = ['tweet'], header = None, encoding = 'utf-8')\npd.set_option('display.max_colwidth', -1)\ndat = dat.astype(str)\ndat.head(50)"

**Rather than creating 3768 extra tweets, 19630 extra have been created. The tweets have been incorrectly parsed. Removing some tweets with a smaller length may mitigate this effect somewhat by removing tweets that were cut in half**

In [50]:
"""print("There are", len(dat.index), "tweets")
dat = dat[dat['tweet'].apply(lambda x: len(x) > 10)]
print("There are now", len(dat.index), "tweets")"""

'print("There are", len(dat.index), "tweets")\ndat = dat[dat[\'tweet\'].apply(lambda x: len(x) > 10)]\nprint("There are now", len(dat.index), "tweets")'

ok didn't help much. Let's see if it improves our validation anyways

**See how the english is a little off?** 

That's because these are the hate speech tweets in the training set translated to french, then translated back again. This creates a whole new, yet similar set of hate speech tweets to train on. (Slightly augmented text)

In [51]:
"""dat['label'] = 1
dat['id'] = 80000
frames = [dat,data]
data = pd.concat(frames)
print(data.info())
data.head()"""

"dat['label'] = 1\ndat['id'] = 80000\nframes = [dat,data]\ndata = pd.concat(frames)\nprint(data.info())\ndata.head()"

We'll shuffle the dataframe to make sure there's no funny business with the training of the model and we'll then reset the id field to make it unique and sequential for each row

In [52]:
"""data = data.sample(frac=1)
data.reset_index(drop = True, inplace = True)

data['id'] = data.reset_index().index + 1
print(data.label.value_counts(), "\n")
print(data.info())
length = len(data.index)
print("\nNow there are", length , "tweets total in this database")
data.tail(10)"""

'data = data.sample(frac=1)\ndata.reset_index(drop = True, inplace = True)\n\ndata[\'id\'] = data.reset_index().index + 1\nprint(data.label.value_counts(), "\n")\nprint(data.info())\nlength = len(data.index)\nprint("\nNow there are", length , "tweets total in this database")\ndata.tail(10)'

# Text Pre-Processing
Below is our custom preprocess function, which performs simple text preprocessing of the tweets. Its steps are explained in the function's docstring.

In [53]:
#The below function translates emojis to text
"""
%cd '/content/drive/My Drive'
%pip install demoji-0.1.5-py3-none-any.whl

import demoji
demoji.download_codes()
def emojiReplace(text_string):
    
    emoji_dict = demoji.findall(text_string)
    for emoji in emoji_dict.keys():
        text_string = text_string.replace(emoji, ' '+  emoji_dict[emoji])
    
    return text_string

#demoji.replace(preprocess(testtweet1), repl = )
testtweet1 = data.loc[4521]
testtweet2 = data.loc[4549]

print("\nOriginal:", testtweet1['tweet'])
print('Label:', testtweet1['label'])
print("\nPreprocessed:", emojiReplace(testtweet1['tweet']))

print("\nOriginal:", testtweet2['tweet'])
print('Label:', testtweet2['label'])
print("\nPreprocessed:", emojiReplace(testtweet2['tweet']))"""

'\n%cd \'/content/drive/My Drive\'\n%pip install demoji-0.1.5-py3-none-any.whl\n\nimport demoji\ndemoji.download_codes()\ndef emojiReplace(text_string):\n    \n    emoji_dict = demoji.findall(text_string)\n    for emoji in emoji_dict.keys():\n        text_string = text_string.replace(emoji, \' \'+  emoji_dict[emoji])\n    \n    return text_string\n\n#demoji.replace(preprocess(testtweet1), repl = )\ntesttweet1 = data.loc[4521]\ntesttweet2 = data.loc[4549]\n\nprint("\nOriginal:", testtweet1[\'tweet\'])\nprint(\'Label:\', testtweet1[\'label\'])\nprint("\nPreprocessed:", emojiReplace(testtweet1[\'tweet\']))\n\nprint("\nOriginal:", testtweet2[\'tweet\'])\nprint(\'Label:\', testtweet2[\'label\'])\nprint("\nPreprocessed:", emojiReplace(testtweet2[\'tweet\']))'

The contraction mapping below is not perfect. There are many ambiguous contractions which are impossible to definitively resolve (e.g. he's - he has or he is).

Where a contraction is ambiguous, it is mapped to its most likely expansion. We specifically chose flashtext as our library for text replacement because of its execution speed.

In [54]:
!pip install flashtext

from flashtext import KeywordProcessor
contraction_mapping = { 
"ain't": "is not","aren't": "are not", "can't": "cannot", "can't've": "cannot have",
"'cause": "because","cause": "because","could've": "could have","couldn't": "could not",
"couldn't've": "could not have","didn't": "did not","doesn't": "does not",
"don't": "do not","hadn't": "had not","hadn't've": "had not have","hasn't": "has not",
"haven't": "have not", "he'd": "he would","he'd've": "he would have", "he'll": "he will",
"he'll've": "he will have", "he's": "he is", "how'd": "how did", "how'll": "how will",
"how's": "how is ", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
"I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
"it'd": "it had", "it'd've": "it would have","it'll": "it will", "it'll've": "it will have",
"it's": "it is", "let's": "let us", "ma'am": "madam","mayn't": "may not",
"might've": "might have","mightn't": "might not","mightn't've": "might not have",
"must've": "must have", "mustn't": "must not","mustn't've": "must not have",
"needn't": "need not", "needn't've": "need not have","oughtn't": "ought not",
"oughtn't've": "ought not have", "shan't": "shall not","sha'n't": "shall not",
"shan't've": "shall not have","she'd": "she had","she'd've": "she would have",
"she'll": "she will", "she'll've": "she will have","she's": "she is",
"should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
"so've": "so have","so's": "so is","that'd": "that would", "that'd've": "that would have",
"that's": "that is", "there'd": "there had","there'd've": "there would have",
"there's": "there is","they'd": "they had", "they'd've": "they would have",
"they'll": "they will", "they'll've": "they will have", "they're": "they are",
"they've": "they have", "to've": "to have","wasn't": "was not",
"we'd": "we would","we'd've": "we would have","we'll": "we will","we'll've": "we will have",
"we're": "we are","we've": "we have","weren't": "were not","what'll": "what will",
"what'll've": "what will have","what're": "what are","what's": "what is",
"what've": "what have","when's": "when is","when've": "when have","where'd": "where did",
"where's": "where has","where've": "where have","who'll": "who will",
"who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is",
"why've": "why have","will've": "will have","won't": "will not","won't've": "will not have",
"would've": "would have","wouldn't": "would not", "wouldn't've": "would not have",
"y'all": "you all","y'all'd": "you all would","y'all'd've": "you all would have",
"y'all're": "you all are","y'all've": "you all have","you'd": "you would",
"you'd've": "you would have","you'll": "you will", "you'll've": "you will have",
"you're": "you are", "you've": "you have"}

keyword_processor = KeywordProcessor() #initialise
for k, v in contraction_mapping.items():
  keyword_processor.add_keyword(k,v)



In [0]:
def preprocess(text_string):
    """
    Normalise a raw tweet for classification.

    Accepts a text string and, in this order:
    1) Removes URLs, and every literal occurrence of the placeholder 'URL'
    2) Replaces the retweet marker ('RT @user' / 'rt @user') with a space -
       knowing the tweet is a retweet does not help our classification task
    3) Removes mentions, with or without a trailing colon
    4) Removes non-ASCII characters and numeric HTML entities ('&#123;')
    5) Collapses every run of whitespace into a single space
    6) Lowercases the text and strips leading/trailing whitespace

    Named HTML entities (e.g. '&amp;') and contractions are left untouched:
    the html.unescape() and keyword_processor steps are currently disabled.

    Returns the cleaned string.
    """
    # Raw strings keep the regex escapes (\s, \w, \S, \() out of Python's own
    # string-escape processing, which warns about them in recent versions.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[#$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+:'
    mention_regex1 = r'@[\w\-]+'
    # \b anchors the marker to the start of a word. Without it, any word that
    # ends in "rt" and is followed by a mention ("support @user") lost its
    # last two letters along with the mention.
    RT_regex = r'\b(?:RT|rt)[ ]*@[ ]*\S+'

    # Remove urls, plus any literal 'URL' placeholder
    parsed_text = re.sub(giant_url_regex, '', text_string)
    parsed_text = re.sub('URL', '', parsed_text)

    # Remove the fact the tweet is a retweet.
    # (we're only interested in the language of the tweet here)
    parsed_text = re.sub(RT_regex, ' ', parsed_text)

    # Removes mentions as they're redundant information:
    # first the ones followed by a colon - this seems to come up often...
    parsed_text = re.sub(mention_regex, '',  parsed_text)
    # ...then the plain ones
    parsed_text = re.sub(mention_regex1, '',  parsed_text)

    # Remove non-ASCII characters and numeric HTML entities
    parsed_text = re.sub(r'[^\x00-\x7F]','', parsed_text)
    parsed_text = re.sub(r'&#[0-9]+;', '', parsed_text)

    # Convert unicode missed by regex to text (disabled)
    #parsed_text = html.unescape(parsed_text)

    # Collapse runs of whitespace left behind by the removals above
    parsed_text = re.sub(space_pattern, ' ', parsed_text)

    #Set text to lowercase and strip
    parsed_text = parsed_text.lower()
    parsed_text = parsed_text.strip()
    #parsed_text = keyword_processor.replace_keywords(parsed_text)

    return parsed_text

**Let's see what it looks like**

In [56]:
# Show three hand-picked rows before and after preprocessing, then clean the
# whole tweet column.
_sample_rows = []
for _row_label in (2100, 4521, 4549):
  _row = data.loc[_row_label]
  # Every sample after the first one is preceded by a blank line.
  _lead = "\n" if _sample_rows else ""
  print(_lead + "Original:", _row['tweet'])
  print("Preprocessed:", preprocess(_row['tweet']))
  print('Label:', _row['label'])
  _sample_rows.append(_row)

# Keep the individual rows available under their usual names.
testtweet, testtweet1, testtweet2 = _sample_rows

data['tweet'] = data['tweet'].apply(preprocess)

Original: Get over yourselves ladies... You're wrong.... all the time!
Preprocessed: get over yourselves ladies... you're wrong.... all the time!
Label: 1

Original: They keep attacking us because we're not allowed 2 stop them. We let them take over regions here & bring sharia to some of our cities. Mosque's were unheard of & we never had to bow to Muslims before 9/11. That's what I'll #NeverForgetKick them the hell out & #BUILDTHATWALL
Preprocessed: they keep attacking us because we're not allowed 2 stop them. we let them take over regions here & bring sharia to some of our cities. mosque's were unheard of & we never had to bow to muslims before 9/11. that's what i'll #neverforgetkick them the hell out & #buildthatwall
Label: 1

Original: SHUT. IT. DOWN.#BorderWall #BuildThatWall #KAG
Preprocessed: shut. it. down.#borderwall #buildthatwall #kag
Label: 1


 **In training, let's remove any tweets that have a length less than 10. They could skew our model**

In [57]:
# Row count before filtering, so the number of dropped tweets can be reported.
length = data.shape[0]

# Keep only tweets whose cleaned text is longer than 10 characters.
_long_enough = data['tweet'].map(len) > 10
data = data.loc[_long_enough]
#data = data[data['tweet'].apply(lambda x: len(x) < 300)]
print(length - data.shape[0], "tweets have been reomved from the dataframe\n")

# Shuffle again, renumber the rows from zero and mirror that in the id column.
data = data.sample(frac=1).reset_index(drop=True)
data['id'] = data.index
data.info()

49 tweets have been reomved from the dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9951 entries, 0 to 9950
Data columns (total 3 columns):
id       9951 non-null int64
tweet    9951 non-null object
label    9951 non-null int64
dtypes: int64(2), object(1)
memory usage: 233.4+ KB


# Loading the train and test sets. Also specifying label and text columns

In [0]:
# Names of the text and label columns, shared by the cells below.
DATA_COLUMN = 'tweet'
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

# `train` is the cleaned, shuffled frame itself (same object, not a copy).
train = data

#The test data provided is supervised. So we can use the evaluate function of estimator
#to get a real approximation of our true f score with respect to the competition
test = (
    pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_test.csv',
                sep=',', index_col=False, encoding='utf-8')
    .rename(columns={'text': 'tweet', 'HS': 'label'})
)
# Apply the same cleaning to the test tweets as to the training tweets.
test['tweet'] = test['tweet'].apply(preprocess)

# Data Preprocessing
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `tweet` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. True, False

In [0]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
def _row_to_input_example(row):
  """Wrap one DataFrame row in a run_classifier.InputExample (single sentence)."""
  return bert.run_classifier.InputExample(
      guid=None,  # Globally unique ID for bookkeeping, unused in this example
      text_a=row[DATA_COLUMN],
      text_b=None,
      label=row[LABEL_COLUMN])

train_InputExamples = train.apply(_row_to_input_example, axis=1)

test_InputExamples = test.apply(_row_to_input_example, axis=1)

Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):


1. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
2. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
3. Map our words to indexes using a vocab file that BERT provides
4. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
5. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

Happily, we don't have to worry about most of these details. It's automated with the below inbuilt functions




Below is a way to retrieve the desired BERT parameters, such as its pre-trained checkpoints and its vocab file, from my Google storage bucket, where I've downloaded the uncased LARGE version of BERT.

In [60]:
bucket_dir = 'gs://csc3002'

# Directory of the chosen pre-trained model inside the bucket
bert_ckpt_dir = os.path.join(bucket_dir, bert_model_name)
print("Using BERT checkpoint from:", bert_ckpt_dir)

# Checkpoint (normal model), config and vocabulary all sit in that directory
bert_ckpt_file, bert_config_file, vocab_file = (
    os.path.join(bert_ckpt_dir, file_name)
    for file_name in ("bert_model.ckpt", "bert_config.json", "vocab.txt"))

#For further pretrained model
#bert_ckpt_file = tf.train.latest_checkpoint(further_pretrained_model)
#print("Using BERT checkpoint from:", further_pretrained_model)

# Build the tokenizer from the vocab file and try it on a sample sentence
tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocab_file)
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

Using BERT checkpoint from: gs://csc3002/wwm_uncased_L-24_H-1024_A-16


['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

In [0]:
# BERT is limited to 512 tokens in length
MAX_SEQ_LENGTH = 256

# Convert our train and test examples (in that order) to InputFeatures that
# BERT understands, using the same label list, length limit and tokenizer.
train_features, test_features = [
    bert.run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    for examples in (train_InputExamples, test_InputExamples)
]

# Creating a model

Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. It loads the configs of the BERT model we specified earlier and it creates a single layer that will be trained to adapt BERT to our task (i.e. classifying whether a tweet is hate speech or not). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning).

In [0]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Builds the BERT encoder plus a single linear classification layer.

  The inputs are forwarded to `modeling.BertModel`; its pooled output is fed
  (through 0.1 dropout when `is_training` is true) into a `num_labels`-way
  linear layer trained with softmax cross-entropy against `labels`.

  Returns:
    A tuple `(loss, per_example_loss, logits, probabilities)`, where `loss`
    is the mean of `per_example_loss`.
  """
  encoder = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # This is a classification task on the entire segment, so the pooled output
  # is used. For token-level output, encoder.get_sequence_output() would be
  # the one to use instead.
  pooled = encoder.get_pooled_output()
  width = pooled.shape[-1].value

  # Parameters of the classification layer (the variable names are fixed).
  classifier_kernel = tf.get_variable(
      "output_weights", [num_labels, width],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  classifier_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    # I.e., 0.1 dropout, applied while training only
    classifier_input = (
        tf.nn.dropout(pooled, keep_prob=0.9) if is_training else pooled)

    logits = tf.nn.bias_add(
        tf.matmul(classifier_input, classifier_kernel, transpose_b=True),
        classifier_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Cross-entropy: negative log-probability of the true class.
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, logits, probabilities)

In [0]:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator.

  Args:
    bert_config: BERT configuration, forwarded to `create_model`.
    num_labels: number of target classes, forwarded to `create_model`.
    init_checkpoint: checkpoint to initialise variables from; a falsy value
      skips checkpoint initialisation.
    learning_rate: forwarded to `optimization.create_optimizer`.
    num_train_steps: forwarded to `optimization.create_optimizer`.
    num_warmup_steps: forwarded to `optimization.create_optimizer`.
    use_tpu: when true, checkpoint initialisation is deferred to a
      `scaffold_fn`; also forwarded to `optimization.create_optimizer`.
    use_one_hot_embeddings: forwarded to `create_model`.

  Returns:
    A `model_fn(features, labels, mode, params)` that builds a
    `TPUEstimatorSpec` for training, evaluation or prediction.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    # Per-example weight; when the input does not provide the flag, every
    # example is given weight 1.
    is_real_example = None
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, logits, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
        num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        # On TPU the checkpoint initialisation is wrapped in a scaffold
        # function that is handed to the TPUEstimatorSpec.
        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    # Disabled: per-variable logging of what was initialised from the checkpoint.
    # tf.logging.info("**** Trainable Variables ****")
    # for var in tvars:
    #   init_string = ""
    #   if var.name in initialized_variable_names:
    #     init_string = ", *INIT_FROM_CKPT*"
    #   tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
    #                   init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:

      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        """Evaluation metrics, all weighted by `is_real_example`."""
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)

        # Every metric gets the same weights as accuracy and loss, so examples
        # with weight 0 cannot leak into F1, AUC, precision, recall or the
        # confusion counts. With all-ones weights the values are unchanged.
        # NOTE(review): auc and f1_score receive hard argmax predictions
        # rather than probabilities - confirm this is intended.
        accuracy = tf.metrics.accuracy(
            labels=label_ids, predictions=predictions, weights=is_real_example)
        loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
        f1_score = tf.contrib.metrics.f1_score(
            label_ids, predictions, weights=is_real_example)
        auc = tf.metrics.auc(label_ids, predictions, weights=is_real_example)
        recall = tf.metrics.recall(
            label_ids, predictions, weights=is_real_example)
        precision = tf.metrics.precision(
            label_ids, predictions, weights=is_real_example)
        true_pos = tf.metrics.true_positives(
            label_ids, predictions, weights=is_real_example)
        true_neg = tf.metrics.true_negatives(
            label_ids, predictions, weights=is_real_example)
        false_pos = tf.metrics.false_positives(
            label_ids, predictions, weights=is_real_example)
        false_neg = tf.metrics.false_negatives(
            label_ids, predictions, weights=is_real_example)
        return {
            "eval_accuracy": accuracy,
            "eval_loss": loss,
            "F1_Score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

      eval_metrics = (metric_fn, [per_example_loss, label_ids, logits, is_real_example])

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      # Any other mode: expose the class probabilities only.
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"probabilities": probabilities},
          scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn

The `model_fn_builder` function above wraps our model function so that it works for training, evaluation, and prediction.

# Model and Config Parameters

In [0]:
# Fine-tuning hyperparameters, adapted from the Cloud TPU BERT fine-tuning colab
# (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
MAX_SEQ_LENGTH = 256
TRAIN_BATCH_SIZE = 32  # 16 or 32 recommended
EVAL_BATCH_SIZE = PREDICT_BATCH_SIZE = 8
NUM_TRAIN_EPOCHS = 4.0  # 2, 3 or 4 recommended
LEARNING_RATE = 2.5e-5  # 5e-5, 3e-5 or 2e-5 recommended

# Fraction of the training steps spent in warmup, during which the learning
# rate starts small and gradually increases -- this usually helps training.
WARMUP_PROPORTION = 0.1

# How often (in training steps) a checkpoint is saved.
SAVE_CHECKPOINTS_STEPS = 1000

# Derive the total number of training steps from the dataset size, batch size
# and epoch count, then the number of warmup steps from that total.
num_train_steps = int(len(train_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)


In [65]:
# Report the final training step and the two directories this run works with.
print("The model will stop training when it reaches", num_train_steps, "as a checkpoint")
for _description, _location in (
    ("\nThe bert checkpoint directory is", bert_ckpt_dir),
    ("\nThe output directory is", OUTPUT_DIR),
):
    print(_description, _location)

The model will stop training when it reaches 1243 as a checkpoint

The bert checkpoint directory is gs://csc3002/wwm_uncased_L-24_H-1024_A-16

The output directory is gs://csc3002/wwm_uncased_L-24_H-1024_A-16/output


**Below are TPU model functions**

In [0]:

# NOTE(review): `cluster_resolver` is not created in this cell; it is presumably
# defined by an earlier cell -- confirm before running from a fresh kernel.

# Run configuration: where checkpoints go, how often they are written, and how
# the TPU is driven.
run_config = tf.compat.v1.estimator.tpu.RunConfig(
    cluster=cluster_resolver,
    # I think the output file must be a sub-directory of the main BERT file
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2,
        num_shards=8,
        iterations_per_loop=500,
    ),
)

# Model function bound to the BERT config, the initial checkpoint and the
# optimisation schedule computed in the config cell.
model_fn = model_fn_builder(
    bert_config=run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
    init_checkpoint=bert_ckpt_file,
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_one_hot_embeddings=True,
    use_tpu=True,
)

# TPU-backed estimator used for training, evaluation and prediction.
estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    use_tpu=True,
    predict_batch_size=PREDICT_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    train_batch_size=TRAIN_BATCH_SIZE,
)

Next we create input builder functions that take our training and test feature sets (`train_features`, `test_features`) and produce the input functions the estimator consumes. This is a pretty standard design pattern for working with TensorFlow [Estimators](https://www.tensorflow.org/guide/estimators).

In [0]:
# Input function over the training features (drop_remainder=True for TPUs).
train_input_fn = run_classifier.input_fn_builder(
    drop_remainder=True,
    is_training=True,
    seq_length=MAX_SEQ_LENGTH,
    features=train_features,
)

# Input function over the previously created test features, used for evaluation.
test_input_fn = run_classifier.input_fn_builder(
    drop_remainder=True,
    is_training=False,
    seq_length=MAX_SEQ_LENGTH,
    features=test_features,
)


# Hyperparameter Grid Search
This cell can also be used to train and evaluate a single configuration: give each parameter list exactly one value.

In [0]:
from itertools import product
from tensorflow.python.summary.summary_iterator import summary_iterator
from google.cloud import storage

# Only surface ERROR-level TensorFlow logs so routine log messages don't flood the cell output
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# FIXED MODEL PARAMS (held constant for every grid point)
TRAIN_BATCH_SIZE = 32 #recommended 16 or 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
MAX_SEQ_LENGTH = 256
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1

cluster = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

#Run config which will stay constant throughout
run_config = tf.compat.v1.estimator.tpu.RunConfig(
    #I think the output file must be a sub-directory of the main BERT file
    model_dir=OUTPUT_DIR,
    cluster=cluster,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=500,
        num_shards=8,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

# GCS project / client handles. The metrics below no longer need them (they are
# taken straight from `estimator.evaluate`), but they are still created once here
# so the environment variable and the `client` / `bucket` names remain available.
# NOTE(review): drop `client` and `bucket` if no later cell references them.
os.environ["GCLOUD_PROJECT"] = "csc3002"
client = storage.Client()
bucket = client.bucket('csc3002')

# The input functions and the number of eval steps do not depend on the
# hyperparameters being searched, so build them once instead of once per grid point.

# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)

#input function for test data, we feed in our previously created test_features for this
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)

#You need to provide number of steps for a TPU
eval_steps = int(len(test_InputExamples) / EVAL_BATCH_SIZE)

#Dataframe where grid search results will be stored. Empty to begin with
eval_df = pd.DataFrame(columns = ['F1 Score', 'precision', 'false_positives'] )

#Param range
batch_size = [32]
lr_values = [1.75e-5, 2e-5]
num_epochs_values = [3.0, 5.0, 7.0]
for TRAIN_BATCH_SIZE, NUM_TRAIN_EPOCHS, LEARNING_RATE in product(batch_size, num_epochs_values, lr_values):

  #Delete prior model graph, checkpoints and eval files to make room for new model each loop.
  #Only a missing directory is tolerated: any real deletion failure must surface, otherwise
  #the new estimator would pick up the previous grid point's checkpoints from OUTPUT_DIR.
  if tf.gfile.Exists(OUTPUT_DIR):
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  tf.gfile.MakeDirs(OUTPUT_DIR)

  # Compute # train and warmup steps from batch size
  num_train_steps = int(len(train_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
  num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

  # Model function for this grid point (learning rate and step counts vary)
  model_fn = model_fn_builder(
    bert_config=run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
    num_labels=len(label_list),
    init_checkpoint=bert_ckpt_file,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    use_one_hot_embeddings=True)

  estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

  print("\nThe model will stop training when it reaches", num_train_steps, "as a checkpoint")

  print('Beginning Training!')
  current_time = datetime.now()
  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  train_time = datetime.now() - current_time
  print("Training took time ", train_time)

  #Eval will be slightly WRONG on the TPU because it will truncate the last batch.
  #`evaluate` returns the metrics produced by metric_fn keyed by name, so read them
  #directly instead of re-parsing the events file from GCS. A missing metric now raises
  #KeyError rather than silently reusing the value from the previous grid point.
  eval_results = estimator.evaluate(input_fn=test_input_fn, steps=eval_steps)

  # float() turns the returned scalars into plain Python floats for the results table.
  fscore = float(eval_results['F1_Score'])
  auc = float(eval_results['auc'])
  accuracy = float(eval_results['eval_accuracy'])
  recall = float(eval_results['recall'])
  precision = float(eval_results['precision'])
  false_positives = float(eval_results['false_positives'])
  false_negatives = float(eval_results['false_negatives'])
  true_positives = float(eval_results['true_positives'])
  true_negatives = float(eval_results['true_negatives'])

  #String representation denoting model configs
  ind = f"{TRAIN_BATCH_SIZE}__{LEARNING_RATE}__{NUM_TRAIN_EPOCHS}"
  row = pd.Series({'F1 Score': fscore, 'auc': auc, 'eval_accuracy': accuracy, 'precision': precision, 'recall': recall,
                   'false_negatives': false_negatives, 'false_positives': false_positives,
                   'true_negatives': true_negatives, 'true_positives': true_positives, 'Training Time': train_time}, name=ind)

  # NOTE: DataFrame.append was removed in pandas 2.0; this assumes the older pandas
  # that ships alongside the TF 1.x runtime this notebook targets -- TODO confirm.
  eval_df = eval_df.append(row)
  print(ind, "F-Score:", fscore)
  #We'll save each time inside the loop, so if the program crashes after overuse of TPU
  #we still have eval_results for many hyperparam combos
  eval_df.to_csv('gs://csc3002/hateval2019/eval_df.csv', sep=',', index = True)
#Loop ends. Save eval csv and display the results table
eval_df.to_csv('gs://csc3002/hateval2019/eval_df.csv', sep=',', index = True)
eval_df


The model will stop training when it reaches 932 as a checkpoint
Beginning Training!
Training took time  0:09:13.011973
32__1.75e-05__3.0 F-Score: 0.6399791836738586

The model will stop training when it reaches 932 as a checkpoint
Beginning Training!
Training took time  0:09:26.509635
32__2e-05__3.0 F-Score: 0.6249999403953552

The model will stop training when it reaches 1554 as a checkpoint
Beginning Training!
Training took time  0:13:32.513474
32__1.75e-05__5.0 F-Score: 0.6331866383552551

The model will stop training when it reaches 1554 as a checkpoint
Beginning Training!
Training took time  0:13:16.818867
32__2e-05__5.0 F-Score: 0.6365535259246826

The model will stop training when it reaches 2176 as a checkpoint
Beginning Training!
