In [0]:
# Copyright 2019 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<a href="https://colab.research.google.com/github/kpe/bert-for-tf2/blob/master/examples/tpu_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Detecting Hate speech Tweets with BERT

We are using bert-tensorflow for this classification task. At the moment I'm making sure it's tensorflow version 1.x because tensorflow version 2 gives issues with Bert at the moment. I believe Tensorflow hopes to have this issue resolved in tensorflow v 2.1

We are using a TPU as a GPU does not have the required memory for Large BERT models- it can only cope with the base model. We'll see if there a TPU detected and we'll set it to a global environment variable so it can be accessed by our BERT functions later.

In [2]:
%pip install bert-tensorflow
!pip install gcsfs
import pandas as pd
import numpy as np

#Make sure to use tensorflow version 1.x, version 2 doesn't work with bert
%tensorflow_version 1.x
import tensorflow as tf
import os

#For cross-validation and grid search
from itertools import product
from tensorflow.python.summary.summary_iterator import summary_iterator
from google.cloud import storage
import ipywidgets as widgets
from IPython.display import display

import sklearn
from sklearn.model_selection import KFold
from sklearn import metrics

import html
import re
import json
import pprint
import random
import string
from datetime import datetime

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling

assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

#Below we give ourselves as well as the TPU access to our private GCS bucket
from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

USE_TPU=True
try:
  #tf.config.experimental_connect_to_host(TPU_ADDRESS)
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu=TPU_ADDRESS)
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
except Exception as ex:
  print(ex)
  USE_TPU=False

print("        USE_TPU:", USE_TPU)
print("Eager Execution:", tf.executing_eagerly())

assert not tf.executing_eagerly(), "Eager execution on TPUs have issues currently"


Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████▉                           | 10kB 20.8MB/s eta 0:00:01[K     |█████████▊                      | 20kB 2.2MB/s eta 0:00:01[K     |██████████████▋                 | 30kB 2.9MB/s eta 0:00:01[K     |███████████████████▍            | 40kB 2.1MB/s eta 0:00:01[K     |████████████████████████▎       | 51kB 2.4MB/s eta 0:00:01[K     |█████████████████████████████▏  | 61kB 2.8MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.5MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1
Collecting gcsfs
  Downloading https://files.pythonhosted.org/packages/3e/9f/864a9ff497ed4ba12502c4037db8c66fde0049d9dd0388bd55b67e5c4249/gcsfs-0.6.0-py2.py3-none-any.whl
Installing collected packages: gcsfs
Successfully installed gcsfs-

Setting a random seed for reproducability of results and checking version of tensorflow

In [3]:
tf.set_random_seed(3060)
print("Tensorflow Version:", tf.__version__)

Tensorflow Version: 1.15.0


Below we will set the directory where we will store our output model. To ensure the right variables are loaded in our run config function later, our ouput directory must be in the same directory as our pre-trained bert model directory.

Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

In [4]:
#Choose which model you'd like - MUST be in GCP bucket

#Bert uncased Large 
#bert_model_name = 'uncased_L-24_H-1024_A-16' 

#Large whole word masking
bert_model_name = 'wwm_uncased_L-24_H-1024_A-16' 

#Adding further pretrained model
further_pretrained_model = \
os.path.join(bert_model_name, 'further_pretrained_model')

#Where we output the final, fine tuned model
output_dir = \
os.path.join(further_pretrained_model, 'output')

#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'csc3002' #@param {type:"string"}
os.environ["GCLOUD_PROJECT"] = "csc3002"

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, output_dir)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output *****


<b> If you're not connected to a TPU environment but still want to access GCS bucket - run below: </b>

In [5]:
"""from google.colab import drive
drive.mount('/content/drive')
!gcloud auth activate-service-account --key-file '/content/drive/My Drive/storageCreds.json'
"""

"from google.colab import drive\ndrive.mount('/content/drive')\n!gcloud auth activate-service-account --key-file '/content/drive/My Drive/storageCreds.json'\n"

#Data
I've stored all of the data in my google bucket for ease of access, authentication will have to be provided

In [6]:
!gcloud config set project 'my-project-csc3002'

train = pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_train.csv', sep=',',  index_col = False, encoding = 'utf-8')
train.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)

train = train.sample(frac=1, random_state = 3020) #Shuffling really helps model performance
train.reset_index(drop = True, inplace = True)
train.id = train.index
pd.set_option('display.max_colwidth', -1)

Updated property [core/project].


In [7]:
print("Out of {} tweets in this database, {} are not hate, {} are hate".format(len(train.index), 
                                                      len(train[train['label']==0]),
                                                      len(train[train['label']==1])))


Out of 9000 tweets in this database, 5217 are not hate, 3783 are hate


<b>Original Dataset </b>

In [8]:
train.head(30)

Unnamed: 0,id,tweet,label,TR,AG
0,0,Good Morning to;#MAGA #NoDACA #NoDACAAmnesty #BuildTheWall #LockThemAllUp #AmericaFirst #NotABot RT FOREVER We Love you #POTUS @realDonaldTrump,1,0,1
1,1,"People deprived of the vital family support promised to them by the state, during the hardest time of their lives, thanks to... the state: Home Office delays + this country’s incoherent, cruel, racist, unworkable bureaucratic mess of an immigration system. https://t.co/awyB9MncSU",0,0,0
2,2,"In New Orleans, .pPresenting results on immigrants and civic engagement project @ilctr @MasonCHSS https://t.co/fHUBLcX2ou",0,0,0
3,3,@keenondevonte 😘😘 down bitch. That’s what women do. Provide for her man and put his head towards God when he have doubts. I love you 💙,1,1,0
4,4,@LisaBloom It's women like Kathy Griffin that the word cunt is becoming just as popular here as it is in the United Kingdom and Australia.,1,1,0
5,5,Hole in one and you deserve for it.Congrats https://t.co/EXzIzEXWhM,0,0,0
6,6,Guys relax. Leave the twitter fights to the professionals. I mean vaginals. I mean women. #WomenSuck,1,0,0
7,7,Why? Everywhere they go!! WHY?? Act like normal phucking people!! #Scum #NoMoreRefugees https://t.co/xlitWTPmJ5,1,0,0
8,8,Credit card boy Ian Duncan Smith and other Politicians should be forced to pay their dues mind u its not tax credits cuts that's for us,0,0,0
9,9,"@FareedFromSyria @noyalevy_ @ElianeAlhussein @CrazyNormie @UN @HasanAlthoffy @UNICEFEMOPS @talentosprecato @Marcnelsonart @KenRoth @AmnestySyria @UOSSM_France Try @msf or @unicefmena. @ICRC_sy and @refugees, can you help or give direction?",0,0,0


# Text Pre-Processing

### Translating emojis to text

In [9]:
#The below function translates emojis to text
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive'
%pip install demoji-0.1.5-py3-none-any.whl
%cd ../.. # Reset back to original directory

import demoji
demoji.download_codes()
def emojiReplace(text_string):
    
    emoji_dict = demoji.findall(text_string)
    for emoji in emoji_dict.keys():
        text_string = text_string.replace(emoji, ' '+  emoji_dict[emoji])
    
    return text_string

#demoji.replace(preprocess(testtweet1), repl = )
testtweet1 = train.loc[4521]
testtweet2 = train.loc[4549]

print("\nOriginal:", testtweet1['tweet'])
print('Label:', testtweet1['label'])
print("\nPreprocessed:", emojiReplace(testtweet1['tweet']))

print("\nOriginal:", testtweet2['tweet'])
print('Label:', testtweet2['label'])
print("\nPreprocessed:", emojiReplace(testtweet2['tweet']))
train['tweet'] = train['tweet'].apply(emojiReplace)


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive
Processing ./demoji-0.1.5-py3-none-any.whl
Installing collected packages: demoji
Successfully installed demoji-0.1.5
[Errno 2] No such file or directory: '../.. # Reset back to original directory'
/content/drive/My Drive
[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 0.11 seconds)
[33mWriting emoji data to /root/.demoji/codes.json ...[0m
[92m... OK[0m

Original: UK rejects Christian refugees recommended by UN, 

<b>Set after emoji replacement

In [10]:
train.head(50)

Unnamed: 0,id,tweet,label,TR,AG
0,0,Good Morning to;#MAGA #NoDACA #NoDACAAmnesty #BuildTheWall #LockThemAllUp #AmericaFirst #NotABot RT FOREVER We Love you #POTUS @realDonaldTrump,1,0,1
1,1,"People deprived of the vital family support promised to them by the state, during the hardest time of their lives, thanks to... the state: Home Office delays + this country’s incoherent, cruel, racist, unworkable bureaucratic mess of an immigration system. https://t.co/awyB9MncSU",0,0,0
2,2,"In New Orleans, .pPresenting results on immigrants and civic engagement project @ilctr @MasonCHSS https://t.co/fHUBLcX2ou",0,0,0
3,3,@keenondevonte face blowing a kiss face blowing a kiss down bitch. That’s what women do. Provide for her man and put his head towards God when he have doubts. I love you blue heart,1,1,0
4,4,@LisaBloom It's women like Kathy Griffin that the word cunt is becoming just as popular here as it is in the United Kingdom and Australia.,1,1,0
5,5,Hole in one and you deserve for it.Congrats https://t.co/EXzIzEXWhM,0,0,0
6,6,Guys relax. Leave the twitter fights to the professionals. I mean vaginals. I mean women. #WomenSuck,1,0,0
7,7,Why? Everywhere they go!! WHY?? Act like normal phucking people!! #Scum #NoMoreRefugees https://t.co/xlitWTPmJ5,1,0,0
8,8,Credit card boy Ian Duncan Smith and other Politicians should be forced to pay their dues mind u its not tax credits cuts that's for us,0,0,0
9,9,"@FareedFromSyria @noyalevy_ @ElianeAlhussein @CrazyNormie @UN @HasanAlthoffy @UNICEFEMOPS @talentosprecato @Marcnelsonart @KenRoth @AmnestySyria @UOSSM_France Try @msf or @unicefmena. @ICRC_sy and @refugees, can you help or give direction?",0,0,0


### Contraction Mapping

The contraction mapping below is not perfect. There are many ambigious contractions which are impossible to definitively resolve (e.g. he's - he has or he is).

Although this contractions library comes highly rated, and does seem to work often

In [11]:
!pip install contractions
import contractions
tweet = train.iloc[2312]['tweet']
print("\nOriginal: ", tweet)
print("\nReplaced Contractions: ", contractions.fix(tweet))


Collecting contractions
  Downloading https://files.pythonhosted.org/packages/85/41/c3dfd5feb91a8d587ed1a59f553f07c05f95ad4e5d00ab78702fbf8fe48a/contractions-0.0.24-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 4.9MB/s 
[?25hCollecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 60.6MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  

### Custom tweet text pre-processing
Below is our custom preprocess function which performs simple text preprocessing of the tweets. It's functions are explained in the method.

One may use the contraction mapping in the code above to convert contractions to their full-word form. You can just uncomment the line at the bottom of the function to do so

We always use this method when preprocessing our tweets as it gives us the most basic text preprocessing and it's needed

In [0]:
import string

def preprocess(text_string):
    """
    Accepts a text string and:
    1) Removes URLS
    2) lots of whitespace with one instance
    3) Removes mentions
    4) Uses the html.unescape() method to convert unicode to text counterpart
    5) Replace & with and
    6) Remove the fact the tweet is a retweet if it is - knowing the tweet is 
       a retweet does not help towards our classification task.
    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[#$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+:'
    mention_regex1 = '@[\w\-]+'
    RT_regex = '(RT|rt)[ ]*@[ ]*[\S]+'
    
    # Replaces urls with URL
    parsed_text = re.sub(giant_url_regex, '', text_string)
    parsed_text = re.sub('URL', '', parsed_text)
    
    # Remove the fact the tweet is a retweet. 
    # (we're only interested in the language of the tweet here)
    parsed_text = re.sub(RT_regex, ' ', parsed_text) 
    
    # Removes mentions as they're redundant information
    parsed_text = re.sub(mention_regex, '',  parsed_text)
    #including mentions with colons after - this seems to come up often
    parsed_text = re.sub(mention_regex1, '',  parsed_text)  

    #Replace &amp; with and
    parsed_text = re.sub('&amp;', 'and', parsed_text)

    # Remove unicode
    parsed_text = re.sub(r'[^\x00-\x7F]','', parsed_text) 
    parsed_text = re.sub(r'&#[0-9]+;', '', parsed_text)  

    # Convert unicode missed by above regex to text
    parsed_text = html.unescape(parsed_text)
    
    # Remove excess whitespace at the end
    parsed_text = re.sub(space_pattern, ' ', parsed_text) 
    
    # Set text to lowercase and strip
    parsed_text = parsed_text.lower()
    parsed_text = parsed_text.strip()

    # Replace contractions with their full worded form
    parsed_text = contractions.fix(parsed_text)
    
    return parsed_text

**Let's see what it looks like**

In [13]:
testtweet = train.loc[2100]
print("Original:", testtweet['tweet'])
print("Preprocessed:", preprocess(testtweet['tweet']))
print('Label:', testtweet['label'])

testtweet1 = train.loc[4521]
print("\nOriginal:", testtweet1['tweet'])
print("Preprocessed:", preprocess(testtweet1['tweet']))
print('Label:', testtweet1['label'])

testtweet2 = train.loc[4549]
print("\nOriginal:", testtweet2['tweet'])
print("Preprocessed:", preprocess(testtweet2['tweet']))
print('Label:', testtweet2['label'])

Original: @ABCPolitics This bitch will be rape by Trump too and then denie it.
Preprocessed: this bitch will be rape by trump too and then denie it.
Label: 1

Original: UK rejects Christian refugees recommended by UN, admits only Muslims among 1,112 Syrians admitted Jan-March 2018 https://t.co/vpvmMFaAnf via @jihadwatchRS
Preprocessed: uk rejects christian refugees recommended by un, admits only muslims among 1,112 syrians admitted jan-march 2018 via
Label: 0

Original: Best gift ever  face blowing a kiss https://t.co/xdZL6RVWLU
Preprocessed: best gift ever face blowing a kiss
Label: 0


<b> Training set after basic preprocessing

In [14]:
train['tweet'] = train['tweet'].apply(preprocess)
train.head(30)

Unnamed: 0,id,tweet,label,TR,AG
0,0,good morning to;#maga #nodaca #nodacaamnesty #buildthewall #lockthemallup #americafirst #notabot rt forever we love you #potus,1,0,1
1,1,"people deprived of the vital family support promised to them by the state, during the hardest time of their lives, thanks to... the state: home office delays + this countrys incoherent, cruel, racist, unworkable bureaucratic mess of an immigration system.",0,0,0
2,2,"in new orleans, .ppresenting results on immigrants and civic engagement project",0,0,0
3,3,face blowing a kiss face blowing a kiss down bitch. that is what women do. provide for her man and put his head towards god when he have doubts. i love you blue heart,1,1,0
4,4,it is women like kathy griffin that the word cunt is becoming just as popular here as it is in the united kingdom and australia.,1,1,0
5,5,hole in one and you deserve for it.congrats,0,0,0
6,6,guys relax. leave the twitter fights to the professionals. i mean vaginals. i mean women. #womensuck,1,0,0
7,7,why? everywhere they go!! why?? act like normal phucking people!! #scum #nomorerefugees,1,0,0
8,8,credit card boy ian duncan smith and other politicians should be forced to pay their dues mind you its not tax credits cuts that is for us,0,0,0
9,9,"try or . and , can you help or give direction?",0,0,0


### Hashtag Segmentation

In [15]:
!pip install wordsegment
import wordsegment as ws
from wordsegment import load, segment

load()
#The values below of the bigrams reflect the amount of search results on google that come up
ws.BIGRAMS['alt right'] = 1.17e8 # update wordsegment dict so it recognises altright as "alt right" rather than salt right
ws.BIGRAMS['white supremacists'] = 3.86e6
ws.BIGRAMS['tweets'] = 6.26e10

!pip install wordsegment
import wordsegment as ws
from wordsegment import load, segment

load()
#The values below of the bigrams reflect the amount of search results on google that come up
ws.BIGRAMS['alt right'] = 1.17e8 # update wordsegment dict so 
                                #it recognises altright as "alt right" rather than salt right
ws.BIGRAMS['white supremacists'] = 3.86e6
ws.BIGRAMS['tweets'] = 6.26e10
ws.BIGRAMS['independece day'] = 6.21e7

def hashtagSegment(text_string):
    
    #We target hashtags so that we only segment the hashtag strings.
    #Otherwise the segment function may operate on misspelled words also; which
    #often appear in hate speech tweets owing to the ill education of those spewing it
    temp_str = []
    for word in text_string.split(' '):
        if word.startswith('#') == False:
            temp_str.append(word)
        else:
            temp_str = temp_str + segment(word)
            
    text_string = ' '.join(temp_str)       
    return text_string

teststr = train.iloc[1291]['tweet']
teststr1 = train.iloc[3892]['tweet']

print('\nNormal:\n',teststr,'\n')
print("Hashtag-Segmented:\n", hashtagSegment(teststr))

print('\n\nNormal:\n', teststr1,'\n')
print("Hashtag-Segmented:\n", hashtagSegment(teststr1))

train['tweet'] = train['tweet'].apply(hashtagSegment)

Collecting wordsegment
[?25l  Downloading https://files.pythonhosted.org/packages/cf/6c/e6f4734d6f7d28305f52ec81377d7ce7d1856b97b814278e9960183235ad/wordsegment-1.3.1-py2.py3-none-any.whl (4.8MB)
[K     |████████████████████████████████| 4.8MB 3.3MB/s 
[?25hInstalling collected packages: wordsegment
Successfully installed wordsegment-1.3.1

Normal:
 the queen of hysterical women tweets bitterly, terrified of losing her crown. 

Hashtag-Segmented:
 the queen of hysterical women tweets bitterly, terrified of losing her crown.


Normal:
 bad girls get spankings 

Hashtag-Segmented:
 bad girls get spankings


<b>Training set aftter Hashtag segmentation</b>

In [16]:
train.head(50)

Unnamed: 0,id,tweet,label,TR,AG
0,0,good morning to;#maga no daca no daca amnesty build the wall lock them all up america first not a bot rt forever we love you potus,1,0,1
1,1,"people deprived of the vital family support promised to them by the state, during the hardest time of their lives, thanks to... the state: home office delays + this countrys incoherent, cruel, racist, unworkable bureaucratic mess of an immigration system.",0,0,0
2,2,"in new orleans, .ppresenting results on immigrants and civic engagement project",0,0,0
3,3,face blowing a kiss face blowing a kiss down bitch. that is what women do. provide for her man and put his head towards god when he have doubts. i love you blue heart,1,1,0
4,4,it is women like kathy griffin that the word cunt is becoming just as popular here as it is in the united kingdom and australia.,1,1,0
5,5,hole in one and you deserve for it.congrats,0,0,0
6,6,guys relax. leave the twitter fights to the professionals. i mean vaginals. i mean women. women suck,1,0,0
7,7,why? everywhere they go!! why?? act like normal phucking people!! scum no more refugees,1,0,0
8,8,credit card boy ian duncan smith and other politicians should be forced to pay their dues mind you its not tax credits cuts that is for us,0,0,0
9,9,"try or . and , can you help or give direction?",0,0,0


<b>Removing stopwords</b>

In [17]:
"""import nltk
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

train['text'] = train['text'].apply(lambda x: remove_stopwords(x))

train.head(10)"""

"import nltk\nstopwords = nltk.corpus.stopwords.words('english')\n\ndef remove_stopwords(tokenized_list):\n    text = [word for word in tokenized_list if word not in stopwords]\n    return text\n\ntrain['text'] = train['text'].apply(lambda x: remove_stopwords(x))\n\ntrain.head(10)"

<b> Lemmatizing text </b>

In [18]:
"""import nltk
wn = nltk.WordNetLemmatizer()
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

train['text'] = train['text'].apply( lambda x: lemmatizing(x))
data.head(10)"""

"import nltk\nwn = nltk.WordNetLemmatizer()\ndef lemmatizing(tokenized_text):\n    text = [wn.lemmatize(word) for word in tokenized_text]\n    return text\n\ntrain['text'] = train['text'].apply( lambda x: lemmatizing(x))\ndata.head(10)"

**Removing Punctuation**

In [19]:
"""
def remove_punct(text):
    
    #Return the charater as long as it's not punctuation
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

train['text'] = train['text'].apply(lambda x: remove_punct(x))
"""

'\ndef remove_punct(text):\n    \n    #Return the charater as long as it\'s not punctuation\n    text_nopunct = "".join([char for char in text if char not in string.punctuation])\n    return text_nopunct\n\ntrain[\'text\'] = train[\'text\'].apply(lambda x: remove_punct(x))\n'

 **In training, let's remove any tweets that have a length less than 10. They could skew our model**

In [20]:
length = len(train.index)
train = train[train['tweet'].apply(lambda x: len(x) > 10)]
#train = train[train['tweet'].apply(lambda x: len(x) < 300)]
print(length - len(train.index), "tweets have been removed from the dataframe\n")
train.reset_index(drop = True, inplace = True)
train.id = train.index
train.info()

29 tweets have been removed from the dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Data columns (total 5 columns):
id       8971 non-null int64
tweet    8971 non-null object
label    8971 non-null int64
TR       8971 non-null int64
AG       8971 non-null int64
dtypes: int64(4), object(1)
memory usage: 350.6+ KB


# Splitting data into train and dev. Also specifying label and text columns

We store the name of the Data column containing the text we wish to classify and the name of the corresponding label column in global variables for ease of access down line and also so this code is generalizable. Label list is just a 0 or a 1 because the version of BERT we've created below only deals in binary classifcation and labels must be ints

In [21]:
dev =  pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_dev.csv', sep=',',  index_col = False, encoding = 'utf-8')
dev.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)

# Uncomment any you wish
dev['tweet'] = dev['tweet'].apply(emojiReplace) 
dev['tweet'] = dev['tweet'].apply(preprocess)
dev['tweet'] = dev['tweet'].apply(hashtagSegment) 

DATA_COLUMN = 'tweet'
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

print("Size of training data", len(train.index))
print("Size of development data", len(dev.index))

Size of training data 8971
Size of development data 1000


#Data Preprocessing
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `tweet` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. True, False

In [0]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

dev_InputExamples = dev.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):


1. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
2. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
3. Map our words to indexes using a vocab file that BERT provides
4. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
5. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

Happily, we don't have to worry about most of these details. It's automated with the below inbuilt functions




Below is a way to retrieve desired BERT parameters, such as it's pre-trained checkpoints and it's vocab file, from my google storage bucket where I've downloaded the uncased LARGE version of bert.

In [23]:
bucket_dir = 'gs://csc3002'

bert_ckpt_dir = os.path.join(bucket_dir, bert_model_name) 

#For normal model
#bert_ckpt_file   = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
#print("Using BERT checkpoint from:", bert_ckpt_dir)

#For further pretrained model
#bert_ckpt_file   = os.path.join(further_pretrained_model, "bert_model.ckpt")
bert_ckpt_file = tf.train.latest_checkpoint('gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model')
print("Using BERT checkpoint from:", further_pretrained_model)

bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
vocab_file = os.path.join(bert_ckpt_dir, "vocab.txt")


tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocab_file)
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

print("Make sure that the function loads a checkpoint")
assert bert_ckpt_file is not None, "No BERT checkpoint file loaded"

Using BERT checkpoint from: wwm_uncased_L-24_H-1024_A-16/further_pretrained_model

Make sure that the function loads a checkpoint


Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

In [24]:
# BERT is limited to 512 tokens in length
MAX_SEQ_LENGTH = 256
# Convert our train and dev features to InputFeatures that BERT understands.
train_features =  bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
dev_features = bert.run_classifier.convert_examples_to_features(dev_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


INFO:tensorflow:Writing example 0 of 8971
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] good morning to ; # mag ##a no da ##ca no da ##ca amnesty build the wall lock them all up america first not a bot rt forever we love you pot ##us [SEP]
INFO:tensorflow:input_ids: 101 2204 2851 2000 1025 1001 23848 2050 2053 4830 3540 2053 4830 3540 16154 3857 1996 2813 5843 2068 2035 2039 2637 2034 2025 1037 28516 19387 5091 2057 2293 2017 8962 2271 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

#Creating a model

Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. It loads the configs of the BERT model we specified earlier and it creates a single layer that will be trained to adapt BERT to our task (i.e. classifying whether a tweet is hate speech or not). This strategy of using a mostly trained model is called <i>fine-tuning</i>.

In [0]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # In the demo, we are doing a simple classification task on the entire
  # segment.
  #
  # If you want to use the token-level output, use model.get_sequence_output()
  # instead.
  
  #output_layer = model.get_sequence_output()
  output_layer = model.get_pooled_output()

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)

Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction.

In [0]:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    is_real_example = None
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, logits, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
        num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    """ tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)"""

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:

      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)

        accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions, weights=is_real_example)
        loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
        f1_score = tf.contrib.metrics.f1_score(label_ids, predictions)
        auc = tf.metrics.auc( label_ids, predictions)
        recall = tf.metrics.recall(label_ids, predictions)
        precision = tf.metrics.precision(label_ids, predictions)
        true_pos = tf.metrics.true_positives(label_ids, predictions)
        true_neg = tf.metrics.true_negatives(label_ids, predictions)
        false_pos = tf.metrics.false_positives(label_ids, predictions)  
        false_neg = tf.metrics.false_negatives(label_ids, predictions)
        return {
            "eval_accuracy": accuracy,
            "eval_loss": loss,
            "F1_Score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

      eval_metrics = (metric_fn, [per_example_loss, label_ids, logits, is_real_example])

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"probabilities": probabilities},
          scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn

The Run config will be the same across all evaluation options below for running BERT

In [0]:
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
run_config = tf.compat.v1.estimator.tpu.RunConfig(
    #I think the output file must be a sub-directory of the main BERT file
    model_dir=OUTPUT_DIR, 
    cluster=cluster_resolver,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=100,    #Shows us summary metrics every 100 steps
        num_shards=8,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

#Training and Evaluating BERT as normal

In [28]:
# Compute train and warmup steps from batch size
TRAIN_BATCH_SIZE = 32 #recommended 16 or 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5 # Recommended 5e-5, 3e-5 or 2e-5
NUM_TRAIN_EPOCHS = 3.0 # Recommended 2, 3 or 4
MAX_SEQ_LENGTH = 256
# Warmup is a period of time where the learning rate 
#is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1

# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

print("The model will stop training when it reaches", num_train_steps, "as a checkpoint")
print("\nThe bert checkpoint directory is", bert_ckpt_dir)
print("\nThe output directory is", OUTPUT_DIR)

#This is the model function, which feeds in the bert configurations, the pretrained model itself and the parameters for the fine tuning of the model
model_fn = model_fn_builder(
  bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
  num_labels=len(label_list),
  init_checkpoint=bert_ckpt_file,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)

#We use Tensorflow estimators to train, evaluate and test our model
estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

The model will stop training when it reaches 841 as a checkpoint

The bert checkpoint directory is gs://csc3002/wwm_uncased_L-24_H-1024_A-16

The output directory is gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output
INFO:tensorflow:Using config: {'_model_dir': 'gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.79.186.138:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cl

Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator.

This is a pretty standard design pattern for working with Tensorflow Estimators

In [0]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)

# Input function for dev data, we feed in our previously created dev_features for this
test_input_fn = run_classifier.input_fn_builder(
    features=dev_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)

<b>Now we train our BERT fine-tuned model

In [0]:
print("\nThe model will stop training when it reaches", num_train_steps, "as a checkpoint")

print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
train_time = datetime.now() - current_time
print("Training took time ", train_time)


The model will stop training when it reaches 841 as a checkpoint
Beginning Training!
INFO:tensorflow:Querying Tensorflow master (grpc://10.28.171.74:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 8278283442579639588)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 15075832258415459024)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 9235653837146825852)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 1552753277433411770)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 

<b>And now we evaluate the performance of our model on the development data<b>

In [0]:
#You need to provide number of steps for a TPU
eval_steps = int(len(dev_InputExamples) / EVAL_BATCH_SIZE)

#Eval will be slightly WRONG on the TPU because it will drop the last batch (drop_remainder = True).
estimator.evaluate(input_fn=test_input_fn, steps=eval_steps)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (1, 256)
INFO:tensorflow:  name = input_mask, shape = (1, 256)
INFO:tensorflow:  name = label_ids, shape = (1,)
INFO:tensorflow:  name = segment_ids, shape = (1, 256)
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-04T10:54:43Z
INFO:tensorflow:TPU job name worker
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from gs://csc3002/wwm_uncased_L-24_H-1024_A-16/further_pretrained_model/output/model.ckpt-841
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Init TPU system
INFO:tensorflow:Initialized TPU in 9 seconds
INFO:tensorflow:Starting infeed thread controller.
INFO:tensorflow:Starting outfeed thread controller.
INFO:tensorflow:Initialized dataset iterators in 0 seconds
INFO:tensorflow:Enq

{'F1_Score': 0.59845823,
 'auc': 0.5,
 'eval_accuracy': 0.573,
 'eval_loss': 0.682442,
 'false_negatives': 427.0,
 'false_positives': 0.0,
 'global_step': 841,
 'loss': 0.69324154,
 'precision': 0.0,
 'recall': 0.0,
 'true_negatives': 573.0,
 'true_positives': 0.0}

Now that the pre-processing methods have been demonstrated. We can just load in data quickly using the function below. One can load one set or load in two and combine them - just like I'll demonstrate with the training set and the dev set

<i>We'll combine these two sets for the cross-validation function below</i>

In [30]:
def loadData(data1, data2 = None, replaceEmoji = True, \
             segmentHashtag = True, removeSmall = True): #Function caller can optionally load two dataframes and combine them

  if data2 is not None:
    frames = [data1,data2]
    data = pd.concat(frames)
  else:
    data = data1
  

  data.rename(columns={'text': 'tweet', 'HS': 'label'}, inplace=True)
  cols = ['TR', 'AG']
  data.drop(cols, inplace = True, axis = 1)
  data = data.sample(frac=1, random_state= 3060)

  if replaceEmoji == True:
    data['tweet'] = data['tweet'].apply(emojiReplace)

  data['tweet'] = data['tweet'].apply(preprocess)

  if segmentHashtag == True:
    data['tweet'] = data['tweet'].apply(hashtagSegment)

  if removeSmall == True:
    data = data[data['tweet'].apply(lambda x: len(x) > 10)] 

  data.dropna(inplace = True)
  data.reset_index(drop = True, inplace = True)

  data.id = data.index
  return data

train = pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_train.csv', sep=',',  index_col = False, encoding = 'utf-8')
dev =  pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_dev.csv', sep=',',  index_col = False, encoding = 'utf-8')

data = loadData(train, dev)
print("Out of {} tweets in this database, {} are not hate, {} are hate\n".format(len(data.index), 
                                                    len(data[data['label']==0]),
                                                    len(data[data['label']==1])))
data[:50]

Out of 9968 tweets in this database, 5765 are not hate, 4203 are hate



Unnamed: 0,id,tweet,label
0,0,"drake- i love you, we should date rihanna the savage-",0
1,1,an honour to meet so many dedicated aid workers here. this is mitun from whod recently attended training on child protection provided by . building up specialist refugee protection expertise within national ngos is part of our work here. aid works rohingya,0
2,2,"despite boko haram threat, some displaced nigerians return: via",0
3,3,rage of women of trump in the white house made them individually fearless to turn on weinstein! blame 1 man not all men.,0
4,4,french police officers injured by 'drunk' uk-bound migrants in calais via !,0
5,5,"1/ I am reminded of an interaction that fuels me. soon after returning to finish my undergrad, an uncle came to visit. he is a greek immigrant..very greek. he visited my parents' restaurant, where i was working.",0
6,6,rip twitter bitch cunt,0
7,7,yorkshire woman who taught a man consent while he was rape -ing her launches 1st real anti - rape,0
8,8,election fraud true the vote voter id illegal immigration voter fraud no election fraud? i do not think so. if the left is so interested in true elections then we should true the vote. dead people...,0
9,9,"watch: british tourist slaps immigration officer over $4,000 fine",0


Function to get metrics from an event file:

In [0]:
 def get_metrics(OUTPUT_DIR, train_time, k=0):

   os.environ["GCLOUD_PROJECT"] = "csc3002"
   client = storage.Client()
   bucket = client.bucket('csc3002')

   blobs = list(bucket.list_blobs(prefix=os.path.join(OUTPUT_DIR[13:], 'eval')))
   for blob in blobs:
     if 'events' in blob.name:
       eval_file = os.path.join('gs://csc3002/', blob.name)
          
   for e in tf.train.summary_iterator(eval_file):
     for v in e.summary.value:

       if v.tag == 'F1_Score':
         fscore = v.simple_value

       if v.tag == 'auc':
         auc = v.simple_value

       if v.tag == 'eval_accuracy':
         accuracy = v.simple_value

       if v.tag == 'recall':
         recall = v.simple_value

       if v.tag == 'precision':
         precision = v.simple_value

       if v.tag == 'false_positives':
         false_positives = v.simple_value

       if v.tag == 'false_negatives':
         false_negatives = v.simple_value

       if v.tag == 'true_positives':
         true_positives = v.simple_value

       if v.tag == 'true_negatives':
         true_negatives = v.simple_value
    
   row = pd.Series({'F1 Score': fscore, 'auc': auc, 'Accuracy': accuracy,'Precision': precision,'Recall': recall,\
                                    'False Negatives': false_negatives,'False Positives': false_positives,\
                    'True Negatives':true_negatives ,'True Positives': true_positives, 'Training Time': train_time })
   if k != 0:
      row = pd.Series(row, name = 'Fold ' + str(k))
   else:
     row1 = pd.Series({'Batch Size': 32, 'Learn Rate': 2e-5, 'epochs': 3})
     row = pd.concat([row1, row], axis= 0, sort =False)
    
   return row

# Cross Validation evaluation

Does not provide in depth tensorflow logging but it does provide evaluation at the end. As mentioned above, we combine the provided training and dev files



In [0]:
def bertCV(train_batch_size = 32, learn_rate = 2e-5,\
           num_epochs =3.0, folds = 5,  gridSearch = False):

  #Filter out all log messages so console isn't consumed with memory
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

  train = pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_train.csv', sep=',',  index_col = False, encoding = 'utf-8')
  dev =  pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_dev.csv', sep=',',  index_col = False, encoding = 'utf-8')
  
  loadData(train, dev)

  """FIXED MODEL PARAMS"""
  EVAL_BATCH_SIZE = 8
  PREDICT_BATCH_SIZE = 8
  MAX_SEQ_LENGTH = 256
  # Warmup is a period of time where the learning rate 
  #is small and gradually increases--usually helps training.
  WARMUP_PROPORTION = 0.1

  #Dataframe where grid search results will be stored. Empty to begin with
  eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'] )
  
  k = 1 # Fold counter
  cv = KFold(n_splits=folds, shuffle=False)
  for train_index, dev_index in cv.split(data.index): # Sticking within the training dataset for evaluation. Train is the combination of the provided train and dev sets
    
    training  = data.iloc[train_index]
    develop = data.iloc[dev_index]
    
    """Unlike before where I only one test set and one training set, this time I have K different sets of training and testing.
    Therefore, in each fold I need to get a new set of data and convert it to features each time."""
    
    # Use the InputExample class from BERT's run_classifier code to create examples from the data

    train_InputExamples = training.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                      text_a = x[DATA_COLUMN], 
                                                                      text_b = None, 
                                                                      label = x[LABEL_COLUMN]), axis = 1)

    dev_InputExamples = develop.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                      text_a = x[DATA_COLUMN], 
                                                                      text_b = None, 
                                                                      label = x[LABEL_COLUMN]), axis = 1)
    
    #Convert these examples to features that BERT can interpret
    train_features =  bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
    dev_features = bert.run_classifier.convert_examples_to_features(dev_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

    #Delete prior model graph, checkpoints and eval files to make room for new model each loop
    try:
      tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except:
    # Doesn't matter if the directory didn't exist
      pass
    tf.gfile.MakeDirs(OUTPUT_DIR)

    # Compute # train and warmup steps from batch size
    num_train_steps = int(len(train_features) / train_batch_size * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    # Model configs
    model_fn = model_fn_builder(
    bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
    num_labels=len(label_list),
    init_checkpoint=bert_ckpt_file,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    use_one_hot_embeddings=True)

    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=TRAIN_BATCH_SIZE,
      eval_batch_size=EVAL_BATCH_SIZE,
      predict_batch_size=PREDICT_BATCH_SIZE)
    
    # Create an input function for training. drop_remainder = True for using TPUs.
    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)

    #input function for dev data, we feed in our previously created dev_features for this
    dev_input_fn = run_classifier.input_fn_builder(
        features=dev_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=True)
    
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    train_time = datetime.now() - current_time
    

    #You need to provide number of steps for a TPU
    eval_steps = int(len(dev_InputExamples) / EVAL_BATCH_SIZE)

    #Eval will be slightly WRONG on the TPU because it will truncate the last batch.
    estimator.evaluate(input_fn=dev_input_fn, steps=eval_steps)

   
    row = get_metrics(OUTPUT_DIR, train_time, k)
    eval_df = eval_df.append(row)
    print("Fold " + str(k) + ":\tF-Score:", eval_df["F1 Score"][k-1])
    print("Training took time ", train_time)
    print('---------------------------------------------------------------------------------------------------------\n')
    k = k + 1 #Increment on fold counter

  row = eval_df.mean(axis = 0)
  row = pd.Series(row, name = 'CV Average')
  eval_df = eval_df.append(row)
  
  if gridSearch == False:
    print("\nTraining Batch Size:", train_batch_size,  "\tLearn Rate: ",learn_rate, "\tNum Epochs: ", num_epochs )
    display(eval_df)

  return row # Also return row of CV-Average

# HyperParameter Grid Search
Can also just be used to train and evaluate on single parameters if one wishes. 

Does not provide extra info such as average loss at each checkpoint like normal training of BERT will do because I set the tensorflow verbosity to ERROR so as to not overload the console with information and thus overload memory.

However the metrics for each unique hyperparameter pairing are displayed upon completion of the function

In [0]:
def bertGridSearch(lr_values, num_epochs, train_batch_size = [32], CV = False):

  #Filter out all log messages so console isn't consumed with memory
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

  """FIXED MODEL PARAMS"""
  EVAL_BATCH_SIZE = 8
  PREDICT_BATCH_SIZE = 8
  MAX_SEQ_LENGTH = 256
  # Warmup is a period of time where the learning rate 
  #is small and gradually increases--usually helps training.
  WARMUP_PROPORTION = 0.1

  #Dataframe where grid search results will be stored. Empty to begin with
  eval_df = pd.DataFrame(columns = ['F1 Score', 'Precision', 'false_positives'] )

  for TRAIN_BATCH_SIZE, NUM_TRAIN_EPOCHS, LEARNING_RATE in product(train_batch_size, num_epochs, lr_values):

    if CV == True:
      row = bertCV(train_batch_size = TRAIN_BATCH_SIZE, learn_rate = LEARNING_RATE,\
                   num_epochs =NUM_TRAIN_EPOCHS, gridSearch = True)
      
    else:
      #Delete prior model graph, checkpoints and eval files to make room for new model each loop
      try:
        tf.gfile.DeleteRecursively(OUTPUT_DIR)
      except:
      # Doesn't matter if the directory didn't exist
        pass
      tf.gfile.MakeDirs(OUTPUT_DIR)


      # Compute # train and warmup steps from batch size
      num_train_steps = int(len(train_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
      num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

      # Model configs
      model_fn = model_fn_builder(
      bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
      num_labels=len(label_list),
      init_checkpoint=bert_ckpt_file,
      learning_rate=LEARNING_RATE,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=True,
      use_one_hot_embeddings=True)

      estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=True,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE,
        predict_batch_size=PREDICT_BATCH_SIZE)
      
      # Create an input function for training. drop_remainder = True for using TPUs.
      train_input_fn = bert.run_classifier.input_fn_builder(
          features=train_features,
          seq_length=MAX_SEQ_LENGTH,
          is_training=True,
          drop_remainder=True)

      #input function for test data, we feed in our previously created dev_features for this
      dev_input_fn = run_classifier.input_fn_builder(
          features=dev_features,
          seq_length=MAX_SEQ_LENGTH,
          is_training=False,
          drop_remainder=True)
      
      current_time = datetime.now()
      estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
      train_time = datetime.now() - current_time
      print("Training took time ", train_time)

      #You need to provide number of steps for a TPU
      eval_steps = int(len(dev_InputExamples) / EVAL_BATCH_SIZE)

      #Eval will be slightly WRONG on the TPU because it will truncate the last batch.
      estimator.evaluate(input_fn=dev_input_fn, steps=eval_steps)

      row = get_metrics(OUTPUT_DIR, train_time)
      

    eval_df = eval_df.append(row, ignore_index = True)
    print("\nTraining Batch Size: " + str(TRAIN_BATCH_SIZE), \
          '\nLearning Rate: ' + str(LEARNING_RATE),  '\t\tF-Score:', fscore,
          '\nNumber of epochs: ' + str(NUM_TRAIN_EPOCHS),\
          '\n-----------------------------------------------------------------------------------------------------------------------------\n')
          
    
  #display(eval_df)
  idx = ['Batch Size', 'Learning Rate','Epochs']
  eval_df.set_index(idx, inplace = True)
  display(eval_df)  

# You can run a grid search or cross-validation evaluation from here



<b> Cross-Validation

In [0]:
CV_Av = bertCV(learn_rate = 2.5e-5, num_epochs=3.0)

In [0]:
e60 = CV_Av
print(e60)
e60 = pd.Series(e60, name = '60000')

F1 Score           0.807454              
auc                0.832936              
Accuracy           0.833534              
False Negatives    144.2                 
False Positives    187.4                 
Precision          0.787785              
Recall             0.828457              
Training Time      0 days 00:08:12.122312
True Negatives     964.6                 
True Positives     695.8                 
Name: CV Average, dtype: object


In [0]:
eval_df = pd.read_csv('gs://csc3002/hateval2019/pretraining_eval_df.csv', sep=',',  index_col = 0, encoding = 'utf-8')
eval_df

Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives,false_negatives,false_positives,precision,recall,true_negatives,true_positives
120000,0.809541,0.834597,0.834337,138.4,191.6,0.785437,0.835473,0 days 00:08:52.374760600,960.4,701.6,,,,,,
0,0.796205,0.823387,0.825803,,,,,0 days 00:09:23.425535,,,161.4,185.6,0.785145,0.807766,966.0,679.0
40000,0.812154,0.837227,0.839056,146.6,174.0,0.799342,0.82552,0 days 00:08:16.699617,978.0,693.4,,,,,,
60000,0.807454,0.832936,0.833534,144.2,187.4,0.787785,0.828457,0 days 00:08:12.122312,964.6,695.8,,,,,,


In [0]:
#eval_df = pd.DataFrame(columns = ['F1 Score', 'auc', 'Accuracy'] ) # Instantise
eval_df = eval_df.append(e60)
eval_df

Unnamed: 0,F1 Score,auc,Accuracy,False Negatives,False Positives,Precision,Recall,Training Time,True Negatives,True Positives,false_negatives,false_positives,precision,recall,true_negatives,true_positives
120000,0.809541,0.834597,0.834337,138.4,191.6,0.785437,0.835473,0 days 00:08:52.374760600,960.4,701.6,,,,,,
0,0.796205,0.823387,0.825803,,,,,0 days 00:09:23.425535,,,161.4,185.6,0.785145,0.807766,966.0,679.0
40000,0.812154,0.837227,0.839056,146.6,174.0,0.799342,0.82552,0 days 00:08:16.699617,978.0,693.4,,,,,,
60000,0.807454,0.832936,0.833534,144.2,187.4,0.787785,0.828457,0 days 00:08:12.122312,964.6,695.8,,,,,,


In [0]:
eval_df.to_csv('gs://csc3002/hateval2019/pretraining_eval_df.csv', sep=',',  index = True, encoding = 'utf-8')


<b> Grid Serch - with optional Cross-Validation

In [0]:
#0.796205
#Grid Search
bertGridSearch(num_epochs=[3.0], lr_values=[2e-5, 2.5e-5, 3e-5], CV =True) # Obviously takes much longer when CV = True

Fold 1:	F-Score: 0.8163264393806458
Training took time  0:09:02.120107
---------------------------------------------------------------------------------------------------------

Fold 2:	F-Score: 0.8084357380867004
Training took time  0:09:05.778452
---------------------------------------------------------------------------------------------------------

Fold 3:	F-Score: 0.7985479831695557
Training took time  0:09:14.959567
---------------------------------------------------------------------------------------------------------

Fold 4:	F-Score: 0.7967666983604431
Training took time  0:09:13.903270
---------------------------------------------------------------------------------------------------------

Fold 5:	F-Score: 0.8262107372283936
Training took time  0:09:07.563189
---------------------------------------------------------------------------------------------------------



NameError: ignored

## Adding in augmented back-translated hate speech tweets as extra data

We have very few instances of hate speech labelled in this dataset. To remedy this I performed back_translation augmentation on this training set.

Below I load in in the extra hate speech tweets I created via back-translation augmentation I performed in another colab notebook and I append it to the existing dataframe

In [0]:
"""dat = '/content/drive/My Drive/hateval2019/backtranslated_hatEval.txt' 
dat = pd.read_csv(dat, sep = '\t', names = ['tweet'], header = None, encoding = 'utf-8')
pd.set_option('display.max_colwidth', -1)
dat = dat.astype(str)
dat.head(50)"""

"dat = '/content/drive/My Drive/hateval2019/backtranslated_hatEval.txt' \ndat = pd.read_csv(dat, sep = '\t', names = ['tweet'], header = None, encoding = 'utf-8')\npd.set_option('display.max_colwidth', -1)\ndat = dat.astype(str)\ndat.head(50)"

**See how the english is a little off?** 

That's because these are the hate speech tweets in the training set translated to french, then translated back again. This creates a whole new, yet similar set of hate speech tweets to train on. (Slightly augmented text)

In [0]:
"""print("There are", len(dat.index), "tweets")
dat = dat[dat['tweet'].apply(lambda x: len(x) > 10)]
print("There are now", len(dat.index), "tweets")
dat.head()"""

'print("There are", len(dat.index), "tweets")\ndat = dat[dat[\'tweet\'].apply(lambda x: len(x) > 10)]\nprint("There are now", len(dat.index), "tweets")\ndat.head()'

<b>Rather than creating 3768 extra tweets, 19630 extra have been created. The tweets have been incorrectly parsed. Removing some tweets with a smaller length may mitigate this effect somewhat by removing tweets that were cut in half</b>

Let's see if it helps by adding it to the original training set and testing it against our dev data

In [0]:
"""dat['label'] = 1
dat['id'] = 80000
frames = [dat,data]
data = pd.concat(frames)
print(data.info())
data.head()"""

"dat['label'] = 1\ndat['id'] = 80000\nframes = [dat,data]\ndata = pd.concat(frames)\nprint(data.info())\ndata.head()"

We'll shuffle the dataframe to make sure there's no funny business with the training of the model and we'll then reset the id field to make it unique and sequential for each row

In [0]:
"""data = data.sample(frac=1)
data.reset_index(drop = True, inplace = True)

data['id'] = data.reset_index().index + 1
print(data.label.value_counts(), "\n")
print(data.info())
length = len(data.index)
print("\nNow there are", length , "tweets total in this database")
data.tail(10)"""

'data = data.sample(frac=1)\ndata.reset_index(drop = True, inplace = True)\n\ndata[\'id\'] = data.reset_index().index + 1\nprint(data.label.value_counts(), "\n")\nprint(data.info())\nlength = len(data.index)\nprint("\nNow there are", length , "tweets total in this database")\ndata.tail(10)'

# Training with both dev and training set. Then Testing with test set
<b>Loading in train and test data...

In [0]:
train = pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_train.csv', sep=',',  index_col = False, encoding = 'utf-8')
dev =  pd.read_csv('gs://csc3002/hateval2019/hateval2019_en_dev.csv', sep=',',  index_col = False, encoding = 'utf-8')
loadData(train, dev)

loadData(test)


Labels:
 0    1739
1    1259
Name: label, dtype: int64 


There are 2998 tweets total in the training database


<b>Function to get predictions on test data </b>

In [0]:
def getPrediction(in_sentences):

  labels = [0, 1]
  input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)
  predictions = list(estimator.predict(predict_input_fn))

  #Initialise empty predicted labels array
  predicted_classes = [None] * len(predictions)

  #Use a for loop to iterate through probabilities and for each prediction assign a label
  #corresponding to which label has the highest probability
  for i in range(0, len(predictions)):
    if predictions[i]['probabilities'][0] > predictions[i]['probabilities'][1]:
      predicted_classes[i] = 0
    else:
      predicted_classes[i] = 1
  return predicted_classes

<b> Converting to features.... </b>

In [0]:
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 5
train_batch_size = 32

SAVE_CHECKPOINTS_STEPS = 1000
run_config = tf.compat.v1.estimator.tpu.RunConfig(
    #I think the output file must be a sub-directory of the main BERT file
    model_dir=OUTPUT_DIR, 
    cluster=cluster_resolver,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=100,
        num_shards=8,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

train_InputExamples = data.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                      text_a = x[DATA_COLUMN], 
                                                                      text_b = None, 
                                                                      label = x[LABEL_COLUMN]), axis = 1)

train_features =  bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


#Delete prior model graph, checkpoints and eval files to make room for new model each loop
try:
  tf.gfile.DeleteRecursively(OUTPUT_DIR)
except:
# Doesn't matter if the directory didn't exist
  pass
tf.gfile.MakeDirs(OUTPUT_DIR)

# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / train_batch_size * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Model configs
model_fn = model_fn_builder(
bert_config= run_classifier.modeling.BertConfig.from_json_file(bert_config_file),
num_labels=len(label_list),
init_checkpoint=bert_ckpt_file,
learning_rate=LEARNING_RATE,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
use_tpu=True,
use_one_hot_embeddings=True)

estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
  use_tpu=True,
  model_fn=model_fn,
  config=run_config,
  train_batch_size=TRAIN_BATCH_SIZE,
  eval_batch_size=EVAL_BATCH_SIZE,
  predict_batch_size=PREDICT_BATCH_SIZE)

# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)



print("\nThe model will stop training when it reaches", num_train_steps, "as a checkpoint")

print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
train_time = datetime.now() - current_time
print("Training took time ", train_time)

predictions = getPrediction(test.tweet)
test['predictions'] = predictions

print(metrics.f1_score(test.label, test.predictions))

INFO:tensorflow:Writing example 0 of 9968
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] they are coming . and just because they made it here to america they think they have rights . des ##eased , une ##du ##cated and soon will be paid to be a loud ##mouth on the streets ! # build ##tha ##t ##wall america first [SEP]
INFO:tensorflow:input_ids: 101 2027 2024 2746 1012 1998 2074 2138 2027 2081 2009 2182 2000 2637 2027 2228 2027 2031 2916 1012 4078 25063 1010 16655 8566 12921 1998 2574 2097 2022 3825 2000 2022 1037 5189 14359 2006 1996 4534 999 1001 3857 8322 2102 9628 2637 2034 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [0]:
os.environ["GCLOUD_PROJECT"] = "csc3002"
client = storage.Client()
bucket = client.bucket('csc3002')

blobs = list(bucket.list_blobs(prefix=OUTPUT_DIR[13:])) # prefix needs to be the path of the dir you're trying to access in the bucket. Excluding 'gs://csc3002/'
for blob in blobs:
  if 'events' in blob.name:
    event_file = os.path.join('gs://csc3002/', blob.name)
  
lossList = []
for e in tf.train.summary_iterator(event_file):
    for v in e.summary.value:
      
      if v.tag == 'loss':
        print(v.simple_value)
        lossList.append(v.simple_value)

print(OUTPUT_DIR)

[]


# Using Tensorboard to get deeper insight

In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip   #Downloads file to google drive

--2020-02-04 17:29:12--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.73.84.118, 52.3.157.51, 34.206.126.139, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.73.84.118|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13773305 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2020-02-04 17:29:13 (14.0 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13773305/13773305]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [0]:

def get_tensorboard(path_to_event_file = OUTPUT_DIR):
  get_ipython().system_raw('tensorboard --logdir {} --host 0.0.0.0 --port 6006 --reload_multifile=true &'
.format(path_to_event_file))
  
  get_ipython().system_raw('./ngrok http 6006 &')

  !curl -s http://localhost:4040/api/tunnels | python3 -c \
      "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

get_tensorboard(OUTPUT_DIR)

http://2bc06d4c.ngrok.io
