In [3]:
import pandas as pd, numpy as np 
import pickle
from joblib import load, dump

import tensorflow_hub as hub
import tensorflow as tf
import bert
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow

import math

In [4]:
# Short alias for the tokenizer class shipped in the `bert` package; it is
# instantiated later, once the hub layer's vocab file is known.
FullTokenizer = bert.bert_tokenization.FullTokenizer

In [5]:
# Load the preprocessed (train, valid) pair of DataFrames.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# use this with a trusted, locally produced preprocessed.pkl.
with open('../data/preprocessed.pkl', 'rb') as f:  # `with` closes the handle
    train, valid = pickle.load(f)

# reset_index() without drop=True keeps the previous index as a column.
valid = valid.reset_index()

# Every column from the third onward is treated as a label column
# (presumably the first two are an id and the comment text -- TODO confirm).
labels = train.columns[2:]
ys_valid = valid[labels]

## COMBINE TOXIC CATEGORIES
# Sum the label columns per row and cap the total at 1, so any row with at
# least one positive label becomes a single binary target.
y_valid = ys_valid.sum(axis=1).clip(upper=1)

In [7]:
max_seq_length = 100  # fixed number of tokens every example is truncated/padded to


def _int32_input(name):
    """Create an int32 Keras Input of length `max_seq_length` with the given name."""
    return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=name)


# The three per-token inputs that the hub BERT layer is called with below.
input_word_ids = _int32_input("input_word_ids")
input_mask = _int32_input("input_mask")
# Segment ids are still passed to the layer even though each comment is
# treated as a single segment in this notebook.
segment_ids = _int32_input("segment_ids")

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True,  # TODO: consider trainable=False
)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [8]:
# Wrap the hub layer in a Keras model that exposes both BERT outputs:
# the pooled vector and the per-token sequence output.
model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=[pooled_output, sequence_output],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 100)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [9]:
# See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
# And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py

def _truncate(tokens, max_seq_length):
    """Return `tokens` as a list of at most `max_seq_length` items.

    An over-long sequence keeps its first `max_seq_length - 1` tokens plus its
    original final token (the closing "[SEP]" when the caller added one), so
    the result fills exactly `max_seq_length` slots.
    """
    tokens = list(tokens)
    if len(tokens) > max_seq_length:
        tokens = tokens[:max_seq_length - 1] + tokens[-1:]
    return tokens


def _stack_rows(rows, max_seq_length):
    """Stack equal-length rows into a 2-D array; no rows gives shape (0, max_seq_length)."""
    if not rows:
        # np.stack raises on an empty list; return an empty batch instead.
        return np.zeros((0, max_seq_length), dtype=int)
    return np.stack(rows)


def get_masks(tokens, max_seq_length):
    """Mask for padding.

    Args:
        tokens: iterable of token sequences (one per example).
        max_seq_length: row width; longer sequences are truncated.

    Returns:
        Array of shape (n_examples, max_seq_length) holding 1 at positions
        occupied by a token and 0 at padded positions.
    """
    rows = []
    for seq in tokens:
        n_real = len(_truncate(seq, max_seq_length))
        rows.append([1] * n_real + [0] * (max_seq_length - n_real))
    return _stack_rows(rows, max_seq_length)


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second.

    Args:
        tokens: iterable of token sequences (one per example).
        max_seq_length: row width; longer sequences are truncated.

    Returns:
        Array of shape (n_examples, max_seq_length). Tokens up to and
        including the first "[SEP]" get 0, every later token gets 1, and
        padded positions get 0.
    """
    rows = []
    for seq in tokens:
        seq = _truncate(seq, max_seq_length)
        segments = []
        current_segment_id = 0
        for tok in seq:
            segments.append(current_segment_id)
            # The separator itself belongs to the segment it closes.
            if tok == "[SEP]":
                current_segment_id = 1
        rows.append(segments + [0] * (max_seq_length - len(seq)))
    return _stack_rows(rows, max_seq_length)


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab.

    Args:
        tokens: iterable of token sequences (one per example).
        tokenizer: object exposing `convert_tokens_to_ids(list_of_tokens)`.
        max_seq_length: row width; longer sequences are truncated.

    Returns:
        Array of shape (n_examples, max_seq_length) of vocabulary ids,
        right-padded with 0.
    """
    rows = []
    for seq in tokens:
        token_ids = tokenizer.convert_tokens_to_ids(_truncate(seq, max_seq_length))
        rows.append(token_ids + [0] * (max_seq_length - len(token_ids)))
    return _stack_rows(rows, max_seq_length)

In [10]:
# Build the tokenizer from the vocabulary file and casing flag stored on the
# loaded hub module.
_resolved = bert_layer.resolved_object
vocab_file = _resolved.vocab_file.asset_path.numpy()
do_lower_case = _resolved.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [11]:
# Text column to embed. For a quick smoke test, slice it here (e.g. [1:10]).
subtrain = train['comment_text']
subtrain.head(2)

29614     sockpuppetry case you have been accused of soc...
109036    i've read the archives and various national an...
Name: comment_text, dtype: object

In [12]:
# Tokenise every comment; each element becomes a list of token strings
# (see the preview output below).
subtrain_token = subtrain.apply(tokenizer.tokenize)
subtrain_token.head(2)

29614     [sock, ##pu, ##ppet, ##ry, case, you, have, be...
109036    [i, ', ve, read, the, archives, and, various, ...
Name: comment_text, dtype: object

In [13]:
## ADD SEPARATOR TOKENS ACCORDING TO PAPER
# [SEP] token is used to separate sentences into segments.
# Here, we're just considering the entire input as a single sentence.
# For longer inputs in the future, we can consider multiple segments.
def add_sep_tokens(s):
    """Wrap a token sequence as ["[CLS]"] + tokens + ["[SEP]"].

    Idempotent: a sequence that already starts with "[CLS]" and ends with
    "[SEP]" is returned unchanged (as a new list), so re-running a notebook
    cell that applies this function does not add the markers twice.

    Args:
        s: sequence of token strings.

    Returns:
        A new list with the "[CLS]" / "[SEP]" markers in place.
    """
    tokens = list(s)
    if tokens[:1] == ["[CLS]"] and tokens[-1:] == ["[SEP]"]:
        return tokens
    return ["[CLS]"] + tokens + ["[SEP]"]
# Rebinds subtrain_token to its wrapped version, so re-running this cell
# applies add_sep_tokens again to the already-wrapped lists.
subtrain_token = subtrain_token.apply(add_sep_tokens)

In [14]:
## GET MODEL INPUTS FROM TOKENS
# Build the three arrays (ids, padding mask, segment ids) from the same
# token lists and the same max_seq_length.
input_ids, input_masks, input_segments = (
    get_ids(subtrain_token, tokenizer, max_seq_length),
    get_masks(subtrain_token, max_seq_length),
    get_segments(subtrain_token, max_seq_length),
)

In [15]:
# Sanity check: all three input arrays should have the same shape.
tuple(arr.shape for arr in (input_masks, input_segments, input_ids))

((106912, 100), (106912, 100), (106912, 100))

In [16]:
# Only the pooled embedding is used afterwards, so predict through a view of
# `model` that exposes just its first output (pooled_output). predict()
# concatenates every requested output over all batches; also requesting the
# per-token sequence output would hold n_examples x max_seq_length x 768
# values in memory (about 30 GB for this dataset if float32).
pooled_model = Model(inputs=model.inputs, outputs=model.outputs[0])

# Pass one array per model input as a flat list, in the order the inputs were
# declared (word ids, mask, segment ids) -- no extra list around each array.
# NOTE: this runs BERT over the entire training set and is slow without a GPU.
pool_embs = pooled_model.predict(
    [input_ids, input_masks, input_segments],
    batch_size=8,
)

KeyboardInterrupt: 

In [None]:
# Expected to be (number of comments, 768), per the pooled output shape in
# the model summary.
pool_embs.shape