OK, this time working from: https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb#scrollTo=SCZWZtKxObjh

That notebook uses an old version of BERT written for TensorFlow 1.x, so this one instead uses:

https://pypi.org/project/bert-for-tf2/
and
https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22

In [131]:
import hashlib
import math
import os
import random
import re

import bert
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers


# Matches '<', then one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags

def preprocess_text(sen):
    """Clean one raw review string.

    Steps, applied in this order:
      1. delete tags via ``remove_tags``;
      2. replace every character outside a-z / A-Z with a space;
      3. replace a single letter surrounded by whitespace with one space;
      4. collapse each run of whitespace into one space.

    Args:
        sen: the raw review text.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; the order matters and is applied as listed.
    substitutions = (
        ('[^a-zA-Z]', ' '),          # punctuation and numbers
        (r"\s+[a-zA-Z]\s+", ' '),    # single character removal
        (r'\s+', ' '),               # multiple spaces
    )

    # Removing html tags comes first, before any character filtering.
    sentence = remove_tags(sen)
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence


In [130]:
# Load the raw IMDB reviews.
movie_reviews = pd.read_csv("/Users/druss/Downloads/IMDB Dataset.csv")

# Clean every review text with preprocess_text.
sentences = list(movie_reviews['review'])
reviews = [preprocess_text(raw_review) for raw_review in sentences]

# Labels: 1 where the sentiment is "positive", 0 for any other value.
y = np.array([1 if sentiment == "positive" else 0
              for sentiment in movie_reviews['sentiment']])

# Tokenize the data: build a FullTokenizer from the vocabulary file and the
# lower-casing flag exposed by the TF-Hub BERT layer.
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

# create a function for tokenizing reviews...
def tokenize_reviews(text_reviews):
    """Turn one review string into ids using the module-level ``tokenizer``.

    The text is first split by ``tokenizer.tokenize`` and the resulting tokens
    are then passed to ``tokenizer.convert_tokens_to_ids``.
    """
    tokens = tokenizer.tokenize(text_reviews)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

# Ids for every cleaned review, in the same order as `reviews`.
tokenized_reviews = list(map(tokenize_reviews, reviews))

# One [ids, label, number_of_ids] record per review, shuffled with a fixed seed.
random.seed(4)
reviews_with_len = [[token_ids, y[idx], len(token_ids)]
                    for idx, token_ids in enumerate(tokenized_reviews)]
random.shuffle(reviews_with_len)

# Order the records by length. list.sort is stable, so records of equal length
# keep the relative order the shuffle gave them.
reviews_with_len.sort(key=lambda record: record[2])
sorted_reviews_labels = [(token_ids, label)
                         for token_ids, label, _ in reviews_with_len]

#processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.int32))
#BATCH_SIZE = 32
#batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

#TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
#TEST_BATCHES = TOTAL_BATCHES // 10
#batched_dataset.shuffle(TOTAL_BATCHES)
#test_data = batched_dataset.take(TEST_BATCHES)
#train_data = batched_dataset.skip(TEST_BATCHES)


# One row per review: cleaned text, its token ids, and the 0/1 label.
zz = pd.DataFrame(list(zip(reviews, tokenized_reviews, y)),
                  columns=("text", "tokens", "label"))
zz["len"] = zz["tokens"].map(len)

# Split the data into train/test based on the MD5 hash of the text: a row is a
# test row when its hex digest, read as a base-16 integer, leaves a remainder
# greater than 90 modulo 100 (remainders 91-99).
zz["hash"] = zz["text"].map(lambda text: hashlib.md5(text.encode('ascii')).hexdigest())
zz["Test"] = zz["hash"].map(lambda digest: int(digest, 16) % 100 > 90)
train_data = zz[~zz["Test"]]
test_data = zz[zz["Test"]]
print(train_data.shape, test_data.shape, sep="\n")

(45380, 2)
(4620, 2)


OK, following the steps in BertTake2, we are now ready to build models.

In [141]:
# Rebuild the review frame from reviews / tokenized_reviews / y:
# one row per review with the cleaned text, its token ids, and the 0/1 label.
zz = pd.DataFrame(list(zip(reviews, tokenized_reviews, y)),
                  columns=("text", "tokens", "label"))
zz["len"] = zz["tokens"].map(len)

# Split the data into train/test based on the MD5 hash of the text: a row is a
# test row when its hex digest, read as a base-16 integer, leaves a remainder
# greater than 90 modulo 100 (remainders 91-99).
zz["hash"] = zz["text"].map(lambda text: hashlib.md5(text.encode('ascii')).hexdigest())
zz["Test"] = zz["hash"].map(lambda digest: int(digest, 16) % 100 > 90)
train_data = zz[~zz["Test"]]
test_data = zz[zz["Test"]]
print(train_data.shape, test_data.shape, sep="\n")

(45380, 6)
(4620, 6)


In [144]:
# Print each split under its heading; the heading's newline is followed by a
# single space and then the frame's text representation.
print(f"train:\n {train_data}")
print(f"test:\n {test_data}")


train:
                                                     text  \
0      One of the other reviewers has mentioned that ...   
1      A wonderful little production The filming tech...   
2      I thought this was wonderful way to spend time...   
3      Basically there a family where little boy Jake...   
4      Petter Mattei Love in the Time of Money is vis...   
...                                                  ...   
49995  I thought this movie did down right good job I...   
49996  Bad plot bad dialogue bad acting idiotic direc...   
49997  I am Catholic taught in parochial elementary s...   
49998  I going to have to disagree with the previous ...   
49999  No one expects the Star Trek movies to be high...   

                                                  tokens  label  len  \
0      [2028, 1997, 1996, 2060, 15814, 2038, 3855, 20...      1  318   
1      [1037, 6919, 2210, 2537, 1996, 7467, 6028, 200...      1  167   
2      [1045, 2245, 2023, 2001, 6919, 2126, 2000, 524..

In [123]:
# Local directory of the pre-trained BERT checkpoint (hard-coded absolute path;
# the checkpoint file "bert_model.ckpt" is later looked up inside it).
model_dir = "/Users/druss/Downloads/uncased_L-12_H-768_A-12"

# Read the model parameters from the checkpoint directory, then create the
# BertModelLayer (named "bert") from those parameters.
bert_params = bert.params_from_pretrained_ckpt(model_dir)
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")


In [129]:
# NOTE: `os` and `keras` are imported in the notebook's top import cell rather
# than here, so all dependencies are declared in one place.

# Fixed length of the token-id sequences fed to the BERT layer.
max_seq_len = 128

# Two int32 inputs of shape (max_seq_len,). Only l_input_ids is wired into the
# model below; l_token_type_ids is created but not connected to anything here.
l_input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')

# using the default token_type/segment id 0
output = l_bert(l_input_ids)                              # output: [batch_size, max_seq_len, hidden_size]
model = keras.Model(inputs=l_input_ids, outputs=output)
model.build(input_shape=(None, max_seq_len))

# Load the stock checkpoint weights from model_dir into l_bert. The layer is
# called and the model built above, before the weights are loaded. This call is
# the last expression of the cell, so its return value is shown as the cell
# output.
bert_ckpt_file   = os.path.join(model_dir, "bert_model.ckpt")
bert.load_stock_weights(l_bert, bert_ckpt_file)



Done loading 196 BERT weights from: /Users/druss/Downloads/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x69318aad0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


[]