<a href="https://colab.research.google.com/github/funpi89/NLP_demo/blob/master/BertEmbedder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing dependencies

In [0]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [2]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/ff/84/1bea6c34d38f3e726830d3adeca76e6e901b98cf5babd635883dbedd7ecc/bert-for-tf2-0.14.1.tar.gz (40kB)
[K     |████████                        | 10kB 23.7MB/s eta 0:00:01[K     |████████████████▏               | 20kB 3.2MB/s eta 0:00:01[K     |████████████████████████▎       | 30kB 4.6MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 3.3MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/ac/0d/615c0d4aea541b4f47c761263809a02e160e7a2babd175f0ddd804776cf4/params-flow-0.8.0.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [3]:
try:
  %tensorflow_version 2.x
except: 
  pass
import tensorflow as tf
tf.__version__

'2.2.0-rc2'

In [0]:
import tensorflow_hub as hub

In [0]:
from tensorflow.keras import layers
import bert

# Data Preprocessing

## loading files

In [0]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv('/content/drive/My Drive/modern_NLP/train.csv', header=None, names=cols, engine='python', 
                   encoding='latin1')


In [0]:
data.drop(["id", "date", "query", "user"], axis=1, inplace=True)

In [8]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Cleaning Data

In [0]:
def clean_tweet(tweet):
  tweet = BeautifulSoup(tweet, 'lxml').get_text()
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ',tweet)
  tweet = re.sub(r"https?://[A-Za-z0-9./]+"," ", tweet)
  tweet = re.sub(r"[^a-zA-Z.!?]", " ",tweet)
  tweet = re.sub(r" +", " ",tweet)
  return tweet

In [0]:
data_clean = data['text'].apply(clean_tweet)

In [11]:
data_labels = data['sentiment']
data_labels[data_labels == 4] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Tokenizer

In [0]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [13]:
tokenizer.tokenize("My dog's habbit is running.")

['my', 'dog', "'", 's', 'ha', '##bb', '##it', 'is', 'running', '.']

In [14]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog is running."))

[2026, 3899, 2003, 2770, 1012]

## We only use the first sentence for bert inputs so we add the CLS token at the beginning and the SEP token at the end of each sentence

In [0]:
def encode_sentence(sent):
  return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]

In [0]:
data_inputs = data_clean.apply(encode_sentence)

In [17]:
data_inputs[:3]

0    [[CLS], aw, ##w, ##w, that, s, a, bum, ##mer, ...
1    [[CLS], is, upset, that, he, can, t, update, h...
2    [[CLS], i, dive, ##d, many, times, for, the, b...
Name: text, dtype: object

## Dataset Creation

## We need to create the 3 different inputs for each sentence

In [0]:
def get_ids(tokens):
  return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
  return np.char.not_equal(tokens, "[PAD]").astype(int)

def get_segments(tokens):
  seg_ids = []
  current_seg_id = 0
  for token in tokens:
    seg_ids.append(current_seg_id)
    if token == "[SEP]":
      current_seg_id = 1 - current_seg_id
  return seg_ids

In [0]:
data_with_len = [[sent, data_labels[i], len(sent)] for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [([get_ids(sent_lab[0]), get_mask(sent_lab[0]), get_segments(sent_lab[0])]
               , sent_lab[1]) for sent_lab in data_with_len if sent_lab[2] > 7]

In [20]:
a=0
for i, (data, _) in enumerate(sorted_all):
  try:
    # print(data)
    inputs = tf.stack(
      [tf.cast(data[0], dtype=tf.int32),
        tf.cast(data[1], dtype=tf.int32),
        tf.cast(data[2], dtype=tf.int32)],
        axis=0)
    if i < 10:
      print(len(data[0]), len(data[1]), len(data[2]))
    Dcnn(inputs, training=False)
  except:
    a += 1
print(a)
len(data[0]), len(data[1]), len(data[2])

8 8 8
8 8 8
8 8 8
8 8 8
8 8 8
8 8 8
8 8 8
8 8 8
8 8 8
8 8 8
1438658


(121, 121, 121)

In [0]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all, output_types=(tf.int32, tf.int32))

In [22]:
next(iter(all_dataset))

(<tf.Tensor: shape=(3, 8), dtype=int32, numpy=
 array([[ 101, 1045, 2572, 3141, 2000, 9901, 9212,  102],
        [   1,    1,    1,    1,    1,    1,    1,    1],
        [   0,    0,    0,    0,    0,    0,    0,    0]], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [0]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((3, None),()), padding_values=(0,0))

In [24]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 3, 8), dtype=int32, numpy=
 array([[[  101,  1045,  2572,  3141,  2000,  9901,  9212,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  2061,  2995, 25358,  3669,  3726,  2444,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101, 22978,  2023,  2323,  3437,  2115,  3160,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  2053,  2797,  6959,  2326,  2023,  2851,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  7568,  2000,  2022,  3225,  2147,  4826,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],

In [0]:
NB_BATCHES = math.ceil(len(sorted_all) // BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Model Building

In [26]:
my_sent = ["[CLS]"] + tokenizer.tokenize("Rose are red.") + ["[SEP]"]
bert_layer([
            tf.expand_dims(tf.cast(get_ids(my_sent), tf.int32), 0),
            tf.expand_dims(tf.cast(get_mask(my_sent), tf.int32), 0),
            tf.expand_dims(tf.cast(get_segments(my_sent), tf.int32), 0),
])

[<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[-9.02437449e-01, -3.12391222e-01, -9.18370306e-01,
          9.09932375e-01,  7.95610905e-01, -2.23733783e-01,
          9.01494741e-01,  3.36178988e-01, -7.02603579e-01,
         -9.99978065e-01, -6.12629890e-01,  9.03688192e-01,
          9.79902208e-01,  5.18856406e-01,  9.34267819e-01,
         -7.34853208e-01, -3.62728000e-01, -7.19748020e-01,
          4.50904161e-01, -5.73662937e-01,  7.22072601e-01,
          9.99985993e-01, -1.65528983e-01,  3.11937660e-01,
          5.09428024e-01,  9.66026962e-01, -7.30758607e-01,
          9.23188984e-01,  9.61604476e-01,  7.41122723e-01,
         -6.97756767e-01,  2.12863907e-01, -9.93341446e-01,
         -1.96652219e-01, -9.69987214e-01, -9.91843998e-01,
          4.82662380e-01, -6.86945021e-01, -1.17491290e-01,
         -2.24875938e-03, -9.16521847e-01,  4.42117631e-01,
          9.99978125e-01,  9.63494629e-02,  6.11179531e-01,
         -3.01456064e-01, -9.99999940e-01,  5.561

In [0]:
class DCNNBERTEmbedding(tf.keras.Model):
  def __init__(self,nb_filters=50, FFN_units=512, nb_classes=2, dropout_rate=0.1, name='dcnn'):
    super(DCNNBERTEmbedding, self).__init__(name=name)
    
    self.bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=False)
    # self.embedding = layers.Embedding(vocab_size, emb_dim)
    self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding='valid', activation='relu')
    self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding='valid', activation='relu')
    self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding='valid', activation='relu')
    self.pool = layers.GlobalMaxPooling1D()
    self.dense_1 = layers.Dense(units=FFN_units, activation='relu')
    self.dropout = layers.Dropout(rate=dropout_rate)
    if nb_classes == 2:
      self.last_layer = layers.Dense(units=1, activation='sigmoid')
    else:
      self.last_layer = layers.Dense(units=nb_classes, activation='softmax')
  def embed_with_bert(self, all_tokens):
    _, embs = self.bert_layer([all_tokens[:,0,:], all_tokens[:,1,:], all_tokens[:,2,:]])
    return embs
  def call(self, inputs, training):
    x = self.embed_with_bert(inputs)
    # x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool(x_1)

    x_2 = self.trigram(x)
    x_2 = self.pool(x_2)

    x_3 = self.fourgram(x)
    x_3 = self.pool(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis=-1) #(batch, 3*nb_filters)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training)
    output = self.last_layer(merged)
    
    return output


# Training

In [0]:
# VOCAB_SIZE = len(tokenizer.vocab)
# EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2
DROPOUT_RATE = 0.2
NB_EPOCHES = 1

In [0]:
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS, nb_classes=NB_CLASSES, dropout_rate=DROPOUT_RATE)

In [0]:
if NB_CLASSES == 2:
  Dcnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
else:
  Dcnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])

In [0]:
checkpoints_path = '/content/drive/My Drive/modern_NLP/ckptbertembedding/'
ckpt = tf.train.Checkpoint(Dcnn = Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoints_path, max_to_keep=1)
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Latest checkpoint restored!!')

In [0]:
class MyCustomCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs=None):
    ckpt_manager.save()
    print("Checkpoint saved at {}".format(checkpoints_path))

In [33]:
Dcnn.fit(train_dataset, epochs=1, callbacks=[MyCustomCallback()])

  40463/Unknown - 1573s 39ms/step - loss: 0.3981 - accuracy: 0.8215Checkpoint saved at /content/drive/My Drive/modern_NLP/ckptbertembedding/


<tensorflow.python.keras.callbacks.History at 0x7f72b0fc3f28>

# Evaluation

In [34]:
results =  Dcnn.evaluate(test_dataset)



In [35]:
print(results)

[0.3549990653991699, 0.8499652147293091]


In [0]:
def get_prediction(sentence):
  # tokens = encode_sentence(sentence)
  my_sent = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
  input_ids = get_ids(my_sent)
  input_mask = get_mask(my_sent)
  segment_ids = get_segments(my_sent)

  inputs = tf.stack(
      [tf.cast(input_ids, dtype=tf.int32),
        tf.cast(input_mask, dtype=tf.int32),
        tf.cast(segment_ids, dtype=tf.int32)],
        axis=0)
  inputs = tf.expand_dims(inputs, 0) # simulates a batch
  output = Dcnn(inputs, training=False)
  sentiment = math.floor(output*2)
  if sentiment == 0:
    print('output of the model: {}\nPredicted sentiment: negative'.format(output))
  else:
    print('output of the model: {}\nPredicted sentiment: positive'.format(output))

In [37]:
get_prediction('this movie is so interesting')

output of the model: [[0.8644834]]
Predicted sentiment: positive


In [38]:
get_prediction('this movie is so boring')

output of the model: [[0.0156412]]
Predicted sentiment: negative


In [39]:
get_prediction('hahahahahahahaha')

output of the model: [[0.96148306]]
Predicted sentiment: positive
