<a href="https://colab.research.google.com/github/funpi89/NLP_demo/blob/master/BertTokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing dependencies

In [0]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [0]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/4c/2a/79f44178ac6f5b6995bc7804898fce2654da70e0c5c7f76332748420d6fd/bert-for-tf2-0.13.5.tar.gz (40kB)
[K     |████████▏                       | 10kB 25.4MB/s eta 0:00:01[K     |████████████████▎               | 20kB 6.1MB/s eta 0:00:01[K     |████████████████████████▍       | 30kB 7.2MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 3.8MB/s 
[?25hCollecting py-params>=0.7.3
  Downloading https://files.pythonhosted.org/packages/e1/aa/a143f0193a4fb5c7f8aa816b30366e441ff6ffe6cda4887e4c01300c4b01/py-params-0.8.3.tar.gz
Collecting params-flow>=0.7.1
  Downloading https://files.pythonhosted.org/packages/0d/12/2604f88932f285a473015a5adabf08496d88dad0f9c1228fab1547ccc9b5/params-flow-0.7.4.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [0]:
try:
  %tensorflow_version 2.x
except: 
  pass
import tensorflow as tf
tf.__version__

TensorFlow 2.x selected.


'2.1.0'

In [0]:
import tensorflow_hub as hub

In [0]:
from tensorflow.keras import layers
import bert

# Data Preprocessing

## loading files

In [0]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv('/content/drive/My Drive/modern_NLP/train.csv', header=None, names=cols, engine='python', 
                   encoding='latin1')


In [0]:
data.drop(["id", "date", "query", "user"], axis=1, inplace=True)

In [0]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Cleaning Data

In [0]:
def clean_tweet(tweet):
  tweet = BeautifulSoup(tweet, 'lxml').get_text()
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ',tweet)
  tweet = re.sub(r"https?://[A-Za-z0-9./]+"," ", tweet)
  tweet = re.sub(r"[^a-zA-Z.!?]", " ",tweet)
  tweet = re.sub(r" +", " ",tweet)
  return tweet

In [0]:
data_clean = data['text'].apply(clean_tweet)

In [0]:
data_labels = data['sentiment']
data_labels[data_labels == 4] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Tokenizer

In [0]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [0]:
tokenizer.tokenize("My dog's habbit is running.")

['my', 'dog', "'", 's', 'ha', '##bb', '##it', 'is', 'running', '.']

In [0]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog is running."))

[2026, 3899, 2003, 2770, 1012]

In [0]:
def encode_sentence(sent):
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [0]:
data_inputs = data_clean.apply(encode_sentence)

## Dataset Creation

In [0]:
data_with_len = [[sent, data_labels[i], len(sent)] for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [(sent_lab[0], sent_lab[1]) for sent_lab in data_with_len if sent_lab[2] > 7]

In [0]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all, output_types=(tf.int32, tf.int32))

In [0]:
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([4241, 6862,  999, 2821, 1045, 2293, 2009,  999], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [0]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ),()))

In [0]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 4241,  6862,   999,  2821,  1045,  2293,  2009,   999],
        [ 2024,  2017,  3809,  1029,  1029,  2008, 19237,  4757],
        [ 1012,  1012,  4283,  2005,  6276,  2009,  2125,  4365],
        [ 7929,  2053,  2062, 10821, 14032,   999,  2293,  2166],
        [ 2026,  2540,  2180,  1056,  2292,  2017,  2175,   999],
        [10916,  2049,  2980,  1999,  2026,  8239, 20996, 17650],
        [ 2339,  2079,  2017,  2562, 24234,  2075,  6207,  1012],
        [ 2183,  2000,  3335,  3725,  2015,  2288,  5848,   999],
        [ 3504,  2066,  1037,  2524,  4632,  1999,  2023,  2028],
        [ 9712,  2057,  2293,  2017,  2041,  2182,  1999,  6027],
        [ 1055,  1043, 14945,  1055,  2031,  2351, 12300,  7249],
        [ 1051,  2053,  4189,   999,  8549,  3407,  5798,  2295],
        [ 2074,  2288,  2188,  2189, 19613,  2131,  2012,  2033],
        [ 3067,  2064,  2191, 15174,  2021,  2097,  2025,  3153],
        [ 8945, 12171, 12171

In [0]:
NB_BATCHES = math.ceil(len(sorted_all) // BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Model Building

In [0]:
class DCNN(tf.keras.Model):
  def __init__(self, vocab_size, emb_dim=128, nb_filters=50, FFN_units=512, nb_classes=2, dropout_rate=0.1,
               training=False, name='dcnn'):
    super(DCNN, self).__init__(name=name)
    
    self.embedding = layers.Embedding(vocab_size, emb_dim)
    self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding='valid', activation='relu')
    self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding='valid', activation='relu')
    self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding='valid', activation='relu')
    self.pool = layers.GlobalMaxPooling1D()
    self.dense_1 = layers.Dense(units=FFN_units, activation='relu')
    self.dropout = layers.Dropout(rate=dropout_rate)
    if nb_classes == 2:
      self.last_layer = layers.Dense(units=1, activation='sigmoid')
    else:
      self.last_layer = layers.Dense(units=nb_classes, activation='softmax')

  def call(self, inputs, training):
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool(x_1)

    x_2 = self.trigram(x)
    x_2 = self.pool(x_2)

    x_3 = self.fourgram(x)
    x_3 = self.pool(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis=-1) #(batch, 3*nb_filters)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training)
    output = self.last_layer(merged)
    
    return output


# Training

In [0]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2
DROPOUT_RATE = 0.2
NB_EPOCHES = 5

In [0]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE, emb_dim=EMB_DIM, nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS, nb_classes=NB_CLASSES, dropout_rate=DROPOUT_RATE)

In [0]:
if NB_CLASSES == 2:
  Dcnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
else:
  Dcnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])

In [0]:
checkpoints_path = '/content/drive/My Drive/modern_NLP/ckptbert/'
ckpt = tf.train.Checkpoint(Dcnn = Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoints_path, max_to_keep=1)
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Latest checkpoint restored!!')

In [0]:
class MyCustomCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs=None):
    ckpt_manager.save()
    print("Checkpoint saved at {}".format(checkpoints_path))

In [0]:
Dcnn.fit(train_dataset, epochs=1, callbacks=[MyCustomCallback()])

  36913/Unknown - 2328s 63ms/step - loss: 0.4122 - accuracy: 0.8118Checkpoint saved at /content/drive/My Drive/modern_NLP/ckptbert/


<tensorflow.python.keras.callbacks.History at 0x7ff198366390>

# Evaluation

In [0]:
results =  Dcnn.evaluate(test_dataset)

In [0]:
print(results)

[0.366759982983809, 0.8409306]


In [0]:
def get_prediction(sentence):
  tokens = encode_sentence(sentence)
  inputs = tf.expand_dims(tokens, 0)
  output = Dcnn(inputs, training=False)
  sentiment = math.floor(output*2)
  if sentiment == 0:
    print('output of the model: {}\nPredicted sentiment: negative'.format(output))
  else:
    print('output of the model: {}\nPredicted sentiment: positive'.format(output))

In [0]:
get_prediction('this movie is so interesting')

output of the model: [[0.973313]]
Predicted sentiment: positive


In [0]:
get_prediction('this movie is so boring')

output of the model: [[0.0826827]]
Predicted sentiment: negative


In [0]:
get_prediction('hahahahahahahaha')

output of the model: [[0.9127056]]
Predicted sentiment: positive
