## Stage 1 : Importing dependencies

In [None]:
import numpy as np
import math

# module for regular-expression based text processing
import re

# for extracting csv datasets into usable data frames
import pandas as pd

# parse xml/html form of data & decode it into usable data frame
from bs4 import BeautifulSoup

#random needed during data processing
import random

# get data from personal google drive
from google.colab import drive

In [None]:
# useful sdk for user-friendly usage of google's official package
!pip install bert-for-tf2

# required by bert-for-tf2 for decoding
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 128 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30535 sha256=13c439d61e1e5d2e0d2b7e5e368d7fb6958bb7a3dfffcd76322285eaee17fe1a
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Building wheel for params-flow (setup.py) ... [?25l[?25hdone
  Created wheel for params-flow: filename=params_flow-0.8.2-py3-none-any.whl size=19472 sha256=12e388ce7f195f862acf066fae43c85354ee46

In [None]:
try:
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf

# platform where lotta ML models are upload (for downloading weights of BERT)
import tensorflow_hub as hub

# building layers for our CNNs
from tensorflow.keras import layers
import bert

## Stage 2 : Data processing

We import files from our Google Drive

In [None]:
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Column names for the CSV, which has no header row, in file order.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Reader options kept together so the call below stays short.
read_options = dict(header=None, names=cols, engine="python", encoding="latin1")

# Load the training file from the mounted Drive into a DataFrame.
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/BERT Tokenizer/training.1600000.processed.noemoticon.csv",
    **read_options,
)

In [None]:
# Keep only the label and the tweet text. Rebinding `data` (rather than
# dropping in place) and ignoring columns that are already gone means this
# cell can be re-run without raising a KeyError.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [None]:
data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Stage 3 : Pre-processing

### Cleaning

In [None]:
def clean_tweet(tweet):
    """Strip markup, mentions, links and non-letter noise from one tweet.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with HTML markup removed, @mentions and http(s) links
        replaced by a space, every character other than ASCII letters and
        ``. ! ? '`` replaced by a space, and runs of spaces collapsed to one.
    """
    # Parse as HTML and keep only the text content.
    tweet = BeautifulSoup(tweet, "lxml").get_text()

    # Remove @mentions. The underscore is included so that a handle such as
    # "@some_user" is removed whole instead of leaving "user" in the text.
    tweet = re.sub(r"@[a-zA-Z0-9_]+", " ", tweet)
    # Remove links up to the next whitespace, so that URL tails containing
    # '-', '_', '?', '=' ... do not survive as stray words.
    tweet = re.sub(r"https?://\S+", " ", tweet)
    # Keep only letters and basic sentence punctuation.
    tweet = re.sub(r"[^a-zA-Z.!?']", " ", tweet)
    # Collapse the runs of spaces created by the substitutions above.
    tweet = re.sub(r" +", " ", tweet)
    return tweet

In [None]:
# Apply the cleaning function to every tweet, in order.
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Work on a copy: the array returned by `.values` may share memory with the
# DataFrame, in which case relabelling it would silently rewrite the
# `sentiment` column of `data` as well.
data_labels = data.sentiment.values.copy()

# Relabel class 4 as 1; every other value is left unchanged.
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to the metadata for the tokenizer (such as vocab_size)

In [None]:
# Tokenizer class provided by the bert package.
FullTokenizer = bert.bert_tokenization.FullTokenizer

# The BERT layer is loaded only to read the tokenizer information stored
# with it; trainable=False because its weights are not fine-tuned here.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)

# Vocabulary file path and lower-casing flag read from the loaded layer.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()

tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
tokenizer.tokenize("My dog loves strawberries.")

['my', 'dog', 'loves', 'straw', '##berries', '.']

In [None]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog loves strawberries."))

[2026, 3899, 7459, 13137, 20968, 1012]

In [None]:
def encode_sentence(sent):
    """Tokenize `sent` and return the vocabulary ids of its tokens."""
    word_pieces = tokenizer.tokenize(sent)
    return tokenizer.convert_tokens_to_ids(word_pieces)

In [None]:
# Encode every cleaned tweet as a list of token ids, in order.
data_inputs = list(map(encode_sentence, data_clean))

### Dataset Creation

We will create padded batches, i.e. each batch is padded independently, only up to the length of its own longest sentence. This keeps the number of padding tokens to a minimum. To make this effective, we sort the sentences by length, build the padded batches, and then shuffle the batches.


In [None]:
# Pair every encoded tweet with its label and its number of tokens.
data_with_len = [
    [token_ids, label, len(token_ids)]
    for token_ids, label in zip(data_inputs, data_labels)
]

# Shuffle first, then sort by token count: list.sort is stable, so tweets of
# equal length keep their shuffled relative order.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda entry: entry[2])

# Keep only tweets with more than 7 tokens and drop the length field.
sorted_all = [
    (token_ids, label)
    for token_ids, label, n_tokens in data_with_len
    if n_tokens > 7
]

In [None]:
# `sorted_all` now holds (token ids, label) pairs that are cleaned, sorted by
# length and filtered by length.
# Dataset.from_tensor_slices (https://www.tensorflow.org/guide/tensor_slicing)
# is the usual way to build a dataset, but it cannot be used here because the
# tweets have different lengths.
# Dataset.from_generator accepts elements of varying length: it is given a
# callable returning an iterable and emits the items one after the other.
all_dataset = tf.data.Dataset.from_generator(
    lambda: sorted_all,
    output_types=(tf.int32, tf.int32),
)

In [41]:
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([2821,  999, 4485,  999, 2773,  999, 4067, 2017], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=4>)

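Each element is a pair: the array of token ids of one tweet (e.g. [2821,  999, 4485,  999, 2773,  999, 4067, 2017]) and its corresponding label (e.g. 4). Note: a label of 4 in this captured output suggests the dataset was built before the relabelling cell (4 → 1) was run; when the notebook is executed top to bottom the labels should be 0 or 1, which is what the sigmoid output and binary cross-entropy loss used later expect.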

**Padding**  - Padding phase is done at the same time as the batching phase

In [42]:
# Number of tweets per batch.
BATCH_SIZE = 32

# Batch and pad in one step. padded_shapes mirrors the (tokens, label)
# element structure: (None,) pads the token vector to the longest one in the
# batch, () leaves the scalar label untouched. The comma between the two
# shapes is required; without it Python tries to call the tuple (None,).
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None,), ()))

In [43]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 2821,   999,  4485,   999,  2773,   999,  4067,  2017],
        [ 2893,  3201,  2000,  2175,  2067,  2000,  2082,  1012],
        [ 1045,  1005,  1049,  2085,  2206, 16299,  2006, 10474],
        [ 2428,  2123,  1005,  1056,  2215, 17776,  1042,  2497],
        [10047,  2863,  2031,  3524,  2963,  2009,  6229,  6928],
        [ 1045,  3984,  1045,  2031, 16021,  5358,  6200,  3892],
        [ 1045,  3246,  2017,  2064,  2031,  1037,  2204,  2717],
        [ 4019,  1996,  6580,  8146,  1061,  5506,  4632,  2299],
        [ 4931,  2045,  2015,  2498,  3308,  2007,  2008,  1012],
        [ 2049,  1037,  3835, 11559,  4633,  1999, 14022,  2651],
        [ 3835,   999,  2008,  1005,  1055, 12476,  1012, 23156],
        [ 2003,  4634,  2061,  2524,  2005,  2026,  6429,  2611],
        [ 2012,  2658,  2458,  1012,  1012,  1012,  1012,  1012],
        [ 1045,  1005,  1049,  2061,  3407,  2005, 18431,  1012],
        [23987,  2813, 23298

In [44]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
# Hold out one tenth of the batches for testing.
NB_BATCHES_TEST = NB_BATCHES // 10

# Shuffle the batches with a buffer as large as the number of batches, so the
# split below is not made of the first (i.e. shortest) sentences only.
# Datasets are immutable: shuffle() returns a new dataset, so its result must
# be kept. reshuffle_each_iteration=False keeps the order identical every
# time the dataset is iterated; otherwise take() and skip() would each see a
# different order and the train and test sets would overlap.
all_shuffled = all_batched.shuffle(NB_BATCHES, reshuffle_each_iteration=False)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

## Stage 3 : Model Building

In [78]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier over sequences of token ids.

    Token ids are embedded, passed through three parallel 1-D convolutions
    (kernel sizes 2, 3 and 4), each followed by global max pooling. The three
    pooled vectors are concatenated and fed to a dense layer, dropout and a
    final dense output layer.

    Args:
        vocab_size: Number of distinct token ids (size of the embedding table).
        emb_dim: Dimension of the embedding vectors.
        nb_filters: Number of convolution filters for each kernel size.
        FFN_units: Number of units of the hidden dense layer.
        nb_classes: Number of classes. 2 gives a single sigmoid output,
            anything else a softmax over `nb_classes` units.
        dropout_rate: Rate of the dropout layer.
        training: Accepted for backward compatibility and not used; the
            dropout mode is controlled by the `training` argument of `call`.
        name: Name of the Keras model.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=120,
                 nb_filters=50,
                 FFN_units=12,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"
                 ):
        super(DCNN, self).__init__(name=name)

        # Maps each token id to a trainable vector of size emb_dim.
        self.embedding = layers.Embedding(vocab_size, emb_dim)

        # Feature detectors sliding along the sequence axis over windows of
        # 2, 3 and 4 consecutive tokens. padding="valid" means no padding is
        # added; relu sets the negative values of the feature maps to 0.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")

        # Keeps, for every filter, its maximum over the sequence axis. The
        # layer has no weights, so one instance serves all three branches.
        self.pool = layers.GlobalMaxPool1D()

        # Feed-forward part: hidden dense layer, dropout, output layer.
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")

        # Dropout randomly disables units during training to limit
        # overfitting; different units are dropped at each step.
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Binary case: one sigmoid unit (> 0.5 => class 1, else class 0).
        # Multi-class case: one unit per class with softmax, giving a
        # probability for each class.
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences.
            training: Whether dropout is active; forwarded to the dropout
                layer.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)

        # Each branch convolves the embeddings and pools its OWN feature map.
        x_1 = self.pool(self.bigram(x))    # (batch_size, nb_filters)
        x_2 = self.pool(self.trigram(x))   # (batch_size, nb_filters)
        x_3 = self.pool(self.fourgram(x))  # (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

## Stage 4 : Training

In [76]:
# Hyperparameters - parameters needed for training

# Number of entries in the tokenizer's vocabulary; passed to DCNN as vocab_size.
VOCAB_SIZE = len(tokenizer.vocab)
# Passed to DCNN as emb_dim.
EMB_DIM = 200
# Passed to DCNN as nb_filters.
NB_FILTERS = 100
# Passed to DCNN as FFN_units.
FFN_UNITS = 256
# Selects the loss and metric when the model is compiled.
NB_CLASSES = 2

# Passed to DCNN as dropout_rate.
DROPOUT_RATE = 0.2

# Number of epochs passed to fit().
NB_EPOCHS = 5

In [79]:
# Build the classifier from the hyperparameters above. nb_classes is passed
# explicitly so the output layer always agrees with the loss that is later
# chosen from NB_CLASSES, instead of silently relying on the default of 2.
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [82]:
# Pick the loss/metric pair matching the number of classes, then compile once.
if NB_CLASSES == 2:
    loss_name, metric_name = "binary_crossentropy", "accuracy"
else:
    loss_name, metric_name = "sparse_categorical_crossentropy", "sparse_categorical_accuracy"

Dcnn.compile(loss=loss_name, optimizer="adam", metrics=[metric_name])

In [61]:
# Directory on the mounted Drive where checkpoints are written.
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/BERT Tokenizer/ckpt"

# Track the model so that its state can be saved and restored.
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

# Keep only the most recent checkpoint in the directory.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the latest checkpoint if one exists.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored ")


In [80]:
# Callback used to run extra work (here: checkpointing) during training.
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Saves a checkpoint through `ckpt_manager` at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        # A space after "at" keeps the path from running into the message.
        print("Checkpoint is saved at {}".format(checkpoint_path))

## Result

In [None]:
# Train on the training batches; the callback runs at the end of each epoch.
epoch_callbacks = [MyCustomCallback()]
Dcnn.fit(train_dataset, epochs=NB_EPOCHS, callbacks=epoch_callbacks)

Epoch 1/5
  37196/Unknown - 2200s 59ms/step - loss: -7279808610304.0000 - accuracy: 1.2602e-05Checkpoint is saved at/content/drive/MyDrive/Colab Notebooks/BERT Tokenizer/ckpt
Epoch 2/5

## Evaluation


In [None]:
# Score the model on the held-out test batches.
# NOTE(review): `results` presumably holds the loss followed by the compiled
# metric(s) — confirm against the Keras Model.evaluate documentation.
results = Dcnn.evaluate(test_dataset)
print(results)

In [None]:
def get_prediction(sentence):
    """Print the model output and the predicted sentiment for one sentence.

    Intended for the binary model, whose output is a single score per
    sentence: a score of 0.5 or more is reported as positive, anything lower
    as negative.

    Args:
        sentence: Text to classify.
    """
    # Convert the words of the sentence into their token ids.
    tokens = encode_sentence(sentence)

    # Add a leading batch axis: the model is called on a batch of one sentence.
    inputs = tf.expand_dims(tokens, 0)

    output = Dcnn(inputs, training=False)

    # Threshold the single score at 0.5. The former math.floor(output * 2)
    # evaluated to 2 for a score of exactly 1.0, which matched neither branch
    # and printed nothing.
    score = float(output[0, 0])
    sentiment = "positive" if score >= 0.5 else "negative"

    print("Output of the model: {}\nPredicted sentiment: {}.".format(
        output, sentiment))

In [None]:
get_prediction("This movie was pretty interesting.")

In [None]:
get_prediction("I'd rather not do that again.")