## Stage 1: Importing dependencies

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math
import re
from bs4 import BeautifulSoup

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[?25l[K     |████████                        | 10 kB 24.2 MB/s eta 0:00:01[K     |████████████████                | 20 kB 26.8 MB/s eta 0:00:01[K     |███████████████████████▉        | 30 kB 11.7 MB/s eta 0:00:01[K     |███████████████████████████████▉| 40 kB 9.1 MB/s eta 0:00:01[K     |████████████████████████████████| 41 kB 137 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30534 sha256=3ec24c66dff69adabc895f2de82e8b39203e6eec188d620b69b3caa903e3d9a2
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Build

In [None]:
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

## Stage 2: Data Preprocessing

### Get data

In [None]:
!wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

--2021-10-23 20:51:23--  http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip [following]
--2021-10-23 20:51:23--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip’


2021-10-23 20:51:25 (45.1 MB/s) - ‘trainingandtestdata.zip’ saved [81363704/81363704]



In [None]:
!unzip /content/trainingandtestdata.zip

Archive:  /content/trainingandtestdata.zip
  inflating: testdata.manual.2009.06.14.csv  
  inflating: training.1600000.processed.noemoticon.csv  


In [None]:
#!unzip /content/trainingandtestdata.zip

### Preprocess

In [None]:
# Column labels are supplied through `names`, so every row of the file is read
# as data; the file is decoded as latin1.
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv("/content/training.1600000.processed.noemoticon.csv",
                   names=cols,
                   encoding="latin1")

In [None]:
# Keep only the label and the tweet text. Rebinding to a new frame (instead of
# an in-place drop) plus errors="ignore" makes this cell safe to re-run: a
# second execution finds nothing left to drop instead of raising KeyError.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [None]:
# Preview the first rows of the remaining columns.
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Check row count, dtypes and non-null counts before cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   text       1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


### Cleaning

In [None]:
def clean_tweet(tweet):
    """Strip markup, mentions, URLs and non-text symbols from a raw tweet.

    Args:
        tweet: Raw tweet text; may contain HTML entities/tags, @mentions
            and http(s) links.

    Returns:
        The cleaned string. Only ASCII letters, spaces and the characters
        . ! ? ' remain, and every run of spaces is collapsed to one space.
        A single leading or trailing space is not stripped.
    """
    # Let the HTML parser decode entities and drop any tags.
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Delete @mentions. The underscore is part of the handle: without it,
    # "@some_user" would leave the fragment "user" behind as a fake word.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Delete URL links up to the next whitespace, so characters such as
    # "-", "_", "?", "=" or "%" inside a link do not cut the match short and
    # leave pieces of the URL in the text.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Just keep letters and important punctuation
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Remove additional spaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Apply clean_tweet to every tweet in the text column (one pass over all rows).
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [None]:
# Work on a copy of the label array: `.values` can expose the DataFrame's own
# buffer, so remapping in place would silently rewrite `data.sentiment` too
# (or fail outright when pandas hands back a read-only array).
data_labels = data.sentiment.values.copy()
# The positive class is stored as 4 in the raw file; remap it to 1.
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to the metadata required by the tokenizer (such as the vocabulary file and the lower-casing flag).

In [None]:
# Build a tokenizer from the assets stored with the TF Hub BERT module.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# The layer is loaded here only to read its tokenizer assets; it is frozen
# (trainable=False).
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# Path of the vocabulary file and the lower-casing flag stored with the module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Sanity check: split a sample sentence into word pieces.
tokenizer.tokenize("my dog loves strawberries.")

['my', 'dog', 'loves', 'straw', '##berries', '.']

In [None]:
# Same sentence, mapped from word pieces to their vocabulary ids.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("my dog loves strawberries."))

[2026, 3899, 7459, 13137, 20968, 1012]

In [None]:
def encode_sentence(sent):
  """Turn a sentence into a list of vocabulary ids.

  The module-level `tokenizer` first splits `sent` into tokens, then maps
  each token to its id. No special tokens or padding are added here.
  """
  word_pieces = tokenizer.tokenize(sent)
  token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
  return token_ids

In [None]:
# Encode every cleaned tweet into its list of token ids.
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

### Dataset Creation

We will create padded batches (so we pad sentences for each batch independently); this way we add as few padding tokens as possible. To do that, we sort the sentences by length, apply `padded_batch`, and then shuffle the resulting batches.

In [None]:
import random

# Seed for the pre-sort shuffle, fixed so that the order of equal-length
# sentences (and therefore the batch contents) is the same on every run.
LENGTH_SHUFFLE_SEED = 42
# Sentences with fewer tokens than this are discarded.
MIN_SEQ_LEN = 8

data_with_len = [[sent, label, len(sent)]
                 for sent, label in zip(data_inputs, data_labels)]
# Shuffle first, then sort by length: the sort is stable, so sentences of the
# same length keep the shuffled (random) relative order.
random.Random(LENGTH_SHUFFLE_SEED).shuffle(data_with_len)
data_with_len.sort(key=lambda item: item[2])
sorted_all = [(sent, label)
              for sent, label, length in data_with_len
              if length >= MIN_SEQ_LEN]

In [None]:
# Wrap the length-sorted (token_ids, label) pairs in a tf.data pipeline; a
# generator is used because the sequences have different lengths. Both
# components are declared as int32.
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [None]:
BATCH_SIZE = 32
# Pad each batch on its own: the token axis (None) is padded to the longest
# sequence in that batch, while the labels are scalars and need no padding.
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [None]:
# Inspect the first batch (the shortest sentences, since the data is sorted by length).
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 6865,  2378,  2007,  2026,  2190,  2666, 10514,  5831],
        [ 9089,  5243,  1998,  1056,  9148,  2595,  9805,  7382],
        [ 2145,  2012,  2147,  4394, 25550,  2172,  1012,  1012],
        [ 4485,  1045,  2131,  2125,  2012,  2809,  2085,  1012],
        [ 2017,  2064,  2272,  2058,  7188,  2017,  2215,   999],
        [11498,  2497, 24978, 19538,  2050,  6846,  2206, 22399],
        [ 4553,  2000,  4875,  3020,  1012,  1012,  1012,   999],
        [ 2397,  2305, 22715,  9956,  2015,  2007,  2564,   999],
        [ 1996,  9592,  1997,  2008,  2025,  6230,  2003,  2471],
        [ 5458,  6016,  2026,  5001,  7685,  1999,  2005, 10315],
        [ 2053,  1012,  1012,  1012,  2026,  3274,  3844,  2091],
        [15203,  1029,  2106,  2017, 15301,  1996,  8872,  1029],
        [ 1996,  8505, 19718,  2074,  8271,  2033,  2039,  1012],
        [15854,  4283,  2000,  2005,  1996,  4620,   999,  1060],
        [10047,  3374,  2000

In [None]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10  # hold out 10% of the batches for testing
SPLIT_SEED = 42

# tf.data transformations return a NEW dataset, so the result of shuffle() must
# be kept; calling it as a bare statement leaves `all_batched` sorted by length
# and the "test" split would then be just the shortest sentences.
# reshuffle_each_iteration=False (with a fixed seed) pins one batch order for
# every pass, which keeps take() and skip() disjoint across epochs; with the
# default reshuffling, test batches would leak into the training split.
all_shuffled = all_batched.shuffle(NB_BATCHES,
                                   seed=SPLIT_SEED,
                                   reshuffle_each_iteration=False)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

## Stage 3: Model building

In [None]:
class DCNN(tf.keras.Model):
    """CNN text classifier over token ids.

    Token ids are embedded, then fed to three parallel 1-D convolutions with
    kernel sizes 2, 3 and 4. Each convolution output is max-pooled over the
    sequence axis, the three pooled vectors are concatenated and passed
    through a dense layer, dropout and the output layer.

    Args:
        vocab_size: Number of rows of the embedding table.
        emb_dim: Size of each token embedding.
        nb_filters: Number of filters of every convolution.
        FFN_units: Units of the hidden dense layer.
        nb_classes: 2 builds a single sigmoid output unit; any other value
            builds a softmax layer with `nb_classes` units.
        dropout_rate: Drop rate applied after the hidden dense layer.
        training: Unused; kept only so existing callers that pass it keep
            working. The training flag that matters is the one given to
            `call`.
        name: Keras model name.

    Note:
        The convolutions use padding="valid", so the input sequence must
        contain at least 4 tokens for the kernel-size-4 branch.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # The pooling layer has no weights, so one instance serves all branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences, (batch_size, seq_len).
            training: Forwarded to the dropout layer. Defaults to None so
                the model can also be called without the flag, in which
                case Keras decides the mode.

        Returns:
            (batch_size, 1) sigmoid outputs when the model was built with
            nb_classes == 2, otherwise (batch_size, nb_classes) softmax
            outputs.
        """
        x = self.embedding(inputs)  # (batch_size, seq_len, emb_dim)
        # Conv1D is channels-last by default, so filters are the last axis.
        x_1 = self.bigram(x)  # (batch_size, seq_len-1, nb_filters)
        x_1 = self.pool(x_1)  # (batch_size, nb_filters)
        x_2 = self.trigram(x)  # (batch_size, seq_len-2, nb_filters)
        x_2 = self.pool(x_2)  # (batch_size, nb_filters)
        x_3 = self.fourgram(x)  # (batch_size, seq_len-3, nb_filters)
        x_3 = self.pool(x_3)  # (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

## Stage 4: Training

In [None]:
# Hyperparameters for the model and the training run.
VOCAB_SIZE = len(tokenizer.vocab)  # one embedding row per tokenizer vocabulary entry
EMBED_DIM = 200
NB_FILTERS = 96
FFN_UNITS = 256
NB_CLASSES = 2  # 2 selects the single-unit sigmoid head in DCNN
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

In [None]:
# Instantiate the model with the hyperparameters defined above.
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMBED_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
# The loss must match the output head DCNN builds: one sigmoid unit for two
# classes (binary cross-entropy), a softmax layer otherwise (sparse
# categorical cross-entropy on integer labels).
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [None]:
# NOTE(review): this is a path relative to the working directory, while the
# data cells use the absolute /content/... location — confirm which is intended.
checkpoint_path = "./content/checkpoints/"

# Track the model under the key "Dcnn".
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

# Keep only the most recent checkpoint on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the latest checkpoint when one exists in checkpoint_path.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest Checkpoint restored!")

In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that writes a checkpoint at the end of every epoch.

    Relies on the module-level `ckpt_manager` and `checkpoint_path`.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Save through the checkpoint manager and report where it went."""
        ckpt_manager.save()
        message = "Checkpoint saved at {}.".format(checkpoint_path)
        print(message)

In [None]:
# Train on the training split; the callback saves a checkpoint after each epoch.
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
Checkpoint saved at ./content/checkpoints/.
Epoch 2/5
Checkpoint saved at ./content/checkpoints/.
Epoch 3/5
Checkpoint saved at ./content/checkpoints/.
Epoch 4/5
Checkpoint saved at ./content/checkpoints/.
Epoch 5/5
Checkpoint saved at ./content/checkpoints/.


<keras.callbacks.History at 0x7f79182758d0>

## Stage 5: Evaluation

In [None]:
# Evaluate on the held-out batches; the result lists the loss followed by the
# metric(s) configured in compile().
results = Dcnn.evaluate(test_dataset)
results



[0.4363931715488434, 0.8263099193572998]

In [None]:
def get_prediction(sentence):
  """Print the model output and the predicted sentiment for one sentence.

  The sentence is encoded with `encode_sentence`, run through the
  module-level `Dcnn` in inference mode, and the single output value is
  thresholded at 0.5: values >= 0.5 are reported as positive, anything
  below as negative. Only meaningful for the two-class model, whose output
  holds a single value.

  Args:
    sentence: Raw text to classify.

  Returns:
    None; the result is printed.
  """
  tokens = encode_sentence(sentence)
  inputs = tf.expand_dims(tokens, 0)  # add the batch axis

  output = Dcnn(inputs, training=False)
  # Compare against the threshold explicitly. The previous floor(output * 2)
  # evaluated to 2 when the output saturated at exactly 1.0, which matched
  # neither branch and printed nothing.
  probability = float(output[0][0])
  label = "positive" if probability >= 0.5 else "negative"

  print("Output of the model: {}\nPredicted sentiment: {}".format(output, label))

In [None]:
# Spot-check the trained model on a few hand-written sentences.
get_prediction("This movie is pretty interesting")
get_prediction("My dog likes to go to the park")
get_prediction("This traffic is killing me")
get_prediction("I bought medicine from pharmacy")
get_prediction("I bought medicine from pharmacy to heal")

Output of the model: [[0.9992894]]
Predicted sentiment: positive
Output of the model: [[0.9408146]]
Predicted sentiment: positive
Output of the model: [[0.00045076]]
Predicted sentiment: negative
Output of the model: [[0.01798723]]
Predicted sentiment: negative
Output of the model: [[0.0001173]]
Predicted sentiment: negative
