# Twitter Sentiment Analysis using BERT embedding layer.

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [2]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

### Loading the data file

In [3]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [4]:
data.head(2)

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...


In [5]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

## Preprocessing

### Cleaning

In [6]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [7]:
data['text'][0]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [10]:
data.shape

(1600000, 2)

In [12]:
data['sentiment'].sum()

3200000

Taking only 10 percent of the data.

In [13]:
data_short = data.sample(frac=0.1)

In [15]:
data_short.shape 

((160000, 2), 320016)

In [17]:
print('The fraction of postive sentiments :- ',len(data_short[data_short['sentiment']==4])/len(data_short))

The fraction of postive sentiments :-  0.500025


In [18]:
#Thus the shortened Dataset is a balanced one 

In [19]:
data_clean = [clean_tweet(tweet) for tweet in data_short.text]

In [30]:
data_short[0:2]

Unnamed: 0,sentiment,text
1514154,4,@Epicx they are pretty astro. think I may have...
730651,0,work. sunday. work. *sigh*


In [27]:
data_clean[0]

' they are pretty astro. think I may have to download their music mate '

In [31]:
data_labels = data_short.sentiment.values
data_labels[data_labels == 4] = 1

In [35]:
print(len(data_clean) , data_labels.shape)

160000 (160000,)


### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer.

In [36]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

We only use the first sentence for BERT inputs so we add the CLS token at the beginning and the SEP token at the end of each sentence.

In [37]:
def encode_sentence(sent):
    return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]

In [42]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

In [43]:
data_inputs[-1]

['[CLS]',
 'well',
 'he',
 'needs',
 'the',
 'alarm',
 'clock',
 'reset',
 'then',
 '[SEP]']

### Dataset creation

We need to create the 3 different inputs for each sentence.

In [47]:
def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # turns 1 into 0 and vice versa
    return seg_ids

We will create padded batches (so we pad sentences for each batch independently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [48]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 7]

In [58]:
data_with_len[-1]

[['[CLS]',
  'cc',
  '##tv',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  'paperwork',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  'cc',
  '##tv',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  'lunch',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  'cc',
  '##tv',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  'sack',
  'nah',
  '##eed',
  '##ul',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  'cc',
  '##tv',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  '.',
  'home',
  '[SEP]'],
 0,
 102]

In [55]:
len(sorted_all)

144484

In [59]:
sorted_all[-1]

([[101,
   10507,
   9189,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   17397,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   10507,
   9189,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   6265,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   10507,
   9189,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   12803,
   20976,
   13089,
   5313,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   10507,
   9189,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   1012,
   2188,
   102],
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,


In [60]:
# A list is a type of iterator so it can be used as generator for a dataset
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [64]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE,
                                       padded_shapes=((3, None), ()),
                                       padding_values=(0, 0))

In [66]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

In [67]:
NB_BATCHES

4516

In [70]:
NB_BATCHES_TEST 

451

In [71]:
BATCH_SIZE

32

In [72]:
next(iter(train_dataset))

(<tf.Tensor: shape=(32, 3, 10), dtype=int32, numpy=
 array([[[  101,  2053,  3291,  2428,  3246,  2017,  2424,  2115,  4937,
            102],
         [    1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0]],
 
        [[  101,  1045,  4299,  1045,  2001,  2004,  4427,  2004,  2017,
            102],
         [    1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0]],
 
        [[  101,  4372,  7913, 11365,  2696,  4012, 14833,  2022,  7474,
            102],
         [    1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0]],
 
        [[  101,  5983,  2026,  8808, 17955, 10261,  2001,  2182,  1012,
            102],
         [    1,     1,     1

# Building the Model

In [73]:
class DCNNBERTEmbedding(tf.keras.Model):
    
    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="dcnn"):
        super(DCNNBERTEmbedding, self).__init__(name=name)
        
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def embed_with_bert(self, all_tokens):
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training):
        x = self.embed_with_bert(inputs)

        print(x.shape)

        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Training the model

In [74]:
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

In [75]:
Dcnn = DCNNBERTEmbedding(nb_filters=NB_FILTERS,
                         FFN_units=FFN_UNITS,
                         nb_classes=NB_CLASSES,
                         dropout_rate=DROPOUT_RATE)

In [76]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [77]:
Dcnn.layers

[<tensorflow_hub.keras_layer.KerasLayer at 0x21f06476160>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x21f109bf310>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x21f109bf1c0>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x21f109bb730>,
 <tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D at 0x21f109bf490>,
 <tensorflow.python.keras.layers.core.Dense at 0x21f109bbdc0>,
 <tensorflow.python.keras.layers.core.Dropout at 0x21f109b69d0>,
 <tensorflow.python.keras.layers.core.Dense at 0x21f109b6b20>]

In [78]:
checkpoint_path = "ckpt_bert_embedding/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [79]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

## Training the model on the data set 

In [81]:
Dcnn.fit(train_dataset,
         epochs=1,
         callbacks=[MyCustomCallback()])

   4065/Unknown - 7940s 2s/step - loss: 0.4187 - accuracy: 0.8087Checkpoint saved at ckpt_bert_embedding/.


<tensorflow.python.keras.callbacks.History at 0x21f11211160>

# Evaluation

In [82]:
results = Dcnn.evaluate(test_dataset)
print(results)

(None, None, 768)
[0.3807750642299652, 0.8404933214187622]


In [83]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)

    input_ids = get_ids(tokens)
    input_mask = get_mask(tokens)
    segment_ids = get_segments(tokens)

    inputs = tf.stack(
        [tf.cast(input_ids, dtype=tf.int32),
         tf.cast(input_mask, dtype=tf.int32),
         tf.cast(segment_ids, dtype=tf.int32)],
         axis=0)
    inputs = tf.expand_dims(inputs, 0) # simulates a batch

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative".format(
            output))
    elif sentiment == 1:
        print("Output of the model: {}\nPredicted sentiment: positive".format(
            output))

In [88]:
get_prediction("This actor is a awesome.")

(1, 8, 768)
Output of the model: [[0.93041694]]
Predicted sentiment: positive


In [89]:
Dcnn.save_weights('Dcnn_Bert.h5')

In [91]:
get_prediction("Awww!!!! Mumbai is Humid")

(1, 12, 768)
Output of the model: [[0.2719274]]
Predicted sentiment: negative


In [92]:
get_prediction("wow !!!! Mumbai is Humid")

(1, 10, 768)
Output of the model: [[0.84825623]]
Predicted sentiment: positive


In [93]:
get_prediction("I passsssssssssssed ")

(1, 10, 768)
Output of the model: [[0.68753064]]
Predicted sentiment: positive


In [98]:
get_prediction("Today the weather is sunny and bright")





(1, 9, 768)
Output of the model: [[0.9707856]]
Predicted sentiment: positive


In [99]:
from tensorflow.keras.models import clone_model

In [105]:
Dcnn.save_weights('Dcnn_Bert3.tf')

In [107]:
Dcnn.save_weights('Dcnn_Bert4.tf',save_format='tf')

In [109]:
get_prediction("i lost my car today")

(1, 7, 768)
Output of the model: [[0.00794217]]
Predicted sentiment: negative


In [110]:
get_prediction("The economy is going down")

(1, 7, 768)
Output of the model: [[0.07735083]]
Predicted sentiment: negative


In [111]:
get_prediction("Situation on the border escalates.")

(1, 10, 768)
Output of the model: [[0.24358734]]
Predicted sentiment: negative


Thus our model is able to correctly predict the sentiments conveyed in a sentence.