### Import dependencies

In [21]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [22]:
# !pip install bert-for-tf2
# !pip install sentencepiece

In [23]:
import tensorflow as tf

# Turn on memory growth for every visible GPU and echo the resulting setting,
# so the notebook output records how each device was configured.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    print('gpu', device)
    tf.config.experimental.set_memory_growth(device, True)
    growth_enabled = tf.config.experimental.get_memory_growth(device)
    print('memory growth:', growth_enabled)

gpu PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
memory growth: True


In [24]:
#!pip install tensorflow_hub
import tensorflow_hub as hub

In [25]:
import bert

## Data preprocessing
### Loading files

In [26]:
# The CSV ships without a header row, so the column names are supplied here.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Reader options gathered in one place; they are forwarded unchanged to read_csv.
read_options = dict(header=None,
                    names=cols,
                    engine="python",
                    encoding="latin1")

data = pd.read_csv("/home/dawidkubicki/Datasets/sentiment_data/data/train.csv",
                   **read_options)

In [27]:
# Preview the first rows of the freshly loaded frame (all six columns).
data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [28]:
# Keep only the label and the tweet text.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# makes this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
data = data.drop(columns=["id", "date", "query", "user"],
                 errors="ignore")

In [29]:
# Preview again: only the `sentiment` and `text` columns should remain.
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


### Cleaning

In [30]:
def clean_tweet(tweet):
    """Normalise one raw tweet into plain text for tokenization.

    Steps, in order: strip HTML markup, remove @mentions, remove http(s) URLs,
    replace every character that is not a letter or one of . ! ? ' with a
    space, and collapse runs of spaces into one.

    Args:
        tweet: raw tweet text (str).

    Returns:
        The cleaned tweet as a str. It may start or end with a single space.
    """
    # Strip HTML markup and keep only the text content.
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Remove @mentions. The underscore is part of the handle (e.g.
    # "_TheSpecialOne_"); without it in the class the handle is not matched and
    # the user name leaks into the text as an ordinary word.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Remove URLs. The previous class used the range "A-z", which only matched
    # "_" by accident (together with [ \ ] ^ `); the allowed set is now explicit.
    tweet = re.sub(r"https?://[A-Za-z0-9./_]+", ' ', tweet)
    # Keep letters and basic sentence punctuation only.
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Collapse the spaces introduced by the substitutions above.
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [31]:
# Run the cleaner over every tweet, preserving row order.
data_clean = list(map(clean_tweet, data.text))

In [32]:
# Remap the positive label 4 -> 1 and leave every other value unchanged.
# np.where builds a new array, so the DataFrame's own buffer is not modified
# through `.values` (which can be a view, and is read-only when pandas
# Copy-on-Write is enabled).
raw_labels = data.sentiment.values
data_labels = np.where(raw_labels == 4, 1, raw_labels)

### Tokenization
#### We need to create a BERT layer to have access to metadata for the tokenizer (like the vocabulary)

In [35]:
# TF-Hub handle of the BERT module whose assets configure the tokenizer.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

FullTokenizer = bert.bert_tokenization.FullTokenizer

# The layer is loaded frozen; in this cell it is only used to reach the
# vocabulary file and the lower-casing flag stored on the resolved module.
bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=False)
resolved = bert_layer.resolved_object
vocab_file = resolved.vocab_file.asset_path.numpy()
do_lower_case = resolved.do_lower_case.numpy()

tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [36]:
# Sanity check: show the tokens produced for a sample sentence.
tokenizer.tokenize("My dog love strawberries.")

['my', 'dog', 'love', 'straw', '##berries', '.']

In [38]:
# Same sample sentence, converted from tokens to vocabulary ids.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog love strawberries."))

[2026, 3899, 2293, 13137, 20968, 1012]

In [39]:
def encode_sentence(sent):
    """Tokenize `sent` with the notebook's tokenizer and return the token ids."""
    pieces = tokenizer.tokenize(sent)
    token_ids = tokenizer.convert_tokens_to_ids(pieces)
    return token_ids

In [41]:
# Encode every cleaned tweet into a list of token ids (same order as data_clean).
data_inputs = list(map(encode_sentence, data_clean))

### Dataset creation
#### We will create padded batches (so we pad sentences for each batch independently); this way we add the minimum number of padding tokens possible. For that, we sort sentences by length, apply `padded_batch`, and then shuffle the batches.

In [49]:
# One [token_ids, label, length] entry per encoded tweet.
data_with_len = [[tokens, data_labels[idx], len(tokens)]
                 for idx, tokens in enumerate(data_inputs)]

# Shuffle first, then sort by length: list sorting is stable, so tweets of the
# same length end up in random relative order.
random.shuffle(data_with_len)
data_with_len = sorted(data_with_len, key=lambda entry: entry[2])

# Keep (token_ids, label) pairs for tweets longer than two tokens.
sorted_all = [(tokens, label)
              for tokens, label, length in data_with_len
              if length > 2]

In [50]:
# Create a dataset from a generator callable: from_generator calls it and
# iterates over the (token_ids, label) pairs it returns.
def _sorted_examples():
    """Return the length-sorted (token_ids, label) examples."""
    return sorted_all


all_dataset = tf.data.Dataset.from_generator(
    _sorted_examples,
    output_types=(tf.int32, tf.int32),
)

In [51]:
# Peek at the first (token_ids, label) example produced by the dataset.
next(iter(all_dataset))

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([2025, 2438, 2051], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [52]:
BATCH_SIZE = 32

# Pad the token-id vector of each batch to that batch's longest sequence;
# labels are scalars and need no padding.
token_ids_shape = tf.TensorShape([None])
label_shape = tf.TensorShape([])
all_batched = all_dataset.padded_batch(
    BATCH_SIZE,
    padded_shapes=(token_ids_shape, label_shape),
)

In [53]:
# Peek at the first padded batch (token-id matrix and label vector).
next(iter(all_batched))

(<tf.Tensor: shape=(32, 3), dtype=int32, numpy=
 array([[ 2025,  2438,  2051],
        [ 3623,  1999,  9883],
        [ 4283,  2611,  2666],
        [ 2145,  2012,  2147],
        [ 7967, 13697,  9850],
        [ 2054,  2412,  7126],
        [ 5983,  5341, 24044],
        [ 1045,  2022, 20076],
        [ 4394,  2026,  3336],
        [15624,  9541,  2080],
        [ 3565,  6517,  2154],
        [ 2600, 20996,  1037],
        [ 6809,  2600,  2316],
        [ 2051,  2000,  3959],
        [ 2851,  4903,  2140],
        [ 3745, 20228,  2595],
        [ 2047,  8458, 11439],
        [17170,  7861,  1012],
        [ 2339,  4067, 29337],
        [ 1045,  3214,  9061],
        [ 2009,  2003,  8235],
        [ 2067,  2000,  2147],
        [ 3403,  2005,  2178],
        [ 4067,  2017,   999],
        [ 4067, 29337,   999],
        [ 2035,  2000,  2870],
        [ 2053,  3291,   999],
        [ 2205,  2116, 14799],
        [ 2026,  4091,  3480],
        [ 3582,  1996,  3003],
        [ 8906,  2000,

In [55]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10  # hold out roughly 10% of the batches

# tf.data transformations return a NEW dataset; the previous code discarded the
# result of shuffle(), so the batches stayed sorted by length and the test
# split consisted solely of the shortest tweets.
# reshuffle_each_iteration=False keeps the shuffled order fixed, so take() and
# skip() below partition the same sequence and no test batch leaks into
# training on later epochs.
# A separate name is used so that re-running this cell does not shuffle an
# already shuffled dataset.
# NOTE(review): after shuffling, the training split also contains the shortest
# batches (3 tokens, given the `> 2` length filter); the model's kernel_size=4
# "valid" convolution has no full window on those - confirm short inputs are
# handled before training.
all_shuffled = all_batched.shuffle(NB_BATCHES, reshuffle_each_iteration=False)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

## Model building

In [75]:
class DCNN(tf.keras.Model):
    """CNN text classifier over sequences of token ids.

    Pipeline: embedding -> three parallel Conv1D branches (window sizes 2, 3
    and 4) -> global max pooling per branch -> concatenation -> dense ->
    dropout -> output layer.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_dim: size of each token embedding.
        nb_filters: number of filters in each convolution branch.
        FFN_units: units of the hidden dense layer.
        nb_classes: 2 builds a single sigmoid unit; any other value builds a
            softmax layer with `nb_classes` units.
        dropout_rate: rate of the dropout applied after the hidden dense layer.
        training: unused; kept only so existing constructor calls keep working.
            The training flag that matters is the one passed to `call`.
        name: Keras model name.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)

        self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim)
        # 1D convolutions: each filter slides along the token axis only and
        # covers 2, 3 or 4 consecutive tokens.
        self.bigram = tf.keras.layers.Conv1D(filters=nb_filters,
                                             kernel_size=2,
                                             padding="valid",
                                             activation="relu")
        self.trigram = tf.keras.layers.Conv1D(filters=nb_filters,
                                              kernel_size=3,
                                              padding="valid",
                                              activation="relu")
        self.fourgram = tf.keras.layers.Conv1D(filters=nb_filters,
                                               kernel_size=4,
                                               padding="valid",
                                               activation="relu")
        # The same pooling layer object is applied to all three branches.
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.dense_1 = tf.keras.layers.Dense(units=FFN_units,
                                             activation="relu")
        self.dropout = tf.keras.layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = tf.keras.layers.Dense(units=1,
                                                    activation="sigmoid")
        else:
            self.last_dense = tf.keras.layers.Dense(units=nb_classes,
                                                    activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences, indexed as (batch, sequence).
            training: forwarded to the dropout layer; defaults to None so the
                model can also be called without an explicit flag.

        Returns:
            The output of the last dense layer: one sigmoid unit when the
            model was built with nb_classes == 2, otherwise a softmax over
            nb_classes units.
        """
        # With padding="valid" a convolution needs at least `kernel_size` time
        # steps to produce a single window. Right-pad too-short inputs with
        # token id 0 (the default fill value of padded_batch) up to the widest
        # kernel; inputs that are already long enough are left untouched.
        widest_kernel = max(conv.kernel_size[0]
                            for conv in (self.bigram, self.trigram, self.fourgram))
        missing = tf.maximum(widest_kernel - tf.shape(inputs)[1], 0)
        inputs = tf.pad(inputs, [[0, 0], [0, missing]])

        x = self.embedding(inputs)
        x_1 = self.pool(self.bigram(x))
        x_2 = self.pool(self.trigram(x))
        x_3 = self.pool(self.fourgram(x))  # each branch: (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

## Training

#### Hyperparameters

In [76]:
# Model and training hyperparameters, consumed by the cells below.
VOCAB_SIZE = len(tokenizer.vocab)  # number of entries in the tokenizer vocabulary
EMB_DIM = 200        # passed as emb_dim
NB_FILTERS = 100     # passed as nb_filters (per convolution branch)
FFN_UNITS = 256      # passed as FFN_units
NB_CLASSES = 2       # 2 selects the sigmoid head and binary cross-entropy

DROPOUT_RATE = 0.2

NB_EPOCHS = 5        # number of epochs passed to fit()

In [77]:
# Collect the constructor arguments from the hyperparameter constants, then
# build the model in a single call.
model_config = {
    "vocab_size": VOCAB_SIZE,
    "emb_dim": EMB_DIM,
    "nb_filters": NB_FILTERS,
    "FFN_units": FFN_UNITS,
    "nb_classes": NB_CLASSES,
    "dropout_rate": DROPOUT_RATE,
}
Dcnn = DCNN(**model_config)

In [78]:
# Pick the loss/metric pair matching the output head, then compile once.
is_binary = NB_CLASSES == 2
loss_name = "binary_crossentropy" if is_binary else "sparse_categorical_crossentropy"
metric_name = "accuracy" if is_binary else "sparse_categorical_accuracy"

Dcnn.compile(loss=loss_name,
             optimizer="adam",
             metrics=[metric_name])
    

In [79]:
# Create a checkpoint manager for the model and resume from the most recent
# checkpoint when one exists in `checkpoint_path`.
checkpoint_path = "/home/dawidkubicki/AI-Projects/bert-intuition/checkpoints"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

# max_to_keep=1: only the newest checkpoint is kept on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    # The attribute is `latest_checkpoint` (singular). The previous
    # `latest_checkpoints` raised AttributeError exactly when a checkpoint was
    # available, so restoring never worked.
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!")

In [80]:
# Custom callback
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that writes a checkpoint at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save through the notebook-level `ckpt_manager` and report the directory."""
        ckpt_manager.save()
        message = "Checkpoint saved at {}.".format(checkpoint_path)
        print(message)

In [83]:
# Train on the first GPU; a RuntimeError is printed instead of propagated.
try:
    with tf.device('/device:GPU:0'):
        epoch_callbacks = [MyCustomCallback()]
        Dcnn.fit(
            train_dataset,
            epochs=NB_EPOCHS,
            callbacks=epoch_callbacks,
        )
except RuntimeError as e:
    print(e)



Epoch 1/5
  42085/Unknown - 1677s 40ms/step - loss: 0.3782 - accuracy: 0.8313

KeyboardInterrupt: 

## Evaluation

In [None]:
# Evaluate on the held-out batches; `results` holds the loss followed by the
# metrics the model was compiled with.
results = Dcnn.evaluate(test_dataset)
print(results)

In [None]:
def get_prediction(sentence):
    """Print the model output and the predicted sentiment for one sentence.

    The sentence is encoded with `encode_sentence`, fed to `Dcnn` as a batch
    of one with dropout disabled, and the single output value is thresholded
    at 0.5 (>= 0.5 is positive). Only the binary (single sigmoid unit) model
    is supported.

    Args:
        sentence: text to classify (str).

    Raises:
        ValueError: if the sentence produces no tokens.
    """
    tokens = encode_sentence(sentence)
    if not tokens:
        # An empty id list would reach the model as a zero-length sequence and
        # fail there with a far less readable error.
        raise ValueError("Cannot predict sentiment: the sentence produced no tokens.")
    inputs = tf.expand_dims(tokens, 0)  # add the batch dimension

    output = Dcnn(inputs, training=False)

    # The previous code referenced an undefined name (`outputs`) and applied
    # math.floor(output * 2), which yields 2 - matching neither branch - when
    # the output is exactly 1.0. Threshold the scalar value instead.
    probability = float(output[0][0])
    sentiment = 1 if probability >= 0.5 else 0

    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative.".format(output))
    else:
        print("Output of the model: {}\nPredicted sentiment: positive.".format(output))

In [None]:
# Try the trained model on a hand-written sentence.
get_prediction("I'd rather not do that again.")