<a href="https://colab.research.google.com/github/geekanese/hello-world/blob/master/BERT_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing dependencies

In [0]:
!pip install tf-models-official
!pip install tf-nightly

Collecting tf-models-official
[?25l  Downloading https://files.pythonhosted.org/packages/fd/83/75e858814c8ec4deb6130b86e2c896265b8343a5f68b72708e94b50c2ec9/tf_models_official-0.0.3.dev1-py2.py3-none-any.whl (694kB)
[K     |████████████████████████████████| 696kB 2.8MB/s 
Installing collected packages: tf-models-official
Successfully installed tf-models-official-0.0.3.dev1
Collecting tf-nightly
[?25l  Downloading https://files.pythonhosted.org/packages/7f/bd/2f87700c604a61cf0c3fe251645e0ce983fcb48c2cf9ea6d86dce1b993c6/tf_nightly-2.1.0.dev20191217-cp36-cp36m-manylinux2010_x86_64.whl (441.0MB)
[K     |████████████████████████████████| 441.0MB 32kB/s 
Collecting tf-estimator-nightly
[?25l  Downloading https://files.pythonhosted.org/packages/95/b3/f0fd60333bb1fb6bc1d7a60f9e6521cb753b20aca17e3e7a27a0f8437884/tf_estimator_nightly-2.0.0.dev2019121709-py2.py3-none-any.whl (451kB)
[K     |████████████████████████████████| 460kB 50.4MB/s 
Collecting tb-nightly<2.2.0a0,>=2.1.0a0
[?25l  Down

In [0]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as hub



In [0]:
tf.__version__

'2.1.0-dev20191217'

In [0]:
from official.nlp.bert.tokenization import FullTokenizer
from official.nlp import optimization

In [0]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [0]:
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "/content/drive/My Drive/projects/CNN_for_NLP/data/train.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [0]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

## Preprocessing

### Cleaning

In [0]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [0]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [0]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer (like vocab size).

In [0]:
FullTokenizer = FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

We only use the first sentence for BERT inputs so we add the CLS token at the beginning and the SEP token at the end of each sentence.

In [0]:
def encode_sentence(sent):
    return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]

In [0]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

We need to create the 3 different inputs for each sentence.

In [0]:
def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)
    
def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # turns 1 into 0 and vice versa
    return seg_ids

### Dataset creation

We will create padded batches (so we pad sentences for each batch independently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [0]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
               sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 7]

In [0]:
# A list is a type of iterator so it can be used as generator for a dataset
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [0]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE,
                                       padded_shapes=((3, None), ()),
                                       padding_values=(0, 0))

In [0]:
NB_BATCHES = len(sorted_all) // BATCH_SIZE
NB_BATCHES_TEST = NB_BATCHES // 10
NB_BATCHES_TRAIN = NB_BATCHES - NB_BATCHES_TEST
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Stage 3: Model building

In [0]:
class BERTClassifier(tf.keras.Model):

    def __init__(self,
                 nb_classes=2,
                 dropout_rate=0.1,
                 name="bert_classifier"):
        super(BERTClassifier, self).__init__(name=name)

        self.dropout_rate = dropout_rate

        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=True)
        self.last_dense = layers.Dense(
            units=nb_classes,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    
    def apply_bert(self, all_tokens):
        pooled_output, _ = self.bert_layer([all_tokens[:, 0, :],
                                            all_tokens[:, 1, :],
                                            all_tokens[:, 2, :]])
        
        return pooled_output
    
    def call(self, inputs, training):
        output = self.apply_bert(inputs)
        output = tf.nn.dropout(output, rate=self.dropout_rate)

        probs = self.last_dense(output)

        return prob

# Stage 4: Training

In [0]:
NB_CLASSES = 2

DROPOUT_RATE = 0.1

BATCH_SIZE = 32
NB_EPOCHS = 5
INIT_LR = 5e-5
WARMUP_STEPS = int(NB_BATCHES_TRAIN * 0.1)

In [0]:
bert_classifier = BERTClassifier(NB_CLASSES, DROPOUT_RATE)

In [0]:
NB_BATCHES_TRAIN

40622

In [0]:
train_dataset_light = train_dataset.take(500)
steps_per_epoch_light = 100
WARMUP_STEPS_LIGHT = int(500 * 0.1)

In [0]:
optimizer_light = optimization.create_optimizer(
    init_lr=INIT_LR,
    num_train_steps=500,
    num_warmup_steps=WARMUP_STEPS_LIGHT)

In [0]:
# very close but slightly better than standard categorical crossentropy loss
def classification_loss_fn(labels, logits):
    labels = tf.squeeze(labels)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    one_hot_labels = tf.one_hot(
        tf.cast(labels, dtype=tf.int32), depth=NB_CLASSES, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(
        tf.cast(one_hot_labels, dtype=tf.float32) * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return loss

In [0]:
bert_classifier.compile(optimizer_light,
                        classification_loss_fn,
                        [tf.keras.metrics.SparseCategoricalAccuracy()])

In [0]:
checkpoint_path = "./drive/My Drive/projects/BERT/ckpt_bert_class/"

ckpt = tf.train.Checkpoint(bert_classifier=bert_classifier)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [0]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [0]:
bert_classifier.fit(train_dataset_light,
                    steps_per_epoch=100,
                    epochs=NB_EPOCHS,
                    callbacks=[MyCustomCallback()])

Train for 100 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9d76e8f550>

# Stage 5: Evaluation

In [0]:
bert_classifier.evaluate(test_dataset)

   4513/Unknown - 4347s 963ms/step - loss: 0.3348 - sparse_categorical_accuracy: 0.8576

[0.33483418830873274, 0.8575643]

In [0]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)

    input_ids = get_ids(tokens)
    input_mask = get_mask(tokens)
    segment_ids = get_segments(tokens)

    inputs = tf.stack(
        [tf.cast(input_ids, dtype=tf.int32),
         tf.cast(input_mask, dtype=tf.int32),
         tf.cast(segment_ids, dtype=tf.int32)],
         axis=0)
    inputs = tf.expand_dims(inputs, 0) # simulates a batch

    output = bert_classifier(inputs, training=False)

    sentiment = tf.argmax(tf.squeeze(output)).numpy()

    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative".format(
            output))
    elif sentiment == 1:
        print("Output of the model: {}\nPredicted sentiment: positive".format(
            output))