# BERT to the rescue!
## Version 2
An attempt to adapt the notebook from [this tutorial](https://towardsdatascience.com/bert-to-the-rescue-17671379687f) to our binarized LIAR dataset. Here, the model used is not the BERT sequence classifier implemented in the tutorial, but `BertForSequenceClassification` from the huggingface/transformers library.



In [0]:
!pip install transformers pytorch-nlp
# NOTE(review): this installs only transformers and pytorch-nlp. Keras (with its
# TensorFlow backend) is imported below but is not installed by this command;
# presumably it is preinstalled in the runtime — confirm before running elsewhere.

In [0]:
import sys
import math
import numpy as np
import pandas as pd
import random as rn
import torch
from transformers import BertModel
from transformers import BertTokenizer
from torch import nn
from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [0]:
# Seed every random number generator the notebook imports so that a
# "Restart & Run All" repeats the same run as far as the libraries allow.
# Set USE_FIXED_SEED = False to deliberately sample a fresh, unseeded run.
SEED = 321
USE_FIXED_SEED = True

if USE_FIXED_SEED:
  rn.seed(SEED)                 # Python's built-in `random` module
  np.random.seed(SEED)          # NumPy's global RNG
  torch.manual_seed(SEED)       # PyTorch CPU RNG
  torch.cuda.manual_seed(SEED)  # PyTorch CUDA RNG (ignored when CUDA is unavailable)

## Prepare the Data

In [0]:
def get_data(url, test_size=None, random_state=None):
	'''
	Read the dataset, drop incomplete rows and split it into train and test sets.

	Parameters
	----------
	url : str
		Location of a CSV file with a 'content' (text) column and a 'label'
		column; passed straight to ``pd.read_csv``.
	test_size : float or int, optional
		Forwarded to ``train_test_split``; ``None`` keeps its default split.
	random_state : int, optional
		Forwarded to ``train_test_split``; pass an int for a reproducible split.

	Returns
	-------
	x_train, x_test, y_train, y_test
		The four arrays produced by ``train_test_split`` (texts first, labels
		second).
	'''
	complete_data = pd.read_csv(url)
	# dropna() returns a new frame instead of modifying in place, so the result
	# must be assigned. Only the two columns that are actually used are checked,
	# so missing values in unrelated columns do not discard usable rows.
	complete_data = complete_data.dropna(subset=['content', 'label'])
	texts = complete_data['content'].to_numpy()

	labels = complete_data['label'].to_numpy()

	print('Data will be returned as: ')
	print('x_train, x_test, y_train, y_test')
	return train_test_split(texts, labels, test_size=test_size, random_state=random_state)

In [0]:
# Raw CSV of the binarized dataset, fetched straight from GitHub.
url = (
    'https://raw.githubusercontent.com/frietz58/euvsvirus/master/'
    'datasets/cleaned_data/liar_data_b.csv'
)
x_train, x_test, y_train, y_test = get_data(url)
# Show the shape of every split as the cell output.
tuple(split.shape for split in (x_train, y_train, x_test, y_test))

Data will be returned as: 
x_train, x_test, y_train, y_test


((9591,), (9591,), (3197,), (3197,))

Check balance between classes:

In [0]:
# The mean of the label array is the share of samples labelled 1 in each split.
y_train.shape, y_test.shape, y_train.mean(), y_test.mean()

((9591,), (3197,), 0.4434365551037431, 0.4385361276196434)

BERT tokenization: BERT was trained using the WordPiece tokenization. It means that a word can be broken down into more than one sub-words. This kind of tokenization is beneficial when dealing with out of vocabulary words, and it may help better represent complicated words. The sub-words are constructed during the training time and depend on the corpus the model was trained on.

In [0]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint;
# do_lower_case=True asks it to lowercase the text before splitting it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
# Sanity check of the tokenizer: the output is lowercased and a word can be
# split into several pieces, with continuation pieces prefixed by '##'.
tokenizer.tokenize('Hi my name is Pia bitches')

['hi', 'my', 'name', 'is', 'pia', 'bitch', '##es']

The cell below tokenizes each statement with the tokenizer created above, keeps only the first 510 word pieces, and wraps them in the special [CLS] and [SEP] tokens, so that no sequence is longer than 512 tokens (the maximum sequence size for BERT). This is done for both the train and test sets:

In [0]:
def _wrap_with_special_tokens(text):
    """Tokenize `text`, keep at most 510 pieces and add [CLS] / [SEP] around them."""
    pieces = tokenizer.tokenize(text)[:510]
    return ['[CLS]'] + pieces + ['[SEP]']


train_tokens = [_wrap_with_special_tokens(text) for text in x_train]
test_tokens = [_wrap_with_special_tokens(text) for text in x_test]

len(train_tokens), len(test_tokens)

(9591, 3197)

Next, we need to convert each token in each review to an id as present in the tokenizer vocabulary. If there’s a token that is not present in the vocabulary, the tokenizer will use the special [UNK] token and use its id. Then we pad all sequences to size 512.

NOTE: post-padding might impede an LSTM classifier!

In [0]:
# Both splits are padded/truncated at the end to a fixed length of 512 ids.
_pad_kwargs = dict(maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens],
    **_pad_kwargs
)
test_tokens_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens],
    **_pad_kwargs
)

train_tokens_ids.shape, test_tokens_ids.shape

((9591, 512), (3197, 512))

In [0]:
# Attention masks: 1.0 where the token id is greater than 0, 0.0 elsewhere
# (presumably id 0 is the padding value written by pad_sequences above).
train_masks = [list(map(float, row > 0)) for row in train_tokens_ids]
test_masks = [list(map(float, row > 0)) for row in test_tokens_ids]

# Baseline

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [0]:
# Baseline: counts of word 1- to 3-grams fed into a logistic regression.
baseline_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 3)),
    LogisticRegression(max_iter=10000),
)
baseline_model = baseline_model.fit(x_train, y_train)

In [0]:
# Predict labels for the held-out test texts with the fitted baseline pipeline.
baseline_predicted = baseline_model.predict(x_test)

In [0]:
# Per-class precision / recall / F1 of the baseline; this is the reference
# the BERT model is compared against.
print(classification_report(y_test, baseline_predicted))

              precision    recall  f1-score   support

           0       0.64      0.71      0.67      1766
           1       0.59      0.51      0.55      1431

    accuracy                           0.62      3197
   macro avg       0.61      0.61      0.61      3197
weighted avg       0.62      0.62      0.62      3197



# Bert Model

In [0]:
class BertBinaryClassifier(nn.Module):
    """BERT encoder followed by dropout and a single sigmoid output unit.

    Parameters
    ----------
    dropout : float
        Dropout probability applied to BERT's pooled output.
    pretrained_name : str
        Name or path of the pretrained weights handed to
        ``BertModel.from_pretrained``.
    """

    def __init__(self, dropout=0.1, pretrained_name='bert-base-uncased'):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        self.dropout = nn.Dropout(dropout)
        # Size the head from the loaded config instead of hard-coding 768, so
        # the classifier also works with checkpoints of a different width.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return one sigmoid probability per input sequence.

        Parameters
        ----------
        tokens : torch.Tensor
            Token ids passed to the BERT encoder.
        masks : torch.Tensor, optional
            Attention mask passed to the encoder as ``attention_mask``.
        """
        output = self.bert(tokens, attention_mask=masks)
        # Element 1 of the encoder output is the pooled sequence representation.
        pooled_output = output[1]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba
        

In [0]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    bytes_per_gb = 1024 ** 3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / bytes_per_gb, 1), 'GB')
    # memory_reserved() is the current name of the deprecated memory_cached().
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / bytes_per_gb, 1), 'GB')

Tesla P4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [0]:
# This is the model implemented in the tutorial
# bert_clf = BertBinaryClassifier()

# We want to use the model provided by huggingface/transformers
from transformers import BertForSequenceClassification
# num_labels=2 spells out the two-logit classification head (one logit per
# class) that the one-hot training labels further down rely on.
bert_clf = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# .to(device) covers both cases: it moves the model to the GPU when one was
# selected and leaves it where it is on a CPU-only machine.
bert_clf = bert_clf.to(device)
if device.type == 'cuda':
  print('Memory Usage:')
  print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
  # memory_reserved() is the current name of the deprecated memory_cached().
  print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Memory Usage:
Allocated: 0.4 GB
Cached:    0.5 GB


Test for 3 samples if the model works:

In [0]:
x = torch.tensor(train_tokens_ids[:3]).to(device)
# This is only a shape check, so skip building the autograd graph; that keeps
# the activations of three 512-token sequences from being held in memory.
with torch.no_grad():
    encoder_outputs = bert_clf.bert(x)
# Index by position instead of tuple-unpacking: integer indexing works whether
# the encoder returns a plain tuple or a transformers output object.
y, pooled = encoder_outputs[0], encoder_outputs[1]

x.shape, y.shape, pooled.shape

(torch.Size([3, 512]), torch.Size([3, 512, 768]), torch.Size([3, 768]))

*   x is of size (3, 512) , we took only 3 reviews, 512 tokens each.
*   y is of size (3, 512, 768) , this is the BERTs final layer output for each token. Each token in each review is represented using a vector of size 768.
*   pooled is of size (3, 768) this is the output of our [CLS] token, the first token in our sequence.

In [0]:
# Inference-only check of the classification head: no gradients are needed,
# so run it under no_grad to avoid keeping the autograd graph alive.
with torch.no_grad():
    y = bert_clf(x)[0]
y.cpu().numpy()

array([[-0.13442509, -0.22923341],
       [-0.05997267, -0.25016168],
       [ 0.03254256, -0.22025113]], dtype=float32)

**Note:** These are presumably the raw logits (unnormalized classification scores), one value per class: real news (0) and fake news (1).

In [0]:
# Drop the references to the demo tensors on every device so their memory can
# be reclaimed (previously this line was duplicated in both branches).
y, x, pooled = None, None, None
if device.type == 'cuda':
  torch.cuda.empty_cache()
  print('Memory Usage:')
  print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
  # memory_reserved() is the current name of the deprecated memory_cached().
  print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Memory Usage:
Allocated: 0.4 GB
Cached:    0.5 GB


# Fine-tune BERT

In [0]:
# Number of samples per batch; used by both the train and the test DataLoader.
BATCH_SIZE = 2
# Number of full passes over the training data in the fine-tuning loop.
EPOCHS = 2

In [0]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(y_train.reshape(-1, 1))
# (N, 1) labels -> one_hot -> (N, 1, 2) -> squeeze(1) -> (N, 2) float targets.
# squeeze(1) removes only the helper axis, so a split with a single sample
# still keeps its batch dimension.
train_y_tensor = torch.nn.functional.one_hot(train_y_tensor, num_classes=2).squeeze(1).float()


test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(y_test.reshape(-1, 1))
# Use the same fixed num_classes=2 as for the training labels. With
# num_classes=-1 the width is inferred from the data, which would yield a
# differently shaped target if the test split happened to contain one class.
test_y_tensor = torch.nn.functional.one_hot(test_y_tensor, num_classes=2).squeeze(1).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [0]:
def _make_loader(tokens, masks, targets, sampler_cls):
    """Bundle the tensors into a dataset and return (dataset, sampler, loader)."""
    dataset = TensorDataset(tokens, masks, targets)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataset, sampler, loader


# Training batches are drawn in random order ...
train_dataset, train_sampler, train_dataloader = _make_loader(
    train_tokens_tensor, train_masks_tensor, train_y_tensor, RandomSampler
)

# ... while the test set is traversed in its original order.
test_dataset, test_sampler, test_dataloader = _make_loader(
    test_tokens_tensor, test_masks_tensor, test_y_tensor, SequentialSampler
)

In [0]:
# This is only used with BertBinaryClassifier
# param_optimizer = list(bert_clf.sigmoid.named_parameters()) 
# optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [0]:
# Adam over all parameters of the model, i.e. the whole network is fine-tuned.
LEARNING_RATE = 3e-6
optimizer = Adam(params=bert_clf.parameters(), lr=LEARNING_RATE)

In [0]:
# Release cached GPU blocks and report memory use before training starts.
if device.type == 'cuda':
  torch.cuda.empty_cache()
  print('Memory Usage:')
  print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
  # memory_reserved() is the current name of the deprecated memory_cached().
  print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Memory Usage:
Allocated: 0.4 GB
Cached:    0.5 GB


In [0]:
# The loss module holds no state, so build it once instead of once per batch.
# It applies a sigmoid to each logit and compares it with the one-hot target.
loss_func = nn.BCEWithLogitsLoss()
# Actual number of batches per epoch (includes a final, smaller batch).
num_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0

    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        logits = bert_clf(token_ids, masks)[0]

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()

        bert_clf.zero_grad()
        batch_loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()

        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        # step_num is zero-based: report finished batches out of the true
        # batch count, together with the running mean loss of this epoch.
        print("\r" + "{0}/{1} loss: {2} ".format(step_num + 1, num_batches, train_loss / (step_num + 1)))

Epoch:  2
4795/4795 loss: 0.6951577605344076 


In [0]:
# Release cached GPU blocks and report memory use after training.
if device.type == 'cuda':
  torch.cuda.empty_cache()
  print('Memory Usage:')
  print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
  # memory_reserved() is the current name of the deprecated memory_cached().
  print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

In [0]:
bert_clf.eval()
bert_predicted = []
all_logits = []
test_loss = 0
# The loss module holds no state, so build it once instead of once per batch.
loss_func = nn.BCEWithLogitsLoss()
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        logits = bert_clf(token_ids, masks)[0]

        loss = loss_func(logits, labels)
        test_loss += loss.item()
        numpy_logits = logits.cpu().numpy()

        # The targets are one-hot, so column k scores class k: the predicted
        # label is the index of the larger logit. The previous rule
        # (column 0 > 0.5) thresholded the raw class-0 logit and therefore
        # reported "class 0 is likely" as a prediction of label 1.
        bert_predicted += list(numpy_logits.argmax(axis=1))
        # NOTE(review): this stores the class-0 logit, as before — confirm
        # that this is the column downstream code expects.
        all_logits += list(numpy_logits[:, 0])

# Mean loss per test batch.
test_loss /= max(len(test_dataloader), 1)

In [0]:
# Mean of the predictions, i.e. the share of test samples predicted as 1 / True.
np.mean(bert_predicted)

In [0]:
# Per-class precision / recall / F1 of the fine-tuned BERT model on the test set.
print(classification_report(y_test, bert_predicted))