# Bert to the rescue!
- based on https://towardsdatascience.com/bert-to-the-rescue-17671379687f
- but changed imdb dataset (not from pytorch-nlp, but from a file, imdb_master.csv)
- So preprossing is different from the original post

In [1]:
import warnings
warnings.filterwarnings(action='ignore')
import sys
import numpy as np
import random
import torch
from torch import nn
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [2]:
from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertModel

In [3]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
import pandas as pd

In [5]:
from IPython.display import clear_output

In [6]:
random.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

## Prepare the Data

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
df = pd.read_csv('imdb_master.csv', encoding='latin-1')
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.drop(['file'], axis = 1, inplace = True)
train_df = df[df['type'] == 'train'][:500].append(df[df['type'] == 'train'][-500:])
test_df = df[df['type'] =='test'][:50].append(df[df['type'] =='test'][-50:])
train_texts = train_df['review'].tolist()
test_texts = test_df['review'].tolist()
train_labels = train_df['label']
test_labels = test_df['label']
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))

len(train_tokens), len(test_tokens)  

train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids.shape, test_tokens_ids.shape

train_y = np.array(train_labels) != 'neg'
test_y = np.array(test_labels) != 'neg'
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

# Baseline

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [9]:
baseline_model = make_pipeline(CountVectorizer(ngram_range=(1,3)), LogisticRegression()).fit(train_texts, train_y)

In [10]:
baseline_predicted = baseline_model.predict(test_texts)

In [11]:
print(classification_report(test_y, baseline_predicted))

              precision    recall  f1-score   support

       False       0.88      0.58      0.70        50
        True       0.69      0.92      0.79        50

    accuracy                           0.75       100
   macro avg       0.78      0.75      0.74       100
weighted avg       0.78      0.75      0.74       100



# Bert Model

In [86]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        pooled_output = self.bert(tokens, attention_mask=masks)[0]
        pooled_output=torch.tensor(pooled_output)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba
        

In [87]:
bert_clf = BertBinaryClassifier()

In [75]:
train_tokens_ids[:3]

array([[ 101, 2466, 1997, ...,    0,    0,    0],
       [ 101, 3199, 1005, ..., 2004, 2172,  102],
       [ 101, 2023, 2143, ...,    0,    0,    0]])

In [77]:
x = torch.tensor(train_tokens_ids[:3])
y = bert_clf.bert(x)

# Fine-tune BERT

In [78]:
# device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce RTX 2070 SUPER'

In [79]:
BATCH_SIZE = 4
EPOCHS = 10

In [80]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [81]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [82]:
param_optimizer = list(bert_clf.sigmoid.named_parameters()) 
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [83]:
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [84]:
bert_clf = bert_clf.to(device)
bert_clf = nn.DataParallel(bert_clf)
bert_clf.cuda()

DataParallel(
  (module): BertBinaryClassifier(
    (bert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (dropout): Dropout(p=0.1, inplace=False)
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_a

In [89]:
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    
    print('Epoch: ', epoch_num + 1)
    
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        logits = bert_clf(token_ids, masks)
        
        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()
        
        
        bert_clf.zero_grad()
        batch_loss.backward()
        

        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()
        
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_df) / BATCH_SIZE, train_loss / (step_num + 1)))
        

Epoch:  1


RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

In [54]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
    

In [55]:
np.mean(bert_predicted)

0.61

In [56]:
print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

       False       0.92      0.72      0.81        50
        True       0.77      0.94      0.85        50

    accuracy                           0.83       100
   macro avg       0.85      0.83      0.83       100
weighted avg       0.85      0.83      0.83       100

