## Classification with Multilingual BERT

Let's first train a model in English

In [1]:
import torch
from transformers import BertTokenizer
import pandas as pd



In [2]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

max_input_length = tokenizer.max_len_single_sentence

In [3]:
from torchtext import data

def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length]
    return tokens

TEXT = data.Field(
    #tokenize = 'spacy', 
    #tokenizer_language="es",
    tokenize=tokenize_and_cut,
    include_lengths = True,
    use_vocab=False,
    batch_first = True,
    preprocessing = tokenizer.convert_tokens_to_ids,
    init_token = init_token_idx,
    eos_token = eos_token_idx,
    pad_token = pad_token_idx,
    unk_token = unk_token_idx
)


In [6]:
ID = data.Field(sequential=False, use_vocab=False)
# All these arguments are because these are really floats
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
AVG = data.LabelField(dtype = torch.float, use_vocab=False, preprocessing=float)
STD = data.LabelField(dtype = torch.float, use_vocab=False, preprocessing=float)
SUBTASK_A = data.LabelField()

train_dataset = data.TabularDataset(
    "../data/English/task_a_distant.sample.tsv",
    format="tsv", skip_header=True,
    fields=[("id", ID), ("text", TEXT), ("avg", AVG), ("std", STD)],
)

dev_dataset = data.TabularDataset(
    "../data/olid/olid-training-v1.0.tsv",
    format="tsv", skip_header=True,
    fields=[("id", ID), ("text", TEXT), 
            ("subtask_a", SUBTASK_A), ("subtask_b", None), ("subtask_c", None)],
    
)

print(f"Train instances: {len(train_dataset)}")

print(f"Dev   instances: {len(dev_dataset)}")

Train instances: 90754
Dev   instances: 13240


Build vocabulary for label field. Can we just say 0 -> NOT (Offensive) 1 -> OFF? 

I don't know, just to make sure add the assertion

In [7]:
SUBTASK_A.build_vocab(dev_dataset)

assert SUBTASK_A.vocab.itos == ["NOT", "OFF"]

In [8]:
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-multilingual-uncased')

In [9]:
tokens = tokenizer.tokenize("¿Lo creerás, Ariadna? dijo Teseo. El minotauro apenas se defendió.")

token_ids = torch.LongTensor(tokenizer.convert_tokens_to_ids(tokens))
# I need to reshape it before BERT consumes it
# (batch_len, seq_len)... in this case batch_len == 1
print(tokenizer.convert_ids_to_tokens(token_ids))
last_hidden, clf = bert(token_ids.view(1, -1))

last_hidden.shape, clf.shape

['¿', 'lo', 'creer', '##as', ',', 'aria', '##dna', '?', 'dijo', 'tese', '##o', '.', 'el', 'mino', '##tau', '##ro', 'apenas', 'se', 'defend', '##io', '.']


(torch.Size([1, 21, 768]), torch.Size([1, 768]))

## Model


In [10]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers=1, 
                 bidirectional=False,
                 dropout=0.2):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
    
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):    
        #text = [batch size, sent len]
                
        with torch.no_grad():
            embedded = self.bert(text)[0]
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [11]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
DROPOUT = 0.25

model = BERTGRUSentiment(
    bert,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT)

In [12]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False
        
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 788,225 trainable parameters


In [16]:
BATCH_SIZE = 128


device = "cuda" if torch.cuda.is_available() else "cpu"

train_it, dev_it = data.BucketIterator.splits(
    (train_dataset, dev_dataset), batch_size=BATCH_SIZE, device=device,
    sort_key = lambda x: len(x.text), sort_within_batch = True,
)

In [17]:
batch = next(iter(train_it))

print(sum(batch.avg > 0.5) / 128.0, sum((batch.avg-batch.std) > 0.5) / 128.0)

# Convert to labels those which mu - std > 0.5
1.0 * ((batch.avg - batch.std) > 0.5)

tensor(0.1484, device='cuda:0') tensor(0.0938, device='cuda:0')


tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        0., 0.], device='cuda:0')

## Train

In [28]:
import torch
from sklearn.metrics import accuracy_score, f1_score


def train(model, iterator, optimizer, criterion):
    """
    Trains the model for one full epoch
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        set_batch = batch
        optimizer.zero_grad()
        text, lens = batch.text

        predictions = model(text)
        # See exploratory notebook or a few cells upwards for an explanation of this
        target = 1.0 * ((batch.avg - batch.std) > 0.5) 
        
        loss = criterion(predictions.squeeze(1), target)
        
        prob_predictions = torch.sigmoid(predictions)
        preds = torch.round(prob_predictions).detach().cpu()
        acc = accuracy_score(preds, target.cpu())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    """
    Evaluates the model on the given iterator
    """
    epoch_loss = 0
    model.eval()
    with torch.no_grad():
        predicted_probas = []
        labels = []
        for batch in iterator:
            text, lens = batch.text
            target = batch.subtask_a
            
            predictions = model(text)
            loss = criterion(predictions.squeeze(1), target.float())
            
            prob_predictions = torch.sigmoid(predictions)

            predicted_probas.append(prob_predictions)
            labels.append(target.cpu())

            epoch_loss += loss.item()

        predicted_probas = torch.cat(predicted_probas).cpu()
        labels = torch.cat(labels).cpu()

        preds = torch.round(predicted_probas)

        pos_f1 = f1_score(labels, preds)
        neg_f1 = f1_score(1-labels, 1-preds)
        avg_f1 = (pos_f1 + neg_f1) / 2
        acc = accuracy_score(labels, preds)

    return epoch_loss / len(iterator), acc, avg_f1


In [31]:

import torch.optim as optim


optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)
model = model.to(device)
criterion = criterion.to(device)


In [33]:
from tqdm.notebook import tqdm
import time

N_EPOCHS = 20

best_valid_loss = float('inf')

early_stopping_tolerance = 6
epochs_without_improvement = 0

model_path = f"/tmp/bert_model.pt"

pbar = tqdm(range(N_EPOCHS), ncols=1000)
for epoch in pbar:
    epoch_bar = tqdm(train_it)
    
    train_loss, train_acc = train(model, epoch_bar, optimizer, criterion)
    valid_loss, valid_acc, valid_f1 = evaluate(model, dev_it, criterion)
    scheduler.step(valid_loss)
    
    desc = f'Train: Loss: {train_loss:.3f} Acc: {train_acc*100:.2f}%'
    desc += f'\nVal. Loss: {valid_loss:.3f} Acc: {valid_acc*100:.2f}% F1 {valid_f1:.3f}'
    pbar.set_description(desc)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), model_path)
        print(f"Best model so far (Loss {best_valid_loss:.3f} - Acc {valid_acc:.3f}, F1 {valid_f1:.3f}) saved at {model_path}")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= early_stopping_tolerance:
            print("Early stopping")
            break

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=200.0), HTML(value='')), layout=Layout(di…

HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))


Best model so far (Loss 1.239 - Acc 0.769, F1 0.679) saved at /tmp/bert_model.pt


HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))


Best model so far (Loss 1.143 - Acc 0.774, F1 0.693) saved at /tmp/bert_model.pt


HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))


Early stopping


In [34]:
model.load_state_dict(torch.load(model_path))

loss, acc, f1 = evaluate(model, dev_it, criterion)

print(f'Val Loss: {loss:.3f}  Acc: {acc*100:.2f}% F1: {f1:.3f}')

Val Loss: 1.143  Acc: 77.41% F1: 0.693
