## Classification with Multilingual BERT

Let's first train a model on English data.

In [1]:
import torch
from transformers import BertTokenizer
import pandas as pd



In [2]:
from transformers import BertTokenizer

# WordPiece tokenizer for the multilingual, uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

# Special tokens in string form ...
init_token, eos_token = tokenizer.cls_token, tokenizer.sep_token
pad_token, unk_token = tokenizer.pad_token, tokenizer.unk_token

# ... and their vocabulary ids, which is what the torchtext Field is configured with.
init_token_idx, eos_token_idx = tokenizer.cls_token_id, tokenizer.sep_token_id
pad_token_idx, unk_token_idx = tokenizer.pad_token_id, tokenizer.unk_token_id

# Longest single-sentence token sequence, as reported by the tokenizer
# (presumably already leaves room for the two special tokens -- TODO confirm).
max_input_length = tokenizer.max_len_single_sentence

In [3]:
from torchtext import data

def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` with the module-level ``tokenizer`` and keep at
    most ``max_input_length`` tokens from the start."""
    return tokenizer.tokenize(sentence)[:max_input_length]

# Field for the tweet text. Tokens are turned into BERT vocabulary ids in the
# preprocessing step, so torchtext builds no vocabulary of its own
# (use_vocab=False) and the special tokens are given as ids, not strings.
TEXT = data.Field(
    use_vocab=False,
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
    batch_first=True,
    include_lengths=True,
)


In [4]:
# Tweet id column: not tokenized and not mapped through a vocabulary.
ID = data.Field(sequential=False, use_vocab=False)

# The distant-supervision columns hold floats rather than categorical labels,
# hence use_vocab=False plus an explicit float cast and dtype.
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
AVG = data.LabelField(use_vocab=False, preprocessing=float, dtype=torch.float)
STD = data.LabelField(use_vocab=False, preprocessing=float, dtype=torch.float)

# Categorical label; its vocabulary is built in a later cell.
SUBTASK_A = data.LabelField()

# Training data: id, text and the avg/std scores.
train_dataset = data.TabularDataset(
    path="../data/English/task_a_distant.sample.tsv",
    format="tsv",
    skip_header=True,
    fields=[("id", ID), ("text", TEXT), ("avg", AVG), ("std", STD)],
)

# Dev data: only the subtask_a label is loaded; subtask_b / subtask_c are skipped.
dev_dataset = data.TabularDataset(
    path="../data/olid/olid-training-v1.0.tsv",
    format="tsv",
    skip_header=True,
    fields=[
        ("id", ID),
        ("text", TEXT),
        ("subtask_a", SUBTASK_A),
        ("subtask_b", None),
        ("subtask_c", None),
    ],
)

print(f"Train instances: {len(train_dataset)}")

print(f"Dev   instances: {len(dev_dataset)}")

Train instances: 90754
Dev   instances: 13240


Build the vocabulary for the label field. Can we assume that 0 -> NOT (not offensive) and 1 -> OFF (offensive)?

I'm not sure, so the next cell asserts it explicitly.

In [5]:
# build_vocab decides which integer each label string is mapped to.
SUBTASK_A.build_vocab(dev_dataset)

# The evaluation code compares these integer labels with rounded sigmoid
# outputs, so index 1 must be the offensive class. Fail with the actual order
# in the message if that assumption does not hold.
assert SUBTASK_A.vocab.itos == ["NOT", "OFF"], (
    f"Unexpected label order {SUBTASK_A.vocab.itos}; expected ['NOT', 'OFF']"
)

In [6]:
from transformers import BertModel

# Pretrained encoder; same checkpoint name as the tokenizer loaded earlier.
bert = BertModel.from_pretrained('bert-base-multilingual-uncased')

In [7]:
# Sanity check: run one Spanish sentence through the tokenizer and the encoder.
tokens = tokenizer.tokenize("¿Lo creerás, Ariadna? dijo Teseo. El minotauro apenas se defendió.")

token_ids = torch.LongTensor(tokenizer.convert_tokens_to_ids(tokens))
print(tokenizer.convert_ids_to_tokens(token_ids))

# I need to reshape it before BERT consumes it
# (batch_len, seq_len)... in this case batch_len == 1
# Slice the result instead of tuple-unpacking it directly: slicing works both
# when the model returns a plain tuple and when it returns a ModelOutput
# (iterating a ModelOutput would yield its keys, not its tensors).
# No gradients are needed for this shape check.
with torch.no_grad():
    last_hidden, clf = bert(token_ids.view(1, -1))[:2]

last_hidden.shape, clf.shape

['¿', 'lo', 'creer', '##as', ',', 'aria', '##dna', '?', 'dijo', 'tese', '##o', '.', 'el', 'mino', '##tau', '##ro', 'apenas', 'se', 'defend', '##io', '.']


(torch.Size([1, 21, 768]), torch.Size([1, 768]))

## Model


In [8]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """BERT encoder followed by a GRU and a linear output layer.

    Args:
        bert: encoder module. Its ``config.to_dict()['hidden_size']`` sets the
            GRU input width, and ``bert(text)[0]`` is used as the token
            representations.
        hidden_dim: GRU hidden size.
        output_dim: number of outputs of the final linear layer.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        finetune_bert: when False, the encoder is called under
            ``torch.no_grad()`` so no gradients flow into it.
        dropout: probability for the dropout applied to the final hidden
            state, and for the GRU's own dropout when ``n_layers >= 2``.
    """

    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers=1,
                 bidirectional=False,
                 finetune_bert=False,
                 dropout=0.2):
        super().__init__()

        self.bert = bert
        self.finetune_bert = finetune_bert

        encoder_width = bert.config.to_dict()['hidden_size']
        n_directions = 2 if bidirectional else 1
        # Dropout inside the GRU is only requested for stacked (2+) layers.
        rnn_dropout = dropout if n_layers >= 2 else 0

        self.rnn = nn.GRU(input_size=encoder_width,
                          hidden_size=hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=rnn_dropout)

        # A bidirectional GRU contributes one final state per direction.
        self.out = nn.Linear(hidden_dim * n_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def _encode(self, text):
        """Return the first element of the encoder output for ``text``,
        tracking gradients only when ``finetune_bert`` is set."""
        if self.finetune_bert:
            return self.bert(text)[0]
        with torch.no_grad():
            return self.bert(text)[0]

    def forward(self, text):
        """Map token ids ``[batch size, sent len]`` to ``[batch size, out dim]``."""
        # embedded = [batch size, sent len, emb dim]
        embedded = self._encode(text)

        # final_states = [n layers * n directions, batch size, hid dim]
        _, final_states = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Join the last two entries (final states of the top layer's two directions).
            summary = torch.cat((final_states[-2], final_states[-1]), dim=1)
        else:
            summary = final_states[-1]

        # output = [batch size, out dim]
        return self.out(self.dropout(summary))

In [9]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
# NOTE(review): N_LAYERS is never passed to the constructor below, so the model
# is built with the class default n_layers=1 (and, with a single layer, the GRU's
# own dropout is 0, so DROPOUT only affects the dropout before the linear layer).
# Either pass n_layers=N_LAYERS or drop this constant -- confirm which was intended.
N_LAYERS = 2
DROPOUT = 0.25

# Only hidden_dim, output_dim and dropout override the class defaults here.
model = BERTGRUSentiment(
    bert,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT)

In [10]:
#for name, param in model.named_parameters():                
#    if name.startswith('bert'):
#        param.requires_grad = False
        
def count_parameters(model):
    """Return the total element count over the parameters of ``model`` that
    have ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# NOTE(review): this figure counts every parameter whose requires_grad flag is set.
# The loop that would freeze the BERT weights is commented out, so they are
# presumably included here even though forward() runs BERT under torch.no_grad()
# when finetune_bert is False.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 168,144,641 trainable parameters


In [12]:
BATCH_SIZE = 128

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# One iterator per dataset; examples are keyed by their token count and each
# batch is sorted by that key.
train_it, dev_it = data.BucketIterator.splits(
    datasets=(train_dataset, dev_dataset),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
)

In [13]:
# Peek at one training batch to compare two ways of binarizing the scores.
batch = next(iter(train_it))

# Fraction of positives under each rule: avg > 0.5 versus (avg - std) > 0.5.
# Taking the mean of the boolean mask, rather than dividing by a hard-coded
# 128, keeps the figures correct if BATCH_SIZE changes or the batch is not full.
print(
    (batch.avg > 0.5).float().mean().item(),
    ((batch.avg - batch.std) > 0.5).float().mean().item(),
)

# Convert to labels those which mu - std > 0.5
1.0 * ((batch.avg - batch.std) > 0.5)

0.140625 0.09375


tensor([0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
        0., 0.], device='cuda:0')

## Train

In [14]:
import torch
from sklearn.metrics import accuracy_score, f1_score


def train(model, iterator, optimizer, criterion, threshold=0.6):
    """
    Trains the model for one full epoch.

    Args:
        model: module called as ``model(text)``; its output is squeezed on
            dim 1, so it is expected to be ``(batch, 1)`` logits.
        iterator: sized iterable of batches exposing ``batch.text`` (a
            ``(token_ids, lengths)`` pair) and ``batch.avg`` (float scores).
        optimizer: optimizer over the model parameters; stepped once per batch.
        criterion: loss called as ``criterion(logits, target)`` with float targets.
        threshold: a score strictly above this value is labelled positive.
            NOTE(review): the exploratory cell earlier in the notebook
            binarizes with ``(avg - std) > 0.5`` while this uses
            ``avg > 0.6`` -- confirm which rule is intended.

    Returns:
        ``(mean loss per batch, mean accuracy per batch)`` as Python floats.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        text, _ = batch.text  # the lengths are not used by the model

        logits = model(text).squeeze(1)
        target = (batch.avg > threshold).float()

        loss = criterion(logits, target)

        # Accuracy is computed in torch on tensors of matching shape, so it
        # needs no device transfer and always yields a plain Python float.
        preds = torch.round(torch.sigmoid(logits.detach()))
        acc = (preds == target).float().mean().item()

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    """
    Evaluates the model on the given iterator

    Returns a tuple ``(mean loss per batch, accuracy, average F1)``, where the
    average F1 is the mean of the F1 for label 1 and the F1 for label 0.
    """
    model.eval()

    running_loss = 0
    batch_probas = []
    batch_labels = []

    with torch.no_grad():
        for batch in iterator:
            text, lens = batch.text
            gold = batch.subtask_a

            logits = model(text)
            running_loss += criterion(logits.squeeze(1), gold.float()).item()

            batch_probas.append(torch.sigmoid(logits))
            batch_labels.append(gold.cpu())

        # Metrics are computed once over the whole set rather than per batch.
        all_probas = torch.cat(batch_probas).cpu()
        all_labels = torch.cat(batch_labels).cpu()
        hard_preds = torch.round(all_probas)

        f1_label1 = f1_score(all_labels, hard_preds)
        # Flipping labels and predictions gives the F1 of the other class.
        f1_label0 = f1_score(1 - all_labels, 1 - hard_preds)
        mean_f1 = (f1_label1 + f1_label0) / 2
        accuracy = accuracy_score(all_labels, hard_preds)

    return running_loss / len(iterator), accuracy, mean_f1


In [15]:

import torch.optim as optim


# Move the model and the loss module to the target device first, and only then
# build the optimizer over the model's parameters (the order the torch.optim
# documentation recommends).
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)

optimizer = optim.Adam(model.parameters())
# Reduce the learning rate when the monitored value (the validation loss passed
# to scheduler.step in the training loop) stops decreasing.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)


In [16]:
from tqdm.notebook import tqdm
import time

N_EPOCHS = 20
early_stopping_tolerance = 5
model_path = "/tmp/bert_model.pt"

best_valid_loss = float('inf')
epochs_without_improvement = 0

pbar = tqdm(range(N_EPOCHS), ncols=1000)
for epoch in pbar:
    # Inner progress bar over the training batches of this epoch.
    epoch_bar = tqdm(train_it)

    train_loss, train_acc = train(model, epoch_bar, optimizer, criterion)
    valid_loss, valid_acc, valid_f1 = evaluate(model, dev_it, criterion)
    scheduler.step(valid_loss)

    desc = (
        f'Train: Loss: {train_loss:.3f} Acc: {train_acc*100:.2f}%'
        f'\nVal. Loss: {valid_loss:.3f} Acc: {valid_acc*100:.2f}% F1 {valid_f1:.3f}'
    )
    pbar.set_description(desc)

    if valid_loss < best_valid_loss:
        # New best validation loss: reset the patience counter and checkpoint.
        best_valid_loss = valid_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), model_path)
        print(f"Best model so far (Loss {best_valid_loss:.3f} - Acc {valid_acc:.3f}, F1 {valid_f1:.3f}) saved at {model_path}")
        continue

    # No improvement this epoch: stop once the tolerance is used up.
    epochs_without_improvement += 1
    if epochs_without_improvement >= early_stopping_tolerance:
        print("Early stopping")
        break

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis‚Ä¶

HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))


Best model so far (Loss 0.718 - Acc 0.779, F1 0.710) saved at /tmp/bert_model.pt


HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=710.0), HTML(value='')))


Early stopping


In [17]:
# Restore the best checkpoint written by the training loop. map_location puts
# the stored tensors on the current device, so a checkpoint saved from a GPU
# run can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

loss, acc, f1 = evaluate(model, dev_it, criterion)

print(f'Val Loss: {loss:.3f}  Acc: {acc*100:.2f}% F1: {f1:.3f}')

Val Loss: 0.718  Acc: 77.91% F1: 0.710


## Error Analysis

I reload the data as a dataframe because the original text cannot be recovered from the dataset examples (they are already tokenized) :-\

In [32]:
# Read the same file that backs dev_dataset; the first column becomes the index,
# so rows can later be looked up by tweet id.
df_dev = pd.read_table("../data/olid/olid-training-v1.0.tsv", index_col=0)

df_dev.columns

Index(['tweet', 'subtask_a', 'subtask_b', 'subtask_c'], dtype='object')

In [58]:
model.eval()

rows = []
# Inference only: no_grad avoids recording an autograd graph for every dev batch.
with torch.no_grad():
    for batch in tqdm(dev_it):
        text, lens = batch.text

        outs = model(text)
        ids = batch.id
        labels = batch.subtask_a
        # Squeeze dim 1 so each example has a scalar probability / prediction.
        probs = torch.sigmoid(outs).squeeze(1)
        preds = torch.round(probs)

        # Convert each tensor to a Python list once instead of calling .item()
        # on every element.
        records = zip(
            ids.cpu().tolist(),
            labels.cpu().tolist(),
            preds.cpu().tolist(),
            probs.cpu().tolist(),
        )
        for tid, label, pred, prob in records:
            rows.append({
                "id": tid,
                # Original (untokenized) tweet, looked up by id in the raw frame.
                "text": df_dev.loc[tid, "tweet"],
                "label": label,
                "pred": int(pred),
                "prob": prob
            })

# One row per dev example, indexed by tweet id.
df = pd.DataFrame(rows).set_index("id")

HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))




In [61]:
# Split the predictions into the four confusion-matrix cells
# (label/pred 1 is the offensive class).
true_pos = df.query("label == 1 and pred == 1").copy()
true_neg = df.query("label == 0 and pred == 0").copy()
# Errors are ordered with the most confident mistakes first.
false_neg = df.query("label == 1 and pred == 0").sort_values("prob", ascending=True)
false_pos = df.query("label == 0 and pred == 1").sort_values("prob", ascending=False)

conf_matrix = pd.DataFrame(
    [
        {"real": "not off", "pred not off": len(true_neg), "pred off": len(false_pos)},
        {"real": "off", "pred not off": len(false_neg), "pred off": len(true_pos)},
    ]
).set_index("real")

print("Falsos negativos: {}".format(len(false_neg)))
print("Falsos positivos: {}".format(len(false_pos)))

conf_matrix

Falsos negativos: 2480
Falsos positivos: 445


Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,8395,445
off,2480,1920


Normalized confusion matrix (each cell as a fraction of all dev examples):

In [62]:
# Display floats with three decimals (this sets a global pandas display option).
pd.options.display.float_format = '{:,.3f}'.format

# Each cell divided by the total number of rows in df.
conf_matrix / len(df)

Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,0.634,0.034
off,0.187,0.145


We've got a HUGE number of false negatives: more than half of the offensive tweets are predicted as not offensive. We have to work on that.

In [70]:
# false_neg is sorted by ascending probability, so this is the 100th most
# confident miss.
false_neg.iloc[99]["text"]

"@USER @USER I'm talking about someone from a historical background and heritage of Guns. She a Roman catholic... They are generally all for gun control. You do your research sir."

Is this really offensive?

In [71]:
# Full row for the same example: text, gold label, prediction and probability.
false_neg.iloc[99]

text     @USER @USER I'm talking about someone from a h...
label                                                    1
pred                                                     0
prob                                                 0.001
Name: 66104, dtype: object

Is there an error in how the label was loaded? Let's check against the raw data.

In [73]:
# Look up the same tweet id in the raw frame to see its original annotations.
df_dev.loc[66104]

tweet        @USER @USER I'm talking about someone from a h...
subtask_a                                                  OFF
subtask_b                                                  TIN
subtask_c                                                  OTH
Name: 66104, dtype: object

In [77]:
# Do not truncate long text columns when displaying frames.
pd.set_option('max_colwidth', None)
# First 30 rows of false_neg, i.e. the misses with the lowest predicted probability.
false_neg.iloc[:30]

Unnamed: 0_level_0,text,label,pred,prob
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
65102,@USER @USER @USER @USER So if Paul Golding turned up for a job at the BBC you'd let him work for them?,1,0,0.0
54306,"@USER @USER Hillary? Why is she on the social studies curriculum? Why? Who is she to be in any curriculum? She is nothing! I'm shocked that she was in any curriculum, but I am very happy to see that she is OUT!"" (I am so glad she's nowhere near the @USER",1,0,0.0
33600,@USER @USER when robyn posts a selfie the world stops for a moment to appreciate it,1,0,0.0
42410,@USER @USER Feinstein wanted to delay until after the midterms. Hopefully the RED wave will wash her into the Atlantic. #MAGA,1,0,0.001
98517,@USER #AskAlly Please tell me something what should i do to help my mom when she is at home with a broken leg cause she's in Hospital now with a broken leg 😭😭😭😭,1,0,0.001
45781,Sean Penn: #MeToo Movement Seeks to ‘Divide Men and Women’ URL &lt;— Gotta say… I stand w/ Jeff Spicoli on this one. URL #MAGA #WWG1WGA #ConfirmKavanaugh,1,0,0.001
79079,@USER I have you and Kaze- I need more.,1,0,0.001
31527,@USER He is quite good at faking. Must have done it numerous times,1,0,0.001
85971,@USER Man I hope I don’t die before Florence hits.. I heard he is going to be a great time!!,1,0,0.001
42205,@USER @USER #Privilege Lane🤔 #MAGA Blvd🤔 Good Ole Days Avenue🤔,1,0,0.001
