## Classification with Multilingual BERT + Fine tuning

Let's first train a model in English

In [1]:
%load_ext autoreload
%autoreload 2

import torch
from transformers import BertTokenizer
import pandas as pd
from offenseval import Tokenizer

# Pretrained multilingual BERT tokenizer; its special tokens are reused by the
# torchtext field defined in the next cell.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

# Special tokens in string form.
init_token, eos_token, pad_token, unk_token = (
    bert_tokenizer.cls_token,
    bert_tokenizer.sep_token,
    bert_tokenizer.pad_token,
    bert_tokenizer.unk_token,
)

# The same special tokens as vocabulary ids.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = (
    bert_tokenizer.cls_token_id,
    bert_tokenizer.sep_token_id,
    bert_tokenizer.pad_token_id,
    bert_tokenizer.unk_token_id,
)

# NOTE(review): an earlier note here said "trying to cut this down to check if
# this improves memory usage"; what is being cut down is not visible in this
# cell -- presumably it happens inside the project `Tokenizer` wrapper. Confirm.
tokenizer = Tokenizer(bert_tokenizer)



In [2]:
from torchtext import data

# The text field works directly on BERT vocabulary ids: tokens are produced and
# converted by the wrapper tokenizer, so torchtext builds no vocabulary itself.
text_field_options = dict(
    tokenize=tokenizer.tokenize,
    preprocessing=tokenizer.convert_tokens_to_ids,
    use_vocab=False,
    batch_first=True,
    # Batches come out as (token_ids, lengths) pairs.
    include_lengths=True,
    # Special tokens are given as ids because use_vocab is off.
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)

TEXT = data.Field(**text_field_options)


In [3]:
ID = data.Field(sequential=False, use_vocab=False)

# The avg/std columns are really floats, hence the explicit dtype/preprocessing.
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
float_label_options = dict(dtype=torch.float, use_vocab=False, preprocessing=float)
AVG = data.LabelField(**float_label_options)
STD = data.LabelField(**float_label_options)
SUBTASK_A = data.LabelField()

# Every file is a TSV with a header row.
tsv_options = dict(format="tsv", skip_header=True)

# Training data: distantly-labelled sample with (avg, std) scores, no gold label.
train_dataset = data.TabularDataset(
    "../data/English/task_a_distant.sample.tsv",
    fields=[("id", ID), ("text", TEXT), ("avg", AVG), ("std", STD)],
    **tsv_options,
)

# Dev data: OLID training file; the subtask B/C columns are ignored.
dev_dataset = data.TabularDataset(
    "../data/olid/olid-training-v1.0.tsv",
    fields=[
        ("id", ID),
        ("text", TEXT),
        ("subtask_a", SUBTASK_A),
        ("subtask_b", None),
        ("subtask_c", None),
    ],
    **tsv_options,
)

# Test data: OLID subtask A test file.
test_dataset = data.TabularDataset(
    "../data/olid/test_a.tsv",
    fields=[("id", ID), ("text", TEXT), ("subtask_a", SUBTASK_A)],
    **tsv_options,
)

print(f"Train instances: {len(train_dataset)}")
print(f"Dev   instances: {len(dev_dataset)}")
print(f"Test instances:  {len(test_dataset)}")

Train instances: 90754
Dev   instances: 13240
Test instances:  860


Build the vocabulary for the label field. Can we assume that 0 -> NOT (not offensive) and 1 -> OFF (offensive)?

I'm not sure, so let's add an assertion to make sure.

In [4]:
# The label vocabulary is built from the dev set, since the training set has no
# subtask_a column.
SUBTASK_A.build_vocab(dev_dataset)

# Guard the label encoding: index 0 must be NOT and index 1 must be OFF, because
# later cells treat the numericalized label as a binary "is offensive" target.
label_names = SUBTASK_A.vocab.itos
assert label_names == ["NOT", "OFF"]

## Model


We use %load directive to show the model here...

In [5]:
BATCH_SIZE = 32

# Use the GPU when one is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"


def _text_length(example):
    """Sort key for bucketing: number of tokens in the example's text."""
    return len(example.text)


# One iterator per split; examples of similar length are batched together.
train_it, dev_it, test_it = data.BucketIterator.splits(
    (train_dataset, dev_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=_text_length,
    sort_within_batch=True,
)

## Train

In [6]:
import torch
from sklearn.metrics import accuracy_score, f1_score


def train(model, iterator, optimizer, criterion, scheduler, max_grad_norm=None,
          offensive_threshold=0.6):
    """
    Trains the model for one full epoch

    Arguments:
    ---------
    model: callable returning a tuple whose first element holds the logits,
        one column per example (it is squeezed on dim 1).
    iterator: iterable of batches exposing `batch.text` (a `(token_ids, lengths)`
        pair) and `batch.avg` (float scores). If it has a `set_description`
        method (e.g. a tqdm bar), the running metrics are shown on it.
    optimizer: optimizer stepped once per batch.
    criterion: loss taking `(logits, float_targets)`.
    scheduler: learning-rate scheduler stepped once per batch.
    max_grad_norm: if truthy, gradients are clipped to this norm.
    offensive_threshold: an example is a positive target when its `avg` score
        is strictly greater than this value.

    Returns:
    -------
    (mean batch loss, mean batch accuracy) over the epoch, as Python floats.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.train()

    # Look at the iterator that was actually passed in, not at a notebook global,
    # so the function also works on a fresh kernel and with plain iterables.
    show_progress = hasattr(iterator, "set_description")

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        text, _ = batch.text
        # Binarize the distant-supervision score into a float 0/1 target.
        target = 1. * (batch.avg > offensive_threshold)

        # Squeeze once so logits and targets share the same 1-D shape
        # (avoids accidental broadcasting when comparing them below).
        logits = model(text)[0].squeeze(1)
        loss = criterion(logits, target)

        # Accuracy is only a monitoring metric: keep it out of the autograd graph.
        with torch.no_grad():
            preds = torch.round(torch.sigmoid(logits))
            acc = (preds == target).float().mean().item()

        loss.backward()
        # Gradient clipping
        if max_grad_norm:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
        epoch_acc += acc
        if show_progress:
            lr = optimizer.param_groups[0]["lr"]
            desc = f"Loss {epoch_loss / (i+1):.3f} -- Acc {epoch_acc / (i+1):.3f} -- LR {lr:.5f}"
            iterator.set_description(desc)

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    """
    Evaluates the model on the given iterator

    Each batch must expose `batch.text` (a `(token_ids, lengths)` pair) and
    `batch.subtask_a` (the numericalized gold label).

    Returns a tuple `(mean batch loss, accuracy, macro F1, positive-class F1,
    negative-class F1)`, where predictions are the rounded sigmoid outputs.
    """
    model.eval()

    total_loss = 0
    batch_probas = []
    batch_labels = []

    with torch.no_grad():
        for batch in iterator:
            token_ids, _ = batch.text
            gold = batch.subtask_a

            logits = model(token_ids)[0]
            total_loss += criterion(logits.squeeze(1), gold.float()).item()

            batch_probas.append(torch.sigmoid(logits))
            batch_labels.append(gold.cpu())

    # Gather everything on the CPU before handing it to scikit-learn.
    all_labels = torch.cat(batch_labels).cpu()
    hard_preds = torch.round(torch.cat(batch_probas).cpu())

    # F1 of the negative class is obtained by flipping labels and predictions.
    pos_f1 = f1_score(all_labels, hard_preds)
    neg_f1 = f1_score(1 - all_labels, 1 - hard_preds)
    macro_f1 = (pos_f1 + neg_f1) / 2
    accuracy = accuracy_score(all_labels, hard_preds)

    return total_loss / len(iterator), accuracy, macro_f1, pos_f1, neg_f1


Compute the class weights 

In [7]:
import numpy as np
from sklearn.utils import compute_class_weight, compute_sample_weight

# Binary gold labels of the dev set: 1 for OFF, 0 otherwise.
y = [int(elem.subtask_a == 'OFF') for elem in dev_dataset]

# `classes` and `y` are passed by keyword (and `classes` as an ndarray): recent
# scikit-learn releases reject the positional form, while older ones accept both.
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)

# normalize it so the NOT class has weight 1
class_weights = class_weights / class_weights[0]

class_weights

array([1.        , 2.00909091])

In [8]:
import torch.optim as optim
import torch.nn as nn
from transformers import AdamW, BertForSequenceClassification


# A single output logit: the task is treated as binary classification and
# trained with a sigmoid-based loss.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased', num_labels=1,
).to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

# Up-weight the positive (OFF) class by its normalized class weight.
off_weight = torch.Tensor([class_weights[1]])
criterion = nn.BCEWithLogitsLoss(pos_weight=off_weight).to(device)


In [9]:
from tqdm.notebook import tqdm
from transformers.optimization import get_constant_schedule_with_warmup
import time

# Number of passes over the training iterator.
N_EPOCHS = 5

# Warmup set-up, apparently adapted from the transformers documentation
# (the original note asked for this to be double-checked).
# The scheduler below only needs the number of warmup steps: 10% of all steps.
num_training_steps = N_EPOCHS * len(train_it)
num_warmup_steps = num_training_steps // 10
# NOTE(review): warmup_proportion is not used anywhere else in this cell.
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

scheduler = get_constant_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, 
) 


# Model selection is done on the validation loss (lower is better).
best_valid_loss = float('inf')

# NOTE(review): the tolerance (100) is larger than N_EPOCHS (5), so the
# early-stopping branch below cannot trigger with these settings.
early_stopping_tolerance = 100
epochs_without_improvement = 0
max_grad_norm = 1.0

# Checkpoint location for the best state dict (overwritten on every improvement).
model_path = f"/tmp/bert_model.pt"

# Outer bar tracks epochs; a fresh inner bar wraps the training iterator each epoch.
pbar = tqdm(range(N_EPOCHS), ncols=1000)
for epoch in pbar:

    epoch_bar = tqdm(train_it, ncols=800)

    train_loss, train_acc = train(
        model, epoch_bar, optimizer, criterion,
        max_grad_norm=max_grad_norm, scheduler=scheduler
    )
    valid_loss, valid_acc, valid_f1, pos_f1, neg_f1 = evaluate(model, dev_it, criterion)
    
    desc = f'Train: Loss: {train_loss:.3f} Acc: {train_acc*100:.2f}%'
    desc += f'\nVal. Loss: {valid_loss:.3f} Acc: {valid_acc*100:.2f}% Macro F1 {valid_f1:.3f} (P {pos_f1:.3f} - N {neg_f1:.3f})'
    pbar.set_description(desc)
    print(desc)
    # Keep only the weights of the epoch with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), model_path)
        print(f"Best model so far (Loss {best_valid_loss:.3f} - Acc {valid_acc:.3f}, F1 {valid_f1:.3f}) saved at {model_path}")
    else:
        # Stop once the validation loss has not improved for too many epochs.
        epochs_without_improvement += 1
        if epochs_without_improvement >= early_stopping_tolerance:
            print("Early stopping")
            break

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.244 Acc: 93.04%
Val. Loss: 1.259 Acc: 80.54% Macro F1 0.766 (P 0.670 - N 0.862)
Best model so far (Loss 1.259 - Acc 0.805, F1 0.766) saved at /tmp/bert_model.pt


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.108 Acc: 97.66%
Val. Loss: 1.767 Acc: 80.53% Macro F1 0.760 (P 0.656 - N 0.864)


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.080 Acc: 98.42%
Val. Loss: 2.264 Acc: 80.56% Macro F1 0.755 (P 0.643 - N 0.866)


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.059 Acc: 98.93%
Val. Loss: 2.500 Acc: 80.34% Macro F1 0.749 (P 0.633 - N 0.866)


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.043 Acc: 99.24%
Val. Loss: 2.849 Acc: 79.62% Macro F1 0.733 (P 0.604 - N 0.863)



In [10]:
# Restore the checkpoint saved at model_path (the state dict with the lowest
# validation loss seen during training).
model.load_state_dict(torch.load(model_path))

# Score the restored weights on the dev iterator.
loss, acc, f1, pos_f1, neg_f1 = evaluate(model, dev_it, criterion)

print(f'Val Loss: {loss:.3f}  Acc: {acc*100:.2f}% Macro F1: {f1:.3f} Pos F1 {pos_f1:.3f} Neg F1 {neg_f1:.3f}')

Val Loss: 1.259  Acc: 80.54% Macro F1: 0.766 Pos F1 0.670 Neg F1 0.862


In [11]:
# Reload the checkpoint from model_path so this cell does not depend on which
# weights are currently in memory.
model.load_state_dict(torch.load(model_path))

# Score the restored weights on the held-out test iterator.
loss, acc, f1, pos_f1, neg_f1 = evaluate(model, test_it, criterion)

print(f'Test Loss: {loss:.3f}  Acc: {acc*100:.2f}% Macro F1: {f1:.3f} Pos F1 {pos_f1:.3f} Neg F1 {neg_f1:.3f}')

Test Loss: 1.094  Acc: 84.30% Macro F1: 0.786 Pos F1 0.675 Neg F1 0.897


In [16]:
import os
import pickle

def save_model(model, TEXT, output_path):
    """
    Saves the model and its text field next to each other.

    Arguments:
    ---------
    model: object serialized in full with `torch.save` at `output_path`.
    TEXT: object pickled to `<output_path without extension>.vocab.pkl`.
    output_path: destination of the model file; missing parent directories
        are created.
    """
    base, _ = os.path.splitext(output_path)
    vocab_path = f"{base}.vocab.pkl"

    # Create the target directory if needed; a bare filename has no directory part.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    torch.save(model, output_path)

    with open(vocab_path, "wb") as f:
        pickle.dump(TEXT, f)

    print(f"Model saved to {output_path}")
    print(f"Vocab saved to {vocab_path}")

# Persist the fine-tuned model and the TEXT field under ../models.
save_model(model, TEXT, "../models/bert-seq.mean06.pt")


Model saved to ../models/bert-seq.mean06.pt
Vocab saved to ../models/bert-seq.mean06.vocab.pkl


## Error Analysis

I reload the dataframe because it is not possible to retrieve the original text from the examples :-\ (they are already tokenized)

In [17]:
# Raw test TSV, indexed by its first column so tweets can be looked up by id.
df_test = pd.read_table("../data/olid/test_a.tsv", index_col=0)

df_test.columns

Index(['text', 'subtask_a'], dtype='object')

In [18]:
# Release cached GPU memory before running inference for the error analysis.
torch.cuda.empty_cache()

In [48]:
# Decision threshold on the predicted probability of OFF.
# Change the threshold to improve recall
PRED_THRESHOLD = 0.5

model.eval()

rows = []
# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad():
    for batch in tqdm(test_it):
        text, lens = batch.text

        outs = model(text)[0].cpu()
        ids = batch.id.cpu()
        labels = batch.subtask_a.cpu()
        probs = torch.sigmoid(outs)
        preds = probs > PRED_THRESHOLD

        # One record per tweet; the raw text comes from df_test because the
        # dataset examples only hold token ids.
        for tid, label, pred, prob in zip(ids, labels, preds, probs):
            rows.append({
                "id": tid.item(),
                "text": df_test.loc[tid.item(), "text"],
                "label": label.item(),
                "pred": int(pred.item()),
                "prob": prob.item()
            })

df = pd.DataFrame(rows).set_index("id")

HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




In [49]:
# Recompute the test metrics from the per-tweet predictions table.
labels, preds = df["label"], df["pred"]

acc = accuracy_score(labels, preds)
pos_f1 = f1_score(labels, preds)
# F1 of the NOT class: treat label 0 as the positive class.
neg_f1 = f1_score(labels, preds, pos_label=0)
avg_f1 = (pos_f1 + neg_f1) / 2

acc, pos_f1, neg_f1, avg_f1

(0.8430232558139535, 0.674698795180723, 0.8965517241379309, 0.785625259659327)

In [50]:
# Boolean masks for gold labels and predictions (1 = OFF, 0 = NOT).
gold_off, gold_not_off = df["label"] == 1, df["label"] == 0
pred_off, pred_not_off = df["pred"] == 1, df["pred"] == 0

true_pos = df[gold_off & pred_off].copy()
true_neg = df[gold_not_off & pred_not_off].copy()
# Most confident mistakes first: lowest probability for the missed offensive
# tweets, highest probability for the wrongly flagged ones.
false_neg = df[gold_off & pred_not_off].sort_values("prob", ascending=True)
false_pos = df[gold_not_off & pred_off].sort_values("prob", ascending=False)

conf_matrix = pd.DataFrame([
    {"real": "not off", "pred not off": len(true_neg), "pred off": len(false_pos)},
    {"real": "off", "pred not off": len(false_neg), "pred off": len(true_pos)},
]).set_index("real")

print("Falsos negativos: {}".format(len(false_neg)))
print("Falsos positivos: {}".format(len(false_pos)))

conf_matrix

Falsos negativos: 100
Falsos positivos: 35


Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,585,35
off,100,140


Normalized 

In [51]:
# Show floats with three decimals (this changes the display option globally).
pd.options.display.float_format = '{:,.3f}'.format

# Each cell as a fraction of all test tweets.
conf_matrix / len(df)

Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,0.68,0.041
off,0.116,0.163


We should improve recall on offensive tweets.

In [46]:
# Text of the most confident miss (false_neg is sorted by ascending probability).
false_neg.iloc[0]["text"]

'#NoPasaran: Unity demo to oppose the far-right in #London – #antifa #Oct13 — Enough is Enough! URL'

Is this really offensive?

In [47]:
# Full record (text, label, prediction, probability) of that same tweet.
false_neg.iloc[0]

text     #NoPasaran: Unity demo to oppose the far-right...
label                                                    1
pred                                                     0
prob                                                 0.001
Name: 60133, dtype: object

Is there any error?

In [None]:
# `df_dev` is not defined by any earlier cell, so load it here the same way
# df_test is loaded, from the file the dev dataset is built from.
# NOTE(review): assumes id 79079 belongs to the OLID training file -- confirm.
df_dev = pd.read_table("../data/olid/olid-training-v1.0.tsv", index_col=0)

df_dev.loc[79079]

In [None]:
# Do not truncate long tweets when rendering the table.
pd.set_option('display.max_colwidth', None)

# The 30 most confident false negatives.
false_neg.head(30)