## Classification with Multilingual BERT + Fine-tuning

Let's first train a model in English

In [1]:
%load_ext autoreload
%autoreload 2

import torch
from transformers import BertTokenizer
import pandas as pd
from offenseval.nn import Tokenizer

# Load the tokenizer that belongs to the multilingual uncased BERT checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

# Special tokens in their string form.
init_token, eos_token = bert_tokenizer.cls_token, bert_tokenizer.sep_token
pad_token, unk_token = bert_tokenizer.pad_token, bert_tokenizer.unk_token

# The same special tokens as vocabulary ids.
init_token_idx, eos_token_idx = bert_tokenizer.cls_token_id, bert_tokenizer.sep_token_id
pad_token_idx, unk_token_idx = bert_tokenizer.pad_token_id, bert_tokenizer.unk_token_id

# Project-level wrapper around the BERT tokenizer.
# NOTE(review): an earlier note here said this was an attempt to "cut this down" to see
# whether memory usage improves -- presumably the wrapper truncates sequences; confirm
# against offenseval.nn.Tokenizer.
tokenizer = Tokenizer(bert_tokenizer)



In [2]:
# torchtext's data module provides Field / LabelField / TabularDataset / BucketIterator,
# all of which are used below through the `data.` prefix.
from torchtext import data

# Text field backed by the BERT tokenizer instead of a torchtext vocabulary:
#  - tokenize splits the raw text with the project tokenizer wrapper;
#  - preprocessing turns the tokens into BERT vocabulary ids, hence use_vocab=False;
#  - include_lengths=True makes batch.text a (token_ids, lengths) pair;
#  - the init/eos/pad/unk entries are given as ids because no vocabulary lookup happens.
TEXT = data.Field(
    tokenize=tokenizer.tokenize,
    include_lengths=True,
    use_vocab=False,
    batch_first=True,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)


In [3]:
# Tweet id: a single non-sequential value that is used as-is (no vocabulary).
ID = data.Field(sequential=False, use_vocab=False)

# AVG / STD hold real numbers, so they are parsed with float() and kept as float
# tensors instead of going through a vocabulary.
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
AVG = data.LabelField(use_vocab=False, preprocessing=float, dtype=torch.float)
STD = data.LabelField(use_vocab=False, preprocessing=float, dtype=torch.float)

# Categorical subtask A label; its vocabulary is built from the dev set in a later cell.
SUBTASK_A = data.LabelField()

# Training data: columns are id, text, avg, std (no categorical label).
train_dataset = data.TabularDataset(
    path="../../data/English/task_a_distant.sample.tsv",
    format="tsv",
    skip_header=True,
    fields=[
        ("id", ID),
        ("text", TEXT),
        ("avg", AVG),
        ("std", STD),
    ],
)

# Dev data: the OLID training file; the subtask B and C columns are skipped.
dev_dataset = data.TabularDataset(
    path="../../data/olid/olid-training-v1.0.tsv",
    format="tsv",
    skip_header=True,
    fields=[
        ("id", ID),
        ("text", TEXT),
        ("subtask_a", SUBTASK_A),
        ("subtask_b", None),
        ("subtask_c", None),
    ],
)

# Test data: OLID test set for subtask A.
test_dataset = data.TabularDataset(
    path="../../data/olid/test_a.tsv",
    format="tsv",
    skip_header=True,
    fields=[
        ("id", ID),
        ("text", TEXT),
        ("subtask_a", SUBTASK_A),
    ],
)

print(f"Train instances: {len(train_dataset)}")
print(f"Dev   instances: {len(dev_dataset)}")
print(f"Test instances:  {len(test_dataset)}")

Train instances: 453771
Dev   instances: 13240
Test instances:  860


Build the vocabulary for the label field. Can we just assume that 0 -> NOT (not offensive) and 1 -> OFF (offensive)?

I'm not sure, so let's add an assertion to make sure.

In [4]:
# Build the label vocabulary from the dev set, which carries the subtask_a column.
SUBTASK_A.build_vocab(dev_dataset)

# Guard the label encoding the rest of the notebook relies on: index 0 is "NOT", index 1 is "OFF".
label_names = SUBTASK_A.vocab.itos
assert label_names == ["NOT", "OFF"]

## Model


We use %load directive to show the model here...

In [5]:
BATCH_SIZE = 32

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One iterator per split; examples are ordered by token count both for bucketing
# and inside each batch.
train_it, dev_it, test_it = data.BucketIterator.splits(
    (train_dataset, dev_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
)

Compute the class weights 

In [6]:
from sklearn.utils import compute_class_weight, compute_sample_weight
import numpy as np

# Binarize the training scores: an example counts as positive (1) when its
# average score is above 0.6, negative (0) otherwise.
y = [int(example.avg > 0.6) for example in train_dataset]

# `classes` and `y` are passed by keyword (newer scikit-learn releases reject them as
# positional arguments) and `classes` is given as an ndarray, as the API documents.
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)

# Normalize so that the negative class has weight 1; the second entry is then the
# relative weight of the positive class.
class_weights = class_weights / class_weights[0]

class_weights

array([1.        , 7.70761053])

In [7]:
import torch.optim as optim
import torch.nn as nn
from transformers import AdamW, BertForSequenceClassification


# Pretrained multilingual BERT with a single-output classification head (num_labels=1),
# trained below as a binary classifier on logits.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased', num_labels=1
)

optimizer = AdamW(model.parameters(), lr=1e-5)

# Positive examples are up-weighted by the normalized weight of class 1.
pos_weight = torch.Tensor([class_weights[1]])
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Move both the network and the loss (which holds the pos_weight tensor) to the device.
model, criterion = model.to(device), criterion.to(device)


In [8]:
from tqdm.notebook import tqdm
from offenseval.nn.training import train, evaluate
from transformers.optimization import get_constant_schedule_with_warmup
import time

N_EPOCHS = 5
max_grad_norm = 1.0
model_path = "/tmp/bert_model.pt"

# Warm-up sizing, copied from the transformers documentation page... please check.
# One training step per batch, over all epochs; the first tenth of them is warm-up.
num_training_steps = len(train_it) * N_EPOCHS
num_warmup_steps = num_training_steps // 10
warmup_proportion = num_warmup_steps / num_training_steps  # ~0.1

scheduler = get_constant_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
)

# Book-keeping for checkpointing and early stopping.
best_valid_loss = float('inf')
epochs_without_improvement = 0
# Larger than N_EPOCHS, so early stopping cannot trigger with the current settings.
early_stopping_tolerance = 100

def get_target(batch):
    """Return the binary training target for a batch.

    An example is positive (1.0) when ``batch.avg`` is strictly above 0.6 and
    negative (0.0) otherwise; the comparison result is converted to floating point.
    """
    is_positive = batch.avg > 0.6
    return is_positive * 1.


pbar = tqdm(range(N_EPOCHS), ncols=1000)
pbar.set_description("Epochs")

for epoch in pbar:
    # Train for one epoch on the binarized avg score.
    train_loss, train_acc = train(
        model,
        train_it,
        optimizer,
        criterion,
        get_target=get_target,
        max_grad_norm=max_grad_norm,
        scheduler=scheduler,
        ncols=750,
    )

    # Validate against the subtask A labels of the dev set.
    valid_loss, valid_acc, valid_f1, pos_f1, neg_f1 = evaluate(
        model,
        dev_it,
        criterion,
        get_target=lambda batch: batch.subtask_a,
    )

    desc = (
        f'Train: Loss: {train_loss:.3f} Acc: {train_acc*100:.2f}%'
        f'\nVal. Loss: {valid_loss:.3f} Acc: {valid_acc*100:.2f}% Macro F1 {valid_f1:.3f} (P {pos_f1:.3f} - N {neg_f1:.3f})'
    )
    print(desc)

    if valid_loss < best_valid_loss:
        # New best validation loss: checkpoint the weights and reset the patience counter.
        best_valid_loss = valid_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), model_path)
        print(f"Best model so far (Loss {best_valid_loss:.3f} - Acc {valid_acc:.3f}, F1 {valid_f1:.3f}) saved at {model_path}")
        continue

    # No improvement this epoch: stop once the tolerance is reached.
    epochs_without_improvement += 1
    if epochs_without_improvement >= early_stopping_tolerance:
        print("Early stopping")
        break

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14181.0), HTML(value='')), layout=Layout(…


Train: Loss: 0.386 Acc: 96.39%
Val. Loss: 7.447 Acc: 80.59% Macro F1 0.756 (P 0.645 - N 0.866)
Best model so far (Loss 7.447 - Acc 0.806, F1 0.756) saved at /tmp/bert_model.pt


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14181.0), HTML(value='')), layout=Layout(…


Train: Loss: 0.235 Acc: 98.14%
Val. Loss: 5.749 Acc: 81.62% Macro F1 0.776 (P 0.680 - N 0.871)
Best model so far (Loss 5.749 - Acc 0.816, F1 0.776) saved at /tmp/bert_model.pt


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14181.0), HTML(value='')), layout=Layout(…


Train: Loss: 0.190 Acc: 98.60%
Val. Loss: 8.562 Acc: 81.06% Macro F1 0.761 (P 0.653 - N 0.870)


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14181.0), HTML(value='')), layout=Layout(…


Train: Loss: 0.148 Acc: 98.99%
Val. Loss: 9.788 Acc: 80.71% Macro F1 0.752 (P 0.634 - N 0.869)


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=14181.0), HTML(value='')), layout=Layout(…


Train: Loss: 0.106 Acc: 99.28%
Val. Loss: 9.877 Acc: 80.91% Macro F1 0.756 (P 0.642 - N 0.870)



Evaluate in dev again...

In [11]:
# Restore the checkpoint with the best validation loss before evaluating.
best_state = torch.load(model_path)
model.load_state_dict(best_state)

loss, acc, f1, pos_f1, neg_f1 = evaluate(
    model,
    dev_it,
    criterion,
    get_target=lambda b: b.subtask_a,
)

print(f'Val Loss: {loss:.3f}  Acc: {acc*100:.2f}% Macro F1: {f1:.3f} Pos F1 {pos_f1:.3f} Neg F1 {neg_f1:.3f}')

Val Loss: 5.749  Acc: 81.62% Macro F1: 0.776 Pos F1 0.680 Neg F1 0.871


And then in OLID test set

In [12]:

# Same evaluation as on dev, now over the OLID test iterator.
loss, acc, f1, pos_f1, neg_f1 = evaluate(
    model,
    test_it,
    criterion,
    get_target=lambda batch: batch.subtask_a,
)

print(f'Test Loss: {loss:.3f}  Acc: {acc*100:.2f}% Macro F1: {f1:.3f} Pos F1 {pos_f1:.3f} Neg F1 {neg_f1:.3f}')

Test Loss: 4.910  Acc: 85.00% Macro F1: 0.790 Pos F1 0.677 Neg F1 0.902


Save model (we need to save the vocabulary too!)

In [27]:
from offenseval.nn import save_model

# Persist the fine-tuned model; the TEXT field is passed along so that the helper
# can store the vocabulary next to the weights.
save_model(
    model,
    TEXT,
    "../../models/bert-seq.sample.mean06.pt",
)


Model saved to ../../models/bert-seq.sample.mean06.pt
Vocab saved to ../../models/bert-seq.sample.mean06.vocab.pkl


## Error Analysis

I reload the test set as a dataframe because the original text cannot be retrieved from the torchtext examples :-\ (they are already tokenized).

In [30]:
# Re-read the raw test TSV, indexed by its first column (the tweet id), so that the
# original text can be looked up by id later on.
df_test = pd.read_table(
    "../../data/olid/test_a.tsv",
    index_col=0,
)

df_test.columns

Index(['text', 'subtask_a'], dtype='object')

In [31]:
# Ask PyTorch to release cached GPU memory it is no longer using
# (presumably to leave room for the inference pass below -- confirm).
torch.cuda.empty_cache()

In [32]:
# Switch the model to evaluation mode before predicting.
model.eval()

# Decision threshold on the predicted probability; change it to improve recall.
threshold = 0.5

rows = []
# no_grad: this is pure inference, so autograd graphs must not be built for every
# batch (they only waste GPU memory).
with torch.no_grad():
    for batch in tqdm(test_it):
        # TEXT was declared with include_lengths=True, so batch.text is a
        # (token_ids, lengths) pair; the lengths are not needed here.
        text, _ = batch.text

        # The first element of the model output holds the logits (one per example,
        # since the model was created with num_labels=1).
        # NOTE(review): no attention_mask is passed, so padding positions are attended
        # to -- confirm this matches what offenseval.nn.training does.
        outs = model(text)[0].cpu()
        probs = torch.sigmoid(outs).reshape(-1)
        preds = probs > threshold

        for tid, label, pred, prob in zip(
            batch.id.tolist(),
            batch.subtask_a.tolist(),
            preds.tolist(),
            probs.tolist(),
        ):
            rows.append({
                "id": tid,
                # Original (untokenized) text, looked up by tweet id.
                "text": df_test.loc[tid, "text"],
                "label": label,
                "pred": int(pred),
                "prob": prob,
            })

# One row per test example, indexed by tweet id.
df = pd.DataFrame(rows).set_index("id")

HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




In [35]:
from sklearn.metrics import accuracy_score, f1_score

# Gold labels and thresholded predictions collected in the error-analysis frame.
labels, preds = df["label"], df["pred"]

acc = accuracy_score(labels, preds)

# F1 of the offensive class, then F1 of the non-offensive class (obtained by
# flipping the 0/1 encoding), and their unweighted mean (macro F1).
pos_f1 = f1_score(labels, preds)
neg_f1 = f1_score(1 - labels, 1 - preds)
avg_f1 = (pos_f1 + neg_f1) / 2

acc, pos_f1, neg_f1, avg_f1

(0.85, 0.6766917293233082, 0.9023467070401211, 0.7895192181817147)

In [36]:
# Boolean masks over gold labels and predictions (1 = offensive, 0 = not offensive).
label_off, label_not = df["label"] == 1, df["label"] == 0
pred_off, pred_not = df["pred"] == 1, df["pred"] == 0

true_pos = df[label_off & pred_off].copy()
true_neg = df[label_not & pred_not].copy()
# Missed offensive tweets, lowest predicted probability first.
false_neg = df[label_off & pred_not].sort_values("prob", ascending=True)
# Wrongly flagged tweets, highest predicted probability first.
false_pos = df[label_not & pred_off].sort_values("prob", ascending=False)

# Rows are the real class, columns the predicted class.
conf_matrix = pd.DataFrame(
    [
        {"real": "not off", "pred not off": len(true_neg), "pred off": len(false_pos)},
        {"real": "off", "pred not off": len(false_neg), "pred off": len(true_pos)},
    ]
).set_index("real")

print("Falsos negativos: {}".format(len(false_neg)))
print("Falsos positivos: {}".format(len(false_pos)))

conf_matrix

Falsos negativos: 105
Falsos positivos: 24


Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,596,24
off,105,135


Normalized 

In [37]:
# Show floats with three decimals from here on.
pd.set_option("display.float_format", '{:,.3f}'.format)

# Each cell as a fraction of all test examples.
conf_matrix.div(len(df))

Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,0.693,0.028
off,0.122,0.157


We should improve recall on offensive tweets

In [44]:
# Full text of the fifth false negative (false_neg is sorted by ascending probability).
false_neg["text"].iloc[4]

'#Christian #America – If we go by #Trump’s example, where liberals support open borders, I guess conservatives support school shootings.   Please explain how this makes America great again.'

Is this really offensive?

In [45]:
# First false negative; false_neg is sorted by ascending probability, so this is the
# offensive tweet the model scored lowest.
false_neg.iloc[0]

text     #EmmyAwards2018 - Ratings tank as expected.  W...
label                                                    1
pred                                                     0
prob                                                 0.000
Name: 34575, dtype: object

Is there any error?