## Classification with Multilingual BERT + Fine-tuning

Let's first train a model in English

In [1]:
%load_ext autoreload
%autoreload 2

import torch
from transformers import BertTokenizer
import pandas as pd

# Lower-cased multilingual WordPiece tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

# Special tokens, in their string form ...
init_token, eos_token = tokenizer.cls_token, tokenizer.sep_token
pad_token, unk_token = tokenizer.pad_token, tokenizer.unk_token

# ... and their vocabulary ids
init_token_idx, eos_token_idx = tokenizer.cls_token_id, tokenizer.sep_token_id
pad_token_idx, unk_token_idx = tokenizer.pad_token_id, tokenizer.unk_token_id

# Maximum number of word pieces kept per text. Set to a fixed 128 instead of
# tokenizer.max_len_single_sentence to check whether this improves memory usage.
max_input_length = 128

In [2]:
from torchtext import data

def tokenize_and_cut(sentence):
    """Tokenize `sentence` and keep at most `max_input_length` leading tokens."""
    return tokenizer.tokenize(sentence)[:max_input_length]

# Text field: the tokens are mapped to BERT vocabulary ids during preprocessing,
# so torchtext does not build a vocabulary of its own (use_vocab=False) and the
# special tokens are given directly as ids.
TEXT = data.Field(
    use_vocab=False,
    batch_first=True,
    include_lengths=True,
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)


In [3]:
ID = data.Field(sequential=False, use_vocab=False)

# AVG and STD hold real numbers: skip the vocabulary and parse them with float().
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
float_field_options = dict(dtype=torch.float, use_vocab=False, preprocessing=float)
AVG = data.LabelField(**float_field_options)
STD = data.LabelField(**float_field_options)
SUBTASK_A = data.LabelField()

# Column layout of each file (columns mapped to None are not loaded)
distant_fields = [("id", ID), ("text", TEXT), ("avg", AVG), ("std", STD)]
olid_train_fields = [
    ("id", ID), ("text", TEXT),
    ("subtask_a", SUBTASK_A), ("subtask_b", None), ("subtask_c", None),
]
olid_test_fields = [("id", ID), ("text", TEXT), ("subtask_a", SUBTASK_A)]

# Training data: distantly-labelled sample (avg / std scores)
train_dataset = data.TabularDataset(
    "../data/English/task_a_distant.sample.tsv",
    format="tsv", fields=distant_fields, skip_header=True,
)

# Dev data: the OLID training file, only subtask A is kept
dev_dataset = data.TabularDataset(
    "../data/olid/olid-training-v1.0.tsv",
    format="tsv", fields=olid_train_fields, skip_header=True,
)

# Test data: OLID subtask A test file
test_dataset = data.TabularDataset(
    "../data/olid/test_a.tsv",
    format="tsv", fields=olid_test_fields, skip_header=True,
)

print(f"Train instances: {len(train_dataset)}")
print(f"Dev   instances: {len(dev_dataset)}")
print(f"Test instances:  {len(test_dataset)}")

Train instances: 90754
Dev   instances: 13240
Test instances:  860


Build vocabulary for label field. Can we just say 0 -> NOT (Offensive) 1 -> OFF? 

I'm not sure, so the cell below asserts that the label vocabulary has exactly this order.

In [4]:
# The label vocabulary is built from the dev split
SUBTASK_A.build_vocab(dev_dataset)

# Pin the label order: index 0 must be NOT and index 1 must be OFF
expected_labels = ["NOT", "OFF"]
assert SUBTASK_A.vocab.itos == expected_labels

In [5]:
from transformers import BertModel

# Pre-trained multilingual BERT encoder (same checkpoint name as the tokenizer above)
bert = BertModel.from_pretrained('bert-base-multilingual-uncased')

In [6]:
# Sanity check: run a single Spanish sentence through BERT and look at the output shapes
tokens = tokenizer.tokenize("¿Lo creerás, Ariadna? dijo Teseo. El minotauro apenas se defendió.")

token_ids = torch.LongTensor(tokenizer.convert_tokens_to_ids(tokens))
print(tokenizer.convert_ids_to_tokens(token_ids))

# BERT consumes (batch_len, seq_len) input; here batch_len == 1.
# No gradients are needed for this inspection.
with torch.no_grad():
    outputs = bert(token_ids.view(1, -1))

# Index the result instead of tuple-unpacking it: newer transformers releases
# return a dict-like ModelOutput whose iteration yields keys, not tensors.
# Integer indexing works for both the tuple and the ModelOutput form.
last_hidden, clf = outputs[0], outputs[1]

last_hidden.shape, clf.shape

['¿', 'lo', 'creer', '##as', ',', 'aria', '##dna', '?', 'dijo', 'tese', '##o', '.', 'el', 'mino', '##tau', '##ro', 'apenas', 'se', 'defend', '##io', '.']


(torch.Size([1, 21, 768]), torch.Size([1, 768]))

## Model


We use the `%load` directive to show the model here...

In [9]:
# %load ../offenseval/models/bert_gru.py
import torch.nn as nn

class BERTGRUSequenceClassifier(nn.Module):
    """
    Sequence classifier: BERT token representations fed to a GRU.

    The final hidden state of the GRU (both directions concatenated when it is
    bidirectional) goes through dropout and a linear output layer.

    Based on Ben Trevett's implementation:
    https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/6%20-%20Transformers%20for%20Sentiment%20Analysis.ipynb
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers=1,
                 bidirectional=False, finetune_bert=False, dropout=0.2):
        """
        Arguments:
            bert: encoder; calling it on a batch of token ids must return a
                sequence whose first element is [batch size, sent len, emb dim]
            hidden_dim: size of the GRU hidden state
            output_dim: number of outputs of the final linear layer
            n_layers: number of stacked GRU layers
            bidirectional: whether the GRU reads the sequence in both directions
            finetune_bert: if False, BERT is run under `torch.no_grad()`
            dropout: dropout probability (also used between GRU layers when
                there are at least two of them)
        """
        super().__init__()

        self.bert = bert
        self.finetune_bert = finetune_bert

        bert_hidden_size = bert.config.to_dict()['hidden_size']
        # Dropout inside the GRU is only enabled for stacked GRUs
        rnn_dropout = dropout if n_layers >= 2 else 0

        self.rnn = nn.GRU(
            bert_hidden_size,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=rnn_dropout,
        )

        n_directions = 2 if bidirectional else 1
        self.out = nn.Linear(hidden_dim * n_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """
        Arguments:
            text: token ids, [batch size, sent len]

        Returns:
            Tensor of shape [batch size, out dim]
        """
        # embedded = [batch size, sent len, emb dim]
        if self.finetune_bert:
            embedded = self.bert(text)[0]
        else:
            # Frozen encoder: do not track gradients through BERT
            with torch.no_grad():
                embedded = self.bert(text)[0]

        # hidden = [n layers * n directions, batch size, hid dim]
        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # The last two entries are the two directions of the top layer
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]

        # final_state = [batch size, hid dim * n directions]
        return self.out(self.dropout(final_state))


In [17]:
BATCH_SIZE = 32

# Run on the GPU whenever PyTorch reports one as available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Batches are bucketed and sorted by the number of tokens of each example
train_it, dev_it, test_it = data.BucketIterator.splits(
    (train_dataset, dev_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
)

## Train

In [11]:
import torch
from sklearn.metrics import accuracy_score, f1_score


def train(model, iterator, optimizer, criterion, scheduler, max_grad_norm=None,
          threshold=0.6):
    """
    Trains the model for one full epoch

    Arguments:
        model: module mapping a batch of token ids to logits of shape [batch size, 1]
        iterator: sized iterable of batches; each batch exposes `text` (a pair
            whose first element is the token id tensor) and `avg` (float scores).
            If it is a tqdm progress bar, its description is updated with the
            running loss, accuracy and learning rate
        optimizer: optimizer stepped once per batch
        criterion: loss taking (logits, float targets)
        scheduler: learning rate scheduler stepped once per batch
        max_grad_norm: if truthy, gradients are clipped to this norm
        threshold: a batch element is a positive example when `avg > threshold`

    Returns:
        (mean loss, mean accuracy) over the batches of the epoch
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.train()

    # Inspect the iterator that was actually passed in. The previous version
    # looked at a global `pbar`, which fails (or inspects the wrong object)
    # whenever this function is used outside the training cell.
    is_tqdm = type(iterator).__module__.split(".")[0] == "tqdm"

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        text, _ = batch.text

        # logits = [batch size]
        logits = model(text).squeeze(1)
        target = (batch.avg > threshold).float()

        loss = criterion(logits, target)
        loss.backward()

        # Gradient clipping
        if max_grad_norm:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()

        # Batch accuracy computed with torch: no per-batch sklearn round-trip
        # and always a plain Python float
        with torch.no_grad():
            preds = torch.round(torch.sigmoid(logits))
            acc = (preds == target).float().mean().item()

        epoch_loss += loss.item()
        epoch_acc += acc

        if is_tqdm:
            lr = optimizer.param_groups[0]["lr"]
            desc = f"Loss {epoch_loss / (i+1):.3f} -- Acc {epoch_acc / (i+1):.3f} -- LR {lr:.5f}"
            iterator.set_description(desc)

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    """
    Evaluates the model on the given iterator

    Each batch must expose `text` (a pair whose first element is the token id
    tensor) and `subtask_a` (the gold labels). Predictions are the sigmoid of
    the model output rounded to 0/1.

    Returns:
        (mean loss per batch, accuracy, macro F1, F1 of the positive class,
         F1 of the negative class)
    """
    model.eval()

    batch_losses, batch_probas, batch_labels = [], [], []
    with torch.no_grad():
        for batch in iterator:
            text, _ = batch.text
            gold = batch.subtask_a

            logits = model(text)
            batch_losses.append(criterion(logits.squeeze(1), gold.float()).item())
            batch_probas.append(torch.sigmoid(logits))
            batch_labels.append(gold.cpu())

        labels = torch.cat(batch_labels)
        preds = torch.round(torch.cat(batch_probas).cpu())

    # F1 of the negative class: flip both vectors and score again
    pos_f1 = f1_score(labels, preds)
    neg_f1 = f1_score(1 - labels, 1 - preds)
    acc = accuracy_score(labels, preds)
    avg_f1 = (pos_f1 + neg_f1) / 2

    return sum(batch_losses) / len(iterator), acc, avg_f1, pos_f1, neg_f1


Compute the class weights 

In [12]:
from sklearn.utils import compute_class_weight, compute_sample_weight
import numpy as np

# Binary gold labels of the dev split: 1 for OFF, 0 for NOT
y = [int(elem.subtask_a == 'OFF') for elem in dev_dataset]

# `classes` and `y` are passed by keyword (and `classes` as an ndarray):
# recent scikit-learn releases reject the positional form, while the keyword
# form is accepted by old and new releases alike.
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)

# Normalize so that the negative class has weight 1; class_weights[1] is then
# the relative weight of the positive (OFF) class
class_weights = class_weights / class_weights[0]

class_weights[1]

2.0090909090909093

In [13]:
import torch.optim as optim
from transformers import AdamW
from offenseval.models import BERTGRUSequenceClassifier

HIDDEN_DIM = 256
OUTPUT_DIM = 1
DROPOUT = 0.25


def count_parameters(model):
    """Return the number of parameters of `model` that require gradients."""
    # Defined here because no other cell of the notebook provides it, so the
    # print below failed with a NameError on a fresh kernel.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# BERT is fine-tuned together with the GRU and the output layer
model = BERTGRUSequenceClassifier(
    bert,
    finetune_bert=True,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT)

print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = AdamW(model.parameters(), lr=1e-5)
# Positive examples are weighted by the relative weight of the OFF class
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([class_weights[1]]))


model = model.to(device)
criterion = criterion.to(device)


The model has 168,144,641 trainable parameters


In [14]:
from tqdm.notebook import tqdm
from transformers.optimization import get_constant_schedule_with_warmup
import time

# Training driver: runs N_EPOCHS epochs, evaluates on the dev iterator after each
# one and checkpoints the model whenever the validation loss improves.
N_EPOCHS = 5

# Warm-up covers the first epoch; the learning rate is constant afterwards.
# Copied from transformers page...please check
num_training_steps = N_EPOCHS * len(train_it)
num_warmup_steps = len(train_it)
# NOTE(review): this value is 1 / N_EPOCHS = 0.2 (not 0.1) and is not used by
# any visible cell
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)

scheduler = get_constant_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, 
) 


best_valid_loss = float('inf')

# NOTE(review): with a tolerance of 100 and N_EPOCHS = 5 the early stopping
# branch below cannot trigger; the recorded "Early stopping" output presumably
# comes from a run with a smaller tolerance -- TODO confirm
early_stopping_tolerance = 100
epochs_without_improvement = 0
max_grad_norm = 1.0

# Checkpoint of the best model so far; reloaded by the evaluation cells
model_path = f"/tmp/bert_model.pt"

pbar = tqdm(range(N_EPOCHS), ncols=1000)
for epoch in pbar:

    # One progress bar per epoch over the training batches
    epoch_bar = tqdm(train_it, ncols=800)

    train_loss, train_acc = train(
        model, epoch_bar, optimizer, criterion,
        max_grad_norm=max_grad_norm, scheduler=scheduler
    )
    valid_loss, valid_acc, valid_f1, pos_f1, neg_f1 = evaluate(model, dev_it, criterion)
    
    desc = f'Train: Loss: {train_loss:.3f} Acc: {train_acc*100:.2f}%'
    desc += f'\nVal. Loss: {valid_loss:.3f} Acc: {valid_acc*100:.2f}% Macro F1 {valid_f1:.3f} (P {pos_f1:.3f} - N {neg_f1:.3f})'
    pbar.set_description(desc)
    print(desc)
    # Model selection is driven by the validation loss, not by F1 or accuracy
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), model_path)
        print(f"Best model so far (Loss {best_valid_loss:.3f} - Acc {valid_acc:.3f}, F1 {valid_f1:.3f}) saved at {model_path}")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= early_stopping_tolerance:
            print("Early stopping")
            break

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.238 Acc: 93.66%
Val. Loss: 1.507 Acc: 80.28% Macro F1 0.757 (P 0.651 - N 0.863)
Best model so far (Loss 1.507 - Acc 0.803, F1 0.757) saved at /tmp/bert_model.pt


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.118 Acc: 97.45%
Val. Loss: 1.641 Acc: 80.83% Macro F1 0.767 (P 0.668 - N 0.865)


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.082 Acc: 98.28%
Val. Loss: 1.988 Acc: 80.34% Macro F1 0.753 (P 0.642 - N 0.864)


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.060 Acc: 98.84%
Val. Loss: 2.116 Acc: 80.69% Macro F1 0.759 (P 0.652 - N 0.866)


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=2837.0), HTML(value='')), layout=Layout(d…


Train: Loss: 0.042 Acc: 99.25%
Val. Loss: 2.508 Acc: 80.26% Macro F1 0.748 (P 0.630 - N 0.865)
Early stopping


In [16]:
# Go back to the best checkpoint (lowest validation loss) and score the dev split
model.load_state_dict(torch.load(model_path))

dev_metrics = evaluate(model, dev_it, criterion)
loss, acc, f1, pos_f1, neg_f1 = dev_metrics

print(f'Val Loss: {loss:.3f}  Acc: {acc*100:.2f}% Macro F1: {f1:.3f} Pos F1 {pos_f1:.3f} Neg F1 {neg_f1:.3f}')

Val Loss: 1.507  Acc: 80.28% Macro F1: 0.757 Pos F1 0.651 Neg F1 0.863


In [18]:
# Score the test split with the best checkpoint
model.load_state_dict(torch.load(model_path))

test_metrics = evaluate(model, test_it, criterion)
loss, acc, f1, pos_f1, neg_f1 = test_metrics

print(f'Test Loss: {loss:.3f}  Acc: {acc*100:.2f}% Macro F1: {f1:.3f} Pos F1 {pos_f1:.3f} Neg F1 {neg_f1:.3f}')

Test Loss: 1.241  Acc: 83.60% Macro F1: 0.770 Pos F1 0.647 Neg F1 0.893


In [19]:
# Persist the weights currently held by `model` (the checkpoint reloaded from
# model_path by the evaluation cells) outside of /tmp
torch.save(model.state_dict(), "../models/bert.en-sample.ft.mean06.pt")

## Error Analysis

I reload the dataframe because it is not possible to retrieve the original text from the examples :-\ (they are already tokenized)

In [30]:
# Raw test file, indexed by its first column so the original text can be looked up by id
df_test = pd.read_table("../data/olid/test_a.tsv", index_col=0)

df_test.columns

Index(['text', 'subtask_a'], dtype='object')

In [31]:
# Release the GPU memory cached by PyTorch before running inference on the test split
torch.cuda.empty_cache()

In [77]:
# Probability above which a tweet is predicted as offensive.
# Deliberately much lower than 0.5 to improve recall.
OFFENSIVE_THRESHOLD = 0.10

model.eval()

rows = []
# Pure inference: without no_grad() the autograd graph of the whole
# (fine-tuned) BERT would be built for every batch, wasting GPU memory
with torch.no_grad():
    for batch in tqdm(test_it):
        text, _ = batch.text

        # probs = [batch size]
        probs = torch.sigmoid(model(text)).squeeze(1).cpu()
        preds = probs > OFFENSIVE_THRESHOLD
        ids = batch.id.cpu()
        labels = batch.subtask_a.cpu()

        for tid, label, pred, prob in zip(
            ids.tolist(), labels.tolist(), preds.tolist(), probs.tolist()
        ):
            rows.append({
                "id": tid,
                # The examples are already tokenized: fetch the raw text by id
                "text": df_test.loc[tid, "text"],
                "label": label,
                "pred": int(pred),
                "prob": prob
            })

# One row per test tweet, indexed by tweet id
df = pd.DataFrame(rows).set_index("id")

HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




In [78]:
# Metrics of the thresholded predictions stored in `df`
labels, preds = df["label"], df["pred"]

# F1 of the negative class: flip both vectors and score again
pos_f1, neg_f1 = f1_score(labels, preds), f1_score(1-labels, 1-preds)
avg_f1 = (pos_f1 + neg_f1) / 2
acc = accuracy_score(labels, preds)

acc, pos_f1, neg_f1, avg_f1

(0.8383720930232558,
 0.6759906759906759,
 0.8923315259488769,
 0.7841611009697764)

In [75]:
# Boolean masks for gold labels and predictions
gold_off, gold_not_off = df["label"] == 1, df["label"] == 0
pred_off, pred_not_off = df["pred"] == 1, df["pred"] == 0

true_pos = df[gold_off & pred_off].copy()
true_neg = df[gold_not_off & pred_not_off].copy()
# Most confident mistakes first: lowest probability for the misses,
# highest probability for the false alarms
false_neg = df[gold_off & pred_not_off].copy().sort_values("prob", ascending=True)
false_pos = df[gold_not_off & pred_off].copy().sort_values("prob", ascending=False)

# Rows are the gold classes, columns the predicted ones
conf_matrix = pd.DataFrame(
    {
        "pred not off": [len(true_neg), len(false_neg)],
        "pred off": [len(false_pos), len(true_pos)],
    },
    index=pd.Index(["not off", "off"], name="real"),
)

print("Falsos negativos: {}".format(len(false_neg)))
print("Falsos positivos: {}".format(len(false_pos)))

conf_matrix

Falsos negativos: 93
Falsos positivos: 45


Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,575,45
off,93,147


Normalized 

In [76]:
# Show floats with three decimals from here on
pd.set_option("display.float_format", '{:,.3f}'.format)

# Each cell as a fraction of all the test tweets
conf_matrix.div(len(df))

Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,0.669,0.052
off,0.108,0.171


We should improve recall in offensive tweets

In [23]:
# Full text of the first false negative (false_neg is sorted by ascending probability)
false_neg.iloc[0]["text"]

'@USER #StrataData #AI #data #hairball: @USER TDavis touring the application of #Cognitive and #machinelearning techniques to speed #Governance of #data URL'

Is this really offensive?

In [24]:
# Whole row (text, gold label, prediction, probability) of that same false negative
false_neg.iloc[0]

text     @USER #StrataData #AI #data #hairball: @USER T...
label                                                    1
pred                                                     0
prob                                                 0.001
Name: 22728, dtype: object

Is there any error?

In [25]:
# `df_dev` was never created by any cell of the notebook, so this lookup failed
# with a NameError on a fresh kernel. Load the raw dev file the same way as df_test.
df_dev = pd.read_table("../data/olid/olid-training-v1.0.tsv", index_col=0)

# NOTE(review): 79079 is a hard-coded id from the dev file, while the rows inspected
# above come from the test split -- TODO confirm this is still the intended example
df_dev.loc[79079]

tweet        @USER I have you and Kaze- I need more.
subtask_a                                        OFF
subtask_b                                        TIN
subtask_c                                        IND
Name: 79079, dtype: object

In [26]:
# Do not truncate the tweets when displaying them
pd.set_option("display.max_colwidth", None)

# The 30 false negatives with the lowest predicted probability
false_neg.head(30)

Unnamed: 0_level_0,text,label,pred,prob
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22728,@USER #StrataData #AI #data #hairball: @USER TDavis touring the application of #Cognitive and #machinelearning techniques to speed #Governance of #data URL,1,0,0.001
63235,#HanoiJane Fonda Campaigns for Gun Control Measure; Will it Backfire? - Liberty Park Press URL,1,0,0.001
21970,"Judicial Watch Sues for Records of FBI 302 Interviews with Demoted Justice Department Official Bruce Ohr"" URL #TCOT #MAGA #RedNationRising""",1,0,0.001
45266,@USER @USER True but the constant talk from the left about common sense gun control every time and yet not one valid proposal,1,0,0.001
30890,#VA10 voters know @USER is the right side of the gun issue. Gun control or gun rights? The answer may help determine whether NRA-financed Rep. Comstock wins reelection URL,1,0,0.001
28037,"Thinking about the Student-Loan Crisis"" URL #TCOT #MAGA #RedNationRising""",1,0,0.001
59539,@USER @USER in every conner coz anti pakistani will not able to attack on our future generation rights,1,0,0.001
38764,@USER @USER The right has the unique ability to create monsters out of thin air. It Legislation was even created to fight the phantom menance of antifa--currently in committee. URL,1,0,0.001
65778,@USER @USER Indeed Americans need more gun control which means hitting your target. Can never have to much range time. ☺😊,1,0,0.001
12084,@USER Over 30 years worth of background checks under different Presidents and this one slipped through? #KavanaughConfirmation #KavanaughWithdraw #KavanaughSCOTUS #maga,1,0,0.001
