## Classification with Multilingual BERT

Let's first train a model in English

In [1]:
import torch
from transformers import BertTokenizer
import pandas as pd



In [2]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

# Trying to cut this down to check if this improves memory usage
max_input_length = 128 #tokenizer.max_len_single_sentence

In [3]:
from torchtext import data

def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length]
    return tokens

TEXT = data.Field(
    #tokenize = 'spacy', 
    #tokenizer_language="es",
    tokenize=tokenize_and_cut,
    include_lengths = True,
    use_vocab=False,
    batch_first = True,
    preprocessing = tokenizer.convert_tokens_to_ids,
    init_token = init_token_idx,
    eos_token = eos_token_idx,
    pad_token = pad_token_idx,
    unk_token = unk_token_idx
)


In [4]:
ID = data.Field(sequential=False, use_vocab=False)
# All these arguments are because these are really floats
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
AVG = data.LabelField(dtype = torch.float, use_vocab=False, preprocessing=float)
STD = data.LabelField(dtype = torch.float, use_vocab=False, preprocessing=float)
SUBTASK_A = data.LabelField()

train_dataset = data.TabularDataset(
    "../data/English/task_a_distant.sample.tsv",
    format="tsv", skip_header=True,
    fields=[("id", ID), ("text", TEXT), ("avg", AVG), ("std", STD)],
)

dev_dataset = data.TabularDataset(
    "../data/olid/olid-training-v1.0.tsv",
    format="tsv", skip_header=True,
    fields=[("id", ID), ("text", TEXT), 
            ("subtask_a", SUBTASK_A), ("subtask_b", None), ("subtask_c", None)],
    
)

print(f"Train instances: {len(train_dataset)}")

print(f"Dev   instances: {len(dev_dataset)}")

Train instances: 90754
Dev   instances: 13240


Build vocabulary for label field. Can we just say 0 -> NOT (Offensive) 1 -> OFF? 

I don't know, just to make sure add the assertion

In [5]:
SUBTASK_A.build_vocab(dev_dataset)

assert SUBTASK_A.vocab.itos == ["NOT", "OFF"]

In [6]:
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-multilingual-uncased')

In [7]:
tokens = tokenizer.tokenize("¿Lo creerás, Ariadna? dijo Teseo. El minotauro apenas se defendió.")

token_ids = torch.LongTensor(tokenizer.convert_tokens_to_ids(tokens))
# I need to reshape it before BERT consumes it
# (batch_len, seq_len)... in this case batch_len == 1
print(tokenizer.convert_ids_to_tokens(token_ids))
last_hidden, clf = bert(token_ids.view(1, -1))

last_hidden.shape, clf.shape

['¿', 'lo', 'creer', '##as', ',', 'aria', '##dna', '?', 'dijo', 'tese', '##o', '.', 'el', 'mino', '##tau', '##ro', 'apenas', 'se', 'defend', '##io', '.']


(torch.Size([1, 21, 768]), torch.Size([1, 768]))

## Model


In [8]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers=1, 
                 bidirectional=False,
                 finetune_bert=False,
                 dropout=0.2):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.finetune_bert = finetune_bert
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
    
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):    
        #text = [batch size, sent len]
        
        if not self.finetune_bert:
            with torch.no_grad():
                embedded = self.bert(text)[0]
            #embedded = [batch size, sent len, emb dim]
        else:
            embedded = self.bert(text)[0]
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [9]:
#for name, param in model.named_parameters():                
#    if name.startswith('bert'):
#        param.requires_grad = False
        
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)



In [10]:
BATCH_SIZE = 32


device = "cuda" if torch.cuda.is_available() else "cpu"

train_it, dev_it = data.BucketIterator.splits(
    (train_dataset, dev_dataset), batch_size=BATCH_SIZE, device=device,
    sort_key = lambda x: len(x.text), sort_within_batch = True,
)

## Train

In [49]:
import torch
from sklearn.metrics import accuracy_score, f1_score


def train(model, iterator, optimizer, criterion, max_grad_norm=None):
    """
    Trains the model for one full epoch
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        text, lens = batch.text

        predictions = model(text)
        
        target = 1.0 * (batch.avg > 0.6) 
        
        loss = criterion(predictions.squeeze(1), target)
        
        prob_predictions = torch.sigmoid(predictions)
        preds = torch.round(prob_predictions).detach().cpu()
        acc = accuracy_score(preds, target.cpu())

        loss.backward()
        # Gradient clipping
        if max_grad_norm:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    """
    Evaluates the model on the given iterator
    """
    epoch_loss = 0
    model.eval()
    with torch.no_grad():
        predicted_probas = []
        labels = []
        for batch in iterator:
            text, lens = batch.text
            target = batch.subtask_a
            
            predictions = model(text)
            loss = criterion(predictions.squeeze(1), target.float())
            
            prob_predictions = torch.sigmoid(predictions)

            predicted_probas.append(prob_predictions)
            labels.append(target.cpu())

            epoch_loss += loss.item()

        predicted_probas = torch.cat(predicted_probas).cpu()
        labels = torch.cat(labels).cpu()

        preds = torch.round(predicted_probas)

        pos_f1 = f1_score(labels, preds)
        neg_f1 = f1_score(1-labels, 1-preds)
        avg_f1 = (pos_f1 + neg_f1) / 2
        acc = accuracy_score(labels, preds)

    return epoch_loss / len(iterator), acc, avg_f1, pos_f1, neg_f1


Compute the class weights 

In [50]:
from sklearn.utils import compute_class_weight, compute_sample_weight
y = []
for elem in dev_dataset:
    y.append(1*(elem.subtask_a == 'OFF'))
    
class_weights = compute_class_weight('balanced', [0, 1], y)

class_weights

array([0.74886878, 1.50454545])

In [51]:
import torch.optim as optim

HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
DROPOUT = 0.25

model = BERTGRUSentiment(
    bert,
    finetune_bert=False,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT)

print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([class_weights[1]]))
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1)
model = model.to(device)
criterion = criterion.to(device)


The model has 168,144,641 trainable parameters


In [52]:
from tqdm.notebook import tqdm
import time

N_EPOCHS = 12

best_valid_loss = float('inf')

early_stopping_tolerance = 4
epochs_without_improvement = 0
max_grad_norm = 1.0

model_path = f"/tmp/bert_model.pt"

pbar = tqdm(range(N_EPOCHS), ncols=1000)
for epoch in pbar:
    epoch_bar = tqdm(train_it)
    
    train_loss, train_acc = train(
        model, epoch_bar, optimizer, criterion,
        max_grad_norm=max_grad_norm
    )
    valid_loss, valid_acc, valid_f1, pos_f1, neg_f1 = evaluate(model, dev_it, criterion)
    scheduler.step(valid_loss)
    
    desc = f'Train: Loss: {train_loss:.3f} Acc: {train_acc*100:.2f}%'
    desc += f'\nVal. Loss: {valid_loss:.3f} Acc: {valid_acc*100:.2f}% F1 {valid_f1:.3f}'
    pbar.set_description(desc)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), model_path)
        print(f"Best model so far (Loss {best_valid_loss:.3f} - Acc {valid_acc:.3f}, F1 {valid_f1:.3f}) saved at {model_path}")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= early_stopping_tolerance:
            print("Early stopping")
            break

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…

HBox(children=(FloatProgress(value=0.0, max=2837.0), HTML(value='')))


Best model so far (Loss 0.980 - Acc 0.790, F1 0.729) saved at /tmp/bert_model.pt


HBox(children=(FloatProgress(value=0.0, max=2837.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2837.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2837.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2837.0), HTML(value='')))

KeyboardInterrupt: 

In [54]:
model.load_state_dict(torch.load(model_path))

loss, acc, f1, pos_f1, neg_f1 = evaluate(model, dev_it, criterion)

print(f'Val Loss: {loss:.3f}  Acc: {acc*100:.2f}% Macro F1: {f1:.3f} Pos F1 {pos_f1:.3f} Neg F1 {neg_f1:.3f}')

Val Loss: 0.980  Acc: 78.96% Macro F1: 0.729 Pos F1 0.600 Neg F1 0.857


In [55]:
torch.save(model.state_dict(), "../models/bert.en-sample.no-ft.mean06.pt")

## Error Analysis

I reload the dataframe because it is not possible to retrieve the original text from the examples :-\ (they are already tokenized)

In [56]:
df_dev = pd.read_table("../data/olid/olid-training-v1.0.tsv", index_col=0)

df_dev.columns

Index(['tweet', 'subtask_a', 'subtask_b', 'subtask_c'], dtype='object')

In [57]:
model.eval()

rows = []
for batch in tqdm(dev_it):
    text, lens = batch.text
    
    outs = model(text)
    ids = batch.id
    labels = batch.subtask_a
    probs = torch.sigmoid(outs)
    preds = torch.round(probs)
    
    for tid, label, pred, prob in zip(ids.cpu(), labels.cpu(), preds.cpu(), probs.cpu()):
        rows.append({
            "id": tid.item(),
            "text": df_dev.loc[tid.item(), "tweet"],
            "label": label.item(),
            "pred": int(pred.item()),
            "prob": prob.item()
        })
        
df = pd.DataFrame(rows)
df.set_index("id", inplace=True)

HBox(children=(FloatProgress(value=0.0, max=414.0), HTML(value='')))




In [58]:
true_pos = df[(df["label"] == 1) & (df["pred"] == 1)].copy()
false_neg = df[(df["label"] == 1) & (df["pred"] == 0)].copy()
false_neg.sort_values("prob", ascending=True, inplace=True)
false_pos = df[(df["label"] == 0) & (df["pred"] == 1)].copy()
false_pos.sort_values("prob", ascending=False, inplace=True)
true_neg = df[(df["label"] == 0) & (df["pred"] == 0)].copy()

conf_matrix = pd.DataFrame([
    {"real":"not off", "pred not off": len(true_neg),      "pred off": len(false_pos)},
    {"real":"off",     "pred not off":     len(false_neg), "pred off": len(true_pos)}
])

conf_matrix.set_index("real", inplace=True)

print("Falsos negativos: {}".format(len(false_neg)))
print("Falsos positivos: {}".format(len(false_pos)))

conf_matrix

Falsos negativos: 2311
Falsos positivos: 475


Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,8365,475
off,2311,2089


Normalized 

In [59]:
pd.options.display.float_format = '{:,.3f}'.format

conf_matrix / len(df)

Unnamed: 0_level_0,pred not off,pred off
real,Unnamed: 1_level_1,Unnamed: 2_level_1
not off,0.632,0.036
off,0.175,0.158


Terrible recall for offensive tweets. We've got to improve this

In [60]:
false_neg.iloc[0]["text"]

'@USER @USER @USER @USER You ignored my question, which is crucial to the argument. Once sensible" gun control is imposed and another shooting occurs, do you truly believe they will not push for further restrictions?"'

Is this really offensive?

In [61]:
false_neg.iloc[0]

text     @USER @USER @USER @USER You ignored my questio...
label                                                    1
pred                                                     0
prob                                                 0.000
Name: 80706, dtype: object

Is there any error?

In [62]:
df_dev.loc[79079]

tweet        @USER I have you and Kaze- I need more.
subtask_a                                        OFF
subtask_b                                        TIN
subtask_c                                        IND
Name: 79079, dtype: object

In [63]:
pd.set_option('max_colwidth', None)
false_neg.iloc[:30]

Unnamed: 0_level_0,text,label,pred,prob
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
80706,"@USER @USER @USER @USER You ignored my question, which is crucial to the argument. Once sensible"" gun control is imposed and another shooting occurs, do you truly believe they will not push for further restrictions?""",1,0,0.0
45781,Sean Penn: #MeToo Movement Seeks to ‘Divide Men and Women’ URL &lt;— Gotta say… I stand w/ Jeff Spicoli on this one. URL #MAGA #WWG1WGA #ConfirmKavanaugh,1,0,0.0
31527,@USER He is quite good at faking. Must have done it numerous times,1,0,0.0
70844,@USER I was think she jisoo but I’m Focused I found she is taeyoun,1,0,0.0
46424,"@USER @USER @USER You should.have tried more, perhaps a BJP or RSS connection could have been fixed"".Masters would be happy Dhanya""",1,0,0.0
19303,"@USER Hallo please find the unroll here: Thread by @USER 1. Moore's failure to deal with the foul smear against him, is a good example of how the new rules could apply. The firs […]"" #FakeNews #MAGA URL See you soon. 🤖""",1,0,0.0
54306,"@USER @USER Hillary? Why is she on the social studies curriculum? Why? Who is she to be in any curriculum? She is nothing! I'm shocked that she was in any curriculum, but I am very happy to see that she is OUT!"" (I am so glad she's nowhere near the @USER",1,0,0.0
62156,@USER America will be better off once you retire Flakey Jeff Flake. Go away.......,1,0,0.0
19179,@USER @USER @USER @USER as of right now my only solid position is that the NRA has become far too influential in politics and as a result they have delayed the desire for gun control for about 20 years.,1,0,0.0
37694,@USER Ok but please understand that I am not one of dubakoor mumtaj dubakoor army. I only support her for certain things. I get what you are saying and I don’t understand how she has survived from up to this point. For me she is not fit for title winner.,1,0,0.0
