# English pretraining then Danish finetuning

In this notebook we finetune the model in English to then finetune it again in Danish

It has similar performance

In [1]:
%load_ext autoreload
%autoreload 2
import os
from datetime import datetime
import fire
import torch
import pandas as pd
from torchtext import data
import torch.nn as nn
from transformers import (
    AdamW, BertForSequenceClassification, BertTokenizer,
    get_constant_schedule_with_warmup
)

from offenseval.nn import (
    Tokenizer,
    train, evaluate, train_cycle, save_model, load_model, evaluate_dataset
)
from offenseval.datasets import datasets

pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 300

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Create fields and some other boilerplate

In [2]:
from offenseval.datasets import datasets, build_dataset
from transformers import BertModel, BertTokenizer

bert_name = "bert-base-multilingual-cased"
bert_model = BertModel.from_pretrained(bert_name)
bert_tokenizer = BertTokenizer.from_pretrained(bert_name)


In [3]:
init_token_idx = bert_tokenizer.cls_token_id
eos_token_idx = bert_tokenizer.sep_token_id
pad_token_idx = bert_tokenizer.pad_token_id
unk_token_idx = bert_tokenizer.unk_token_id

# Trying to cut this down to check if this improves memory usage

tokenizer = Tokenizer(bert_tokenizer)

ID = data.Field(sequential=False, use_vocab=False)
# All these arguments are because these are really floats
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
SUBTASK_A = data.LabelField()

TEXT = data.Field(
    tokenize=tokenizer.tokenize,
    include_lengths = True,
    use_vocab=False,
    batch_first = True,
    preprocessing = tokenizer.convert_tokens_to_ids,
    init_token = init_token_idx,
    eos_token = eos_token_idx,
    pad_token = pad_token_idx,
    unk_token = unk_token_idx
)

Get the predictions

In [4]:
from offenseval.datasets import datasets, build_dataset, build_examples

fields = {
    "id": ('id', ID),
    "text": ('text', TEXT),
    "subtask_a": ("subtask_a", SUBTASK_A)
}

train_en_dataset = build_dataset(datasets["olid"]["train"], fields)
dev_en_dataset = build_dataset(datasets["olid"]["dev"], fields)

train_da_dataset = build_dataset(datasets["danish"]["train"], fields)
dev_da_dataset = build_dataset(datasets["danish"]["dev"], fields)


SUBTASK_A.build_vocab(dev_en_dataset)
assert SUBTASK_A.vocab.itos == ["NOT", "OFF"]

In [5]:

print(f"There are {len(train_en_dataset)} English training tweets")
print(f"There are {len(dev_en_dataset)} English dev tweets")

print(f"There are {len(train_da_dataset)} Danish training tweets")
print(f"There are {len(dev_da_dataset)} Danish dev tweets")


There are 13240 English training tweets
There are 860 English dev tweets
There are 2368 Danish training tweets
There are 592 Danish dev tweets


In [6]:
print("Building iterators")

BATCH_SIZE = 64

train_en_it, dev_en_it = data.BucketIterator.splits(
    (train_en_dataset, dev_en_dataset), batch_size=BATCH_SIZE, device=device,
    sort_key = lambda x: len(x.text), sort_within_batch = True,
)


train_da_it, dev_da_it = data.BucketIterator.splits(
    (train_da_dataset, dev_da_dataset), batch_size=BATCH_SIZE, device=device,
    sort_key = lambda x: len(x.text), sort_within_batch = True,
)

Building iterators


In [7]:
from offenseval.nn import create_criterion
from offenseval.nn.models import BertSeqModel
from transformers import get_linear_schedule_with_warmup

model = BertSeqModel(bert_model, dropout=0.10).to(device)
epochs = 10

criterion = create_criterion(device, weight_with=train_en_dataset)
optimizer = AdamW(model.parameters(), lr=2.5e-5)

num_training_steps = epochs * len(train_en_it)
num_warmup_steps = num_training_steps // 10
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

Construct dataset for better visualization

In [8]:
from offenseval.nn import train_cycle

def get_target(batch):
    return batch.subtask_a.double()

output_path = "../../models/bert_cased.en.da.pt"

train_cycle(
    model, optimizer, criterion, scheduler, 
    train_en_it, dev_en_it, epochs, get_target=get_target, monitor="f1",
    model_path=output_path, early_stopping_tolerance=5, ncols=700
)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=10.0), HTML(value='')), layout=Layout(dis…



Epoch 0


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=207.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.825 Acc: 64.67%
Val.Loss: 0.629 Acc: 79.42% Macro F1 0.756 (P 0.660 - N 0.852)
Best model so far (Loss: 0.629 Acc: 79.42% Macro F1 0.756 (P 0.660 - N 0.852)) saved at ../../models/bert_cased.en.da.pt


Epoch 1


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=207.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.656 Acc: 77.02%
Val.Loss: 0.602 Acc: 80.35% Macro F1 0.773 (P 0.691 - N 0.856)
Best model so far (Loss: 0.602 Acc: 80.35% Macro F1 0.773 (P 0.691 - N 0.856)) saved at ../../models/bert_cased.en.da.pt


Epoch 2


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=207.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.537 Acc: 82.03%
Val.Loss: 0.584 Acc: 83.26% Macro F1 0.797 (P 0.711 - N 0.882)
Best model so far (Loss: 0.584 Acc: 83.26% Macro F1 0.797 (P 0.711 - N 0.882)) saved at ../../models/bert_cased.en.da.pt


Epoch 3


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=207.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.413 Acc: 87.55%
Val.Loss: 0.669 Acc: 82.67% Macro F1 0.787 (P 0.695 - N 0.879)


Epoch 4


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=207.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.305 Acc: 91.43%
Val.Loss: 0.732 Acc: 80.93% Macro F1 0.775 (P 0.687 - N 0.863)


Epoch 5


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=207.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.233 Acc: 93.73%
Val.Loss: 0.950 Acc: 81.63% Macro F1 0.772 (P 0.672 - N 0.872)


Epoch 6


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=207.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.181 Acc: 95.70%
Val.Loss: 1.145 Acc: 82.56% Macro F1 0.772 (P 0.662 - N 0.882)


Epoch 7


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=207.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.130 Acc: 96.83%
Val.Loss: 1.218 Acc: 81.74% Macro F1 0.763 (P 0.650 - N 0.876)
Early stopping


In [10]:
model.load_state_dict(torch.load(output_path))

report = evaluate(
    model, 
    dev_en_it, 
    criterion, 
    get_target=lambda batch: batch.subtask_a)

print(f'Val {report}')


Val Loss: 0.584 Acc: 83.26% Macro F1 0.797 (P 0.711 - N 0.882)


Now, train it in Danish

In [11]:
from offenseval.nn import create_criterion
from offenseval.nn.models import BertSeqModel
from transformers import get_linear_schedule_with_warmup

epochs = 10

criterion = create_criterion(device, weight_with=train_en_dataset)
optimizer = AdamW(model.parameters(), lr=1e-5)

num_training_steps = epochs * len(train_en_it)
num_warmup_steps = num_training_steps // 10
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

In [12]:
from offenseval.nn import train_cycle


train_cycle(
    model, optimizer, criterion, scheduler, 
    train_da_it, dev_da_it, epochs, get_target=get_target, monitor="f1",
    model_path=output_path, early_stopping_tolerance=5, ncols=700
)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=10.0), HTML(value='')), layout=Layout(dis…



Epoch 0


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.546 Acc: 82.81%
Val.Loss: 0.552 Acc: 87.16% Macro F1 0.672 (P 0.415 - N 0.928)
Best model so far (Loss: 0.552 Acc: 87.16% Macro F1 0.672 (P 0.415 - N 0.928)) saved at ../../models/bert_cased.en.da.pt


Epoch 1


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.498 Acc: 87.50%
Val.Loss: 0.516 Acc: 88.34% Macro F1 0.700 (P 0.465 - N 0.935)
Best model so far (Loss: 0.516 Acc: 88.34% Macro F1 0.700 (P 0.465 - N 0.935)) saved at ../../models/bert_cased.en.da.pt


Epoch 2


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.435 Acc: 89.86%
Val.Loss: 0.479 Acc: 88.51% Macro F1 0.718 (P 0.500 - N 0.935)
Best model so far (Loss: 0.479 Acc: 88.51% Macro F1 0.718 (P 0.500 - N 0.935)) saved at ../../models/bert_cased.en.da.pt


Epoch 3


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.351 Acc: 91.68%
Val.Loss: 0.479 Acc: 86.32% Macro F1 0.728 (P 0.537 - N 0.920)
Best model so far (Loss: 0.479 Acc: 86.32% Macro F1 0.728 (P 0.537 - N 0.920)) saved at ../../models/bert_cased.en.da.pt


Epoch 4


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.278 Acc: 93.16%
Val.Loss: 0.468 Acc: 91.05% Macro F1 0.787 (P 0.624 - N 0.949)
Best model so far (Loss: 0.468 Acc: 91.05% Macro F1 0.787 (P 0.624 - N 0.949)) saved at ../../models/bert_cased.en.da.pt


Epoch 5


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.215 Acc: 94.89%
Val.Loss: 0.522 Acc: 90.20% Macro F1 0.776 (P 0.608 - N 0.944)


Epoch 6


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.168 Acc: 96.49%
Val.Loss: 0.585 Acc: 89.36% Macro F1 0.771 (P 0.604 - N 0.939)


Epoch 7


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.118 Acc: 97.34%
Val.Loss: 0.661 Acc: 90.71% Macro F1 0.770 (P 0.593 - N 0.948)


Epoch 8


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.078 Acc: 98.35%
Val.Loss: 0.816 Acc: 90.88% Macro F1 0.757 (P 0.565 - N 0.949)


Epoch 9


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.065 Acc: 98.61%
Val.Loss: 0.790 Acc: 90.37% Macro F1 0.771 (P 0.596 - N 0.945)
Early stopping


In [13]:
model.load_state_dict(torch.load(output_path))

report = evaluate(
    model, 
    dev_da_it, 
    criterion, 
    get_target=lambda batch: batch.subtask_a)

print(f'Val {report}')


Val Loss: 0.468 Acc: 91.05% Macro F1 0.787 (P 0.624 - N 0.949)
