# English pretraining then Danish finetuning

In this notebook we finetune the model in English to then finetune it again in Danish

It has similar performance

In [1]:
%load_ext autoreload
%autoreload 2
import os
from datetime import datetime
import fire
import torch
import pandas as pd
from torchtext import data
import torch.nn as nn
from transformers import (
    AdamW, BertForSequenceClassification, BertTokenizer,
    get_constant_schedule_with_warmup
)

from offenseval.nn import (
    Tokenizer,
    train, evaluate, train_cycle, save_model, load_model, evaluate_dataset
)
from offenseval.datasets import datasets

pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 300

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Create fields and some other boilerplate

In [2]:
from offenseval.datasets import datasets, build_dataset
from transformers import BertModel, BertTokenizer

bert_name = "bert-base-multilingual-cased"
bert_model = BertModel.from_pretrained(bert_name)
bert_tokenizer = BertTokenizer.from_pretrained(bert_name)


In [3]:
init_token_idx = bert_tokenizer.cls_token_id
eos_token_idx = bert_tokenizer.sep_token_id
pad_token_idx = bert_tokenizer.pad_token_id
unk_token_idx = bert_tokenizer.unk_token_id

# Trying to cut this down to check if this improves memory usage

tokenizer = Tokenizer(bert_tokenizer)

ID = data.Field(sequential=False, use_vocab=False)
# All these arguments are because these are really floats
# See https://github.com/pytorch/text/issues/78#issuecomment-541203609
SUBTASK_A = data.LabelField()

TEXT = data.Field(
    tokenize=tokenizer.tokenize,
    include_lengths = True,
    use_vocab=False,
    batch_first = True,
    preprocessing = tokenizer.convert_tokens_to_ids,
    init_token = init_token_idx,
    eos_token = eos_token_idx,
    pad_token = pad_token_idx,
    unk_token = unk_token_idx
)

Get the predictions

In [4]:
from offenseval.datasets import datasets, build_dataset, build_examples

fields = {
    "id": ('id', ID),
    "text": ('text', TEXT),
    "subtask_a": ("subtask_a", SUBTASK_A)
}

train_en_dataset = build_dataset(datasets["olid"]["train"], fields)
dev_en_dataset = build_dataset(datasets["olid"]["dev"], fields)

train_da_dataset = build_dataset(datasets["danish"]["train"], fields)
dev_da_dataset = build_dataset(datasets["danish"]["dev"], fields)


SUBTASK_A.build_vocab(dev_en_dataset)
assert SUBTASK_A.vocab.itos == ["NOT", "OFF"]

In [5]:

print(f"There are {len(train_en_dataset)} English training tweets")
print(f"There are {len(dev_en_dataset)} English dev tweets")

print(f"There are {len(train_da_dataset)} Danish training tweets")
print(f"There are {len(dev_da_dataset)} Danish dev tweets")


There are 13240 English training tweets
There are 860 English dev tweets
There are 2368 Danish training tweets
There are 592 Danish dev tweets


In [6]:
print("Building iterators")

BATCH_SIZE = 32

train_en_it, dev_en_it = data.BucketIterator.splits(
    (train_en_dataset, dev_en_dataset), batch_size=BATCH_SIZE, device=device,
    sort_key = lambda x: len(x.text), sort_within_batch = True,
)


train_da_it, dev_da_it = data.BucketIterator.splits(
    (train_da_dataset, dev_da_dataset), batch_size=BATCH_SIZE, device=device,
    sort_key = lambda x: len(x.text), sort_within_batch = True,
)

Building iterators


In [7]:
from offenseval.nn import create_criterion
from offenseval.nn.models import BertSeqModel
from transformers import get_linear_schedule_with_warmup

model = BertSeqModel(bert_model, dropout=0.10).to(device)
epochs = 10

criterion = create_criterion(device, weight_with=train_en_dataset)
optimizer = AdamW(model.parameters(), lr=5e-5)

num_training_steps = epochs * len(train_en_it)
num_warmup_steps = num_training_steps // 10
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

Construct dataset for better visualization

In [8]:
from offenseval.nn import train_cycle

def get_target(batch):
    return batch.subtask_a.double()

output_path = "../../models/bert_cased.en.da.pt"

train_cycle(
    model, optimizer, criterion, scheduler, 
    train_en_it, dev_en_it, epochs, get_target=get_target, monitor="f1",
    model_path=output_path, early_stopping_tolerance=5, ncols=700
)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=10.0), HTML(value='')), layout=Layout(dis…



Epoch 0


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.791 Acc: 67.39%
Val.Loss: 0.629 Acc: 77.91% Macro F1 0.752 (P 0.669 - N 0.834)
Best model so far (Loss: 0.629 Acc: 77.91% Macro F1 0.752 (P 0.669 - N 0.834)) saved at ../../models/bert_cased.en.da.pt


Epoch 1


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.665 Acc: 77.14%
Val.Loss: 0.612 Acc: 78.37% Macro F1 0.756 (P 0.673 - N 0.839)
Best model so far (Loss: 0.612 Acc: 78.37% Macro F1 0.756 (P 0.673 - N 0.839)) saved at ../../models/bert_cased.en.da.pt


Epoch 2


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.535 Acc: 83.24%
Val.Loss: 0.593 Acc: 76.86% Macro F1 0.743 (P 0.661 - N 0.824)


Epoch 3


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.394 Acc: 89.30%
Val.Loss: 0.724 Acc: 81.05% Macro F1 0.768 (P 0.668 - N 0.867)
Best model so far (Loss: 0.724 Acc: 81.05% Macro F1 0.768 (P 0.668 - N 0.867)) saved at ../../models/bert_cased.en.da.pt


Epoch 4


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.287 Acc: 93.00%
Val.Loss: 0.950 Acc: 78.26% Macro F1 0.749 (P 0.658 - N 0.841)


Epoch 5


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.215 Acc: 95.68%
Val.Loss: 1.111 Acc: 79.65% Macro F1 0.759 (P 0.663 - N 0.854)


Epoch 6


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.164 Acc: 96.60%
Val.Loss: 1.248 Acc: 81.28% Macro F1 0.761 (P 0.649 - N 0.872)


Epoch 7


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.116 Acc: 97.97%
Val.Loss: 1.325 Acc: 83.02% Macro F1 0.784 (P 0.684 - N 0.884)
Best model so far (Loss: 1.325 Acc: 83.02% Macro F1 0.784 (P 0.684 - N 0.884)) saved at ../../models/bert_cased.en.da.pt


Epoch 8


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.085 Acc: 98.55%
Val.Loss: 1.452 Acc: 82.21% Macro F1 0.781 (P 0.687 - N 0.876)


Epoch 9


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=414.0), HTML(value='')), layout=Layout(di…


Train: Loss: 0.060 Acc: 98.92%
Val.Loss: 1.526 Acc: 82.33% Macro F1 0.778 (P 0.678 - N 0.878)


In [9]:
model.load_state_dict(torch.load(output_path))

report = evaluate(
    model, 
    dev_en_it, 
    criterion, 
    get_target=lambda batch: batch.subtask_a)

print(f'Val {report}')


Val Loss: 0.603 Acc: 83.49% Macro F1 0.791 (P 0.695 - N 0.887)


Now, train it in Danish

In [10]:
from offenseval.nn import create_criterion
from offenseval.nn.models import BertSeqModel
from transformers import get_linear_schedule_with_warmup

epochs = 10

criterion = create_criterion(device, weight_with=train_en_dataset)
optimizer = AdamW(model.parameters(), lr=2e-5)

num_training_steps = epochs * len(train_en_it)
num_warmup_steps = num_training_steps // 10
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

In [11]:
from offenseval.nn import train_cycle


train_cycle(
    model, optimizer, criterion, scheduler, 
    train_da_it, dev_da_it, epochs, get_target=get_target, monitor="f1",
    model_path=output_path, early_stopping_tolerance=5, ncols=700
)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=10.0), HTML(value='')), layout=Layout(dis…



Epoch 0


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.545 Acc: 85.26%
Val.Loss: 0.517 Acc: 88.34% Macro F1 0.691 (P 0.448 - N 0.935)
Best model so far (Loss: 0.517 Acc: 88.34% Macro F1 0.691 (P 0.448 - N 0.935)) saved at ../../models/bert_cased.en.da.pt


Epoch 1


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.483 Acc: 88.30%
Val.Loss: 0.480 Acc: 88.34% Macro F1 0.704 (P 0.473 - N 0.934)
Best model so far (Loss: 0.480 Acc: 88.34% Macro F1 0.704 (P 0.473 - N 0.934)) saved at ../../models/bert_cased.en.da.pt


Epoch 2


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.390 Acc: 90.88%
Val.Loss: 0.502 Acc: 90.20% Macro F1 0.735 (P 0.525 - N 0.945)
Best model so far (Loss: 0.502 Acc: 90.20% Macro F1 0.735 (P 0.525 - N 0.945)) saved at ../../models/bert_cased.en.da.pt


Epoch 3


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.290 Acc: 94.09%
Val.Loss: 0.539 Acc: 89.70% Macro F1 0.748 (P 0.555 - N 0.942)
Best model so far (Loss: 0.539 Acc: 89.70% Macro F1 0.748 (P 0.555 - N 0.942)) saved at ../../models/bert_cased.en.da.pt


Epoch 4


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.205 Acc: 96.07%
Val.Loss: 0.626 Acc: 89.19% Macro F1 0.750 (P 0.562 - N 0.938)
Best model so far (Loss: 0.626 Acc: 89.19% Macro F1 0.750 (P 0.562 - N 0.938)) saved at ../../models/bert_cased.en.da.pt


Epoch 5


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.127 Acc: 97.93%
Val.Loss: 0.706 Acc: 91.05% Macro F1 0.763 (P 0.576 - N 0.950)
Best model so far (Loss: 0.706 Acc: 91.05% Macro F1 0.763 (P 0.576 - N 0.950)) saved at ../../models/bert_cased.en.da.pt


Epoch 6


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.132 Acc: 98.10%
Val.Loss: 0.719 Acc: 90.88% Macro F1 0.763 (P 0.578 - N 0.949)
Best model so far (Loss: 0.719 Acc: 90.88% Macro F1 0.763 (P 0.578 - N 0.949)) saved at ../../models/bert_cased.en.da.pt


Epoch 7


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.097 Acc: 98.69%
Val.Loss: 0.799 Acc: 90.54% Macro F1 0.758 (P 0.569 - N 0.947)


Epoch 8


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.082 Acc: 98.73%
Val.Loss: 1.085 Acc: 85.64% Macro F1 0.718 (P 0.520 - N 0.916)


Epoch 9


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=74.0), HTML(value='')), layout=Layout(dis…


Train: Loss: 0.060 Acc: 99.07%
Val.Loss: 1.118 Acc: 90.20% Macro F1 0.746 (P 0.547 - N 0.945)


In [12]:
model.load_state_dict(torch.load(output_path))

report = evaluate(
    model, 
    dev_da_it, 
    criterion, 
    get_target=lambda batch: batch.subtask_a)

print(f'Val {report}')


Val Loss: 0.719 Acc: 90.88% Macro F1 0.763 (P 0.578 - N 0.949)
