## Danish multilingual Analysis

In this notebook we will look at the errors that our model makes in zero-shot mode.

We will use a model trained on OLID

In [1]:
%load_ext autoreload
%autoreload 2
import os
from datetime import datetime
import fire
import torch
import pandas as pd
from torchtext import data
import torch.nn as nn
from transformers import (
    AdamW, BertForSequenceClassification, BertTokenizer,
    get_constant_schedule_with_warmup
)

from offenseval.nn import (
    Tokenizer,
    train, evaluate, train_cycle, save_model, load_model, evaluate_dataset
)
from offenseval.datasets import datasets

# Widen pandas' display limits so long tweets and larger tables are shown in full.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 300)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


Create fields and some other boilerplate

In [2]:
from offenseval.datasets import datasets, build_dataset
from transformers import BertModel, BertTokenizer

# Name of the pretrained checkpoint shared by the tokenizer and the encoder.
bert_name = "bert-base-multilingual-cased"

# Both objects are loaded from the same checkpoint so the token ids match
# the encoder's embedding table.
bert_tokenizer = BertTokenizer.from_pretrained(bert_name)
bert_model = BertModel.from_pretrained(bert_name)


In [3]:
# Ids of BERT's special tokens; they are handed to the TEXT field below so that
# torchtext wraps and pads every sequence with the ids BERT expects.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = (
    bert_tokenizer.cls_token_id,
    bert_tokenizer.sep_token_id,
    bert_tokenizer.pad_token_id,
    bert_tokenizer.unk_token_id,
)

# Project-level wrapper around the HuggingFace tokenizer.
tokenizer = Tokenizer(bert_tokenizer)

# Tweet identifier: a single non-sequential value that needs no vocabulary.
ID = data.Field(sequential=False, use_vocab=False)

# Classification label; its vocabulary is built later with `build_vocab`.
# NOTE(review): an earlier comment here pointed at
# https://github.com/pytorch/text/issues/78#issuecomment-541203609 (float-valued
# labels), but the field is constructed with default arguments.
SUBTASK_A = data.LabelField()

# Tweet text. Tokens are converted to ids in `preprocessing`, so no torchtext
# vocabulary is used; batches are batch-first and also carry sequence lengths.
text_field_options = dict(
    tokenize=tokenizer.tokenize,
    include_lengths=True,
    use_vocab=False,
    batch_first=True,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)
TEXT = data.Field(**text_field_options)

Get the predictions

In [4]:
from offenseval.datasets import datasets, build_dataset, build_examples

# Maps each column of the TSV files to an (attribute name, torchtext field) pair.
fields = {
    "id": ("id", ID),
    "tweet": ("text", TEXT),
    "subtask_a": ("subtask_a", SUBTASK_A),
}

# Load each language's training split and turn it into torchtext examples.
df_train_en = pd.read_table(datasets["olid"]["train"])
train_en_examples = build_examples(df_train_en, fields)

df_train_da = pd.read_table(datasets["danish"]["train"])
train_da_examples = build_examples(df_train_da, fields)

# NOTE(review): the Turkish examples are built here but are not added to the
# training set in the cells visible in this notebook.
df_train_tr = pd.read_table(datasets["turkish"]["train"])
train_tr_examples = build_examples(df_train_tr, fields)

# Disabled experiment: downsample English to the size of the Danish set.
#df_train_en = df_train_en.sample(df_train_da.shape[0])

# Report the size of every training split.
print(f"There are {df_train_en.shape[0]} English tweets")
print(f"There are {df_train_da.shape[0]} Danish tweets")
print(f"There are {df_train_tr.shape[0]} Turkish tweets")


There are 13240 English tweets
There are 2368 Danish tweets
There are 25021 Turkish tweets


In [5]:
# Joint training set: English (OLID) plus Danish examples.
examples = train_en_examples + train_da_examples

train_dataset = data.Dataset(examples, fields.values())
# Evaluation is done on the Danish development split only.
dev_dataset = build_dataset(datasets["danish"]["dev"], fields)

# Build the label vocabulary from the training split rather than the dev split:
# the label space should not depend on evaluation data, and a dev split that
# happened to miss a class would otherwise yield an incomplete vocabulary.
SUBTASK_A.build_vocab(train_dataset)

# Downstream code relies on index 0 meaning "NOT" and index 1 meaning "OFF".
assert SUBTASK_A.vocab.itos == ["NOT", "OFF"], (
    f"Unexpected label vocabulary: {SUBTASK_A.vocab.itos}"
)

In [6]:
print("Building iterators")

BATCH_SIZE = 16


def _example_length(example):
    """Return the length of an example's text, used to bucket similar lengths."""
    return len(example.text)


# Bucketing groups examples of similar length so each batch needs little padding;
# batches are also sorted internally by the same key.
train_it, dev_it = data.BucketIterator.splits(
    (train_dataset, dev_dataset),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=_example_length,
    sort_within_batch=True,
)

Building iterators


In [7]:
from offenseval.nn import create_criterion
from offenseval.nn.models import BertSeqModel

epochs = 10

# Classifier built on top of the pretrained multilingual BERT encoder.
model = BertSeqModel(bert_model, dropout=0.10)
model = model.to(device)

# NOTE(review): presumably derives class weights from the training labels —
# confirm in offenseval.nn.create_criterion.
criterion = create_criterion(device, weight_with=train_dataset)
optimizer = AdamW(model.parameters(), lr=1e-5)

# One scheduler step per training batch; warm up during the first tenth of them.
num_training_steps = epochs * len(train_it)
num_warmup_steps = num_training_steps // 10
warmup_proportion = num_warmup_steps / num_training_steps  # 0.1

# Learning rate ramps up during warmup and then stays constant.
scheduler = get_constant_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
)

Construct dataset for better visualization

In [8]:
from offenseval.nn import train_cycle

def get_target(batch):
    """Return the batch's `subtask_a` labels converted to a float64 tensor.

    Args:
        batch: an object exposing a `subtask_a` tensor attribute.

    Returns:
        The `subtask_a` tensor cast to double precision.
    """
    labels = batch.subtask_a
    return labels.to(torch.float64)

# Where the checkpoint for this English + Danish run is written
# (relative to the notebook's working directory).
output_path = "../../models/bert_cased.en+da.pt"

# Train on the joint training iterator and evaluate on the Danish dev iterator.
# NOTE(review): `early_stopping_tolerance` and `ncols` are interpreted by
# offenseval.nn.train_cycle — confirm their exact meaning there.
train_cycle(
    model,
    optimizer,
    criterion,
    scheduler,
    train_it,
    dev_it,
    epochs,
    get_target=get_target,
    model_path=output_path,
    early_stopping_tolerance=5,
    ncols=700,
)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=10.0), HTML(value='')), layout=Layout(dis…



Epoch 0


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=976.0), HTML(value='')), layout=Layout(di…

RuntimeError: CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 10.92 GiB total capacity; 3.83 GiB already allocated; 20.50 MiB free; 4.00 GiB reserved in total by PyTorch)