## BERT Cased trained on the distant dataset: multilingual test

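Let's load a BERT model trained on English data and evaluate it on the English test set as well as on the development sets of the other languages (Danish, Turkish, Arabic and Greek).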


| Language    | Accuracy   | Macro F1   | Pos F1    |
|-------------|------------|------------|-----------|
|  English    |  0.823     |   0.783    |    0.690  |
|  Danish     |  0.877     |   0.682    |    0.434  |
|  Turkish    |  0.800     |   0.506    |    0.125  |
|  Arabic     |  0.822     |   0.482    |    0.063  |
|  Greek      |  0.719     |   0.475    |    0.118  |


In [1]:
%load_ext autoreload
%autoreload 2
import os
from datetime import datetime
import fire
import torch
from torchtext import data
import torch.nn as nn
from transformers import (
    AdamW, BertForSequenceClassification, BertTokenizer,
    get_constant_schedule_with_warmup
)

from offenseval.nn import (
    Tokenizer,
    train, evaluate, train_cycle, save_model, load_model
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Restore the saved checkpoint on the selected device. `load_model` returns
# the model together with `TEXT`, which is later handed to `evaluate_dataset`
# (presumably the text-preprocessing field -- confirm against offenseval.nn).
model, TEXT = load_model(
    "../models/bert_cased.en_sample.pt",
    device,
)

In [2]:
import pandas as pd
from offenseval.nn import evaluate_dataset

# Evaluation file for each language, keyed by the label shown in the results
# table. English uses the OLID test file; the other entries point at dev files.
datasets = {
    "English": "../data/olid/test_a.tsv",
    "Danish": "../data/Danish/dev.tsv",
    "Turkish": "../data/Turkish/dev.tsv",
    "Arabic": "../data/Arabic/offenseval-ar-dev-v1.tsv",
    "Greek": "../data/Greek/dev.tsv",
}

# One record per language. Kept under its own name so that `df_results` only
# ever refers to the final DataFrame, never to the intermediate list.
records = []

for lang, path in datasets.items():
    print(lang)
    loss, acc, f1, pos_f1, neg_f1 = evaluate_dataset(
        model, TEXT, path
    )
    print(f'Test Loss: {loss:.3f}  Acc: {acc*100:.2f}% Macro F1: {f1:.3f} Pos F1 {pos_f1:.3f} Neg F1 {neg_f1:.3f}')
    # Loss and Neg F1 are only logged above; the summary table keeps
    # accuracy, macro F1 and positive-class F1.
    records.append({
        "lang": lang, "Accuracy": acc, "Macro-F1": f1, "Pos-F1": pos_f1})

# Show every float with three decimals in the rendered table.
pd.options.display.float_format = '{:,.3f}'.format

# Build the frame in one step and index it by language; `set_index` returns a
# new frame, so no in-place mutation is needed.
df_results = pd.DataFrame(records).set_index("lang")

df_results

English
Loading dataset...
Building iterators
Test Loss: 0.679  Acc: 82.33% Macro F1: 0.783 Pos F1 0.690 Neg F1 0.876
Danish
Loading dataset...
Building iterators
Test Loss: 0.557  Acc: 87.67% Macro F1: 0.682 Pos F1 0.434 Neg F1 0.931
Turkish
Loading dataset...
Building iterators
Test Loss: 0.739  Acc: 80.04% Macro F1: 0.506 Pos F1 0.125 Neg F1 0.887
Arabic
Loading dataset...
Building iterators
Test Loss: 0.668  Acc: 82.20% Macro F1: 0.482 Pos F1 0.063 Neg F1 0.902
Greek
Loading dataset...
Building iterators
Test Loss: 0.858  Acc: 71.87% Macro F1: 0.475 Pos F1 0.118 Neg F1 0.833


Unnamed: 0_level_0,Accuracy,Macro-F1,Pos-F1
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
English,0.823,0.783,0.69
Danish,0.877,0.682,0.434
Turkish,0.8,0.506,0.125
Arabic,0.822,0.482,0.063
Greek,0.719,0.475,0.118


In [3]:
# Emit the results table as plain text rather than the rich HTML rendering
# (presumably so it can be copied into the markdown summary -- confirm).
print(df_results.to_string())

         Accuracy  Macro-F1  Pos-F1
lang                               
English     0.823     0.783   0.690
Danish      0.877     0.682   0.434
Turkish     0.800     0.506   0.125
Arabic      0.822     0.482   0.063
Greek       0.719     0.475   0.118
