In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install ipywidgets seqeval

In [None]:
from datasets import DatasetDict
from collections import defaultdict
import matplotlib.pyplot as plt
import torch
from transformers import TrainingArguments


# 1. Load dataset

In [None]:
#%cd /kaggle/input/xtremepanx/save

In [None]:
languages = ['en', 'vi', 'fr', 'nl', 'zh']

In [None]:
# One DatasetDict per language code, each read from its saved copy on disk.
panx_ch = defaultdict(DatasetDict)
for lang in languages:
    dataset_path = f'/kaggle/input/xtremepanx/save/{lang}_datasets'
    panx_ch[lang] = DatasetDict.load_from_disk(dataset_path)

In [None]:
panx_ch

In [None]:
example = panx_ch["vi"]["train"][80]
print(example)

## 1.1 Create tag names

In [None]:
tags = panx_ch["en"]["train"].features["ner_tags"].feature

def create_tag_names(batch):
    """Translate the integer NER tags of one example into their string names.

    Reads ``batch["ner_tags"]`` and converts every id with ``tags.int2str``.
    Returns a dict with the single key ``"ner_tags_str"`` holding the list of
    names, in the same order as the input ids.
    """
    tag_strings = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_strings}

In [None]:
panx_en = panx_ch["en"].map(create_tag_names)

In [None]:
indices_to_print = [19, 4234, 12904]

# Print a few English training examples as a two-row table
# (tokens on the first row, tag names on the second), separated by a rule.
separator = f"\n{'=' * 50}\n"
for position in indices_to_print:
    sample = panx_en["train"][position]
    table = pd.DataFrame(
        [sample["tokens"], sample["ner_tags_str"]],
        index=['Tokens', 'Tag names'],
    )
    print(table)
    print(separator)


# 2. Model

## 2.1. XLMRoberta with Token classification head

In [None]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder body with a dropout + linear head applied to every token.

    The head maps each token's hidden state to ``config.num_labels`` logits.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder (without pooling layer) and the classification head."""
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body; the pooling layer is not needed for per-token outputs.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Per-token classification head.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialise the weights of the modules created above.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Run the encoder and the head; compute the loss when labels are given.

        Extra keyword arguments are forwarded unchanged to the encoder.
        Returns a ``TokenClassifierOutput`` carrying the loss (``None`` when no
        labels are passed), the per-token logits, and the encoder's hidden
        states and attentions.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # The first element of the encoder output holds the per-token states.
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten to (tokens, num_labels) vs. (tokens,) for cross-entropy.
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

## 2.2. Load body config and weights

In [None]:
# Lookup tables in both directions between integer tag ids and tag names.
index2tag = dict(enumerate(tags.names))
tag2index = {name: position for position, name in index2tag.items()}

In [None]:
xlmr_model_name = "xlm-roberta-base"

In [None]:
from transformers import AutoConfig

# Pretrained configuration, extended with the number of tag classes and the
# id <-> label lookup tables for the NER tag set.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

In [None]:
xlmr_config

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# xlmr_model = (XLMRobertaForTokenClassification
#               .from_pretrained(xlmr_model_name, config=xlmr_config)
#               .to(device))

# 3. Tokenize and align

## 3.1 Load tokenizer

In [None]:
from transformers import AutoTokenizer

xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

## 3.2 Tokenize and align labels
Tokenize the inputs into subwords, keep each word's tag on its first subword, and set the label of special tokens and of all subsequent subwords to -100.

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and spread word-level tags over the sub-tokens.

    For every example in the batch, the first sub-token of each word receives
    that word's tag from ``examples["ner_tags"]``. Positions whose word id is
    ``None`` and positions that repeat the previous position's word id receive
    -100. The aligned ids are stored under ``"labels"`` in the tokenizer output,
    which is returned.
    """
    tokenized_inputs = xlmr_tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    aligned_labels = []
    for batch_index, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        # Pair every position with the word id of the position before it.
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append([
            -100 if (current is None or current == before) else word_tags[current]
            for current, before in zip(word_ids, preceding_ids)
        ])

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
def encode_panx_dataset(corpus):
    """Apply ``tokenize_and_align_labels`` to ``corpus`` in batches.

    The raw ``ner_tags`` and ``tokens`` columns are removed from the result.
    """
    raw_columns = ['ner_tags', 'tokens']
    return corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_columns,
    )

In [None]:
panx_en_encoded = encode_panx_dataset(panx_ch["en"])

In [None]:
panx_en_encoded["train"][1]

# 4. Finetune

## 4.1. Training Arguments

In [None]:
import os

from huggingface_hub import login

# Do not paste the access token into the notebook. It is read from the
# HF_TOKEN environment variable; when that variable is unset, token=None is
# passed and login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


## 4.2 Define Metrics

In [None]:
def align_predictions(predictions, label_ids):
    """Convert model outputs and gold label ids into lists of tag names.

    ``predictions`` is reduced with argmax over axis 2 to one predicted id per
    position. Every position whose gold id is -100 is dropped; the remaining
    ids are mapped through ``index2tag``.

    Returns:
        ``(preds_list, labels_list)``: two lists with one list of tag names
        per example, predictions first.
    """
    predicted_ids = np.argmax(predictions, axis=2)
    num_examples, num_positions = predicted_ids.shape

    preds_list, labels_list = [], []
    for row in range(num_examples):
        # Columns of this example that carry a real (non -100) label.
        kept = [col for col in range(num_positions)
                if label_ids[row, col] != -100]
        labels_list.append([index2tag[label_ids[row][col]] for col in kept])
        preds_list.append([index2tag[predicted_ids[row][col]] for col in kept])

    return preds_list, labels_list

In [None]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    """Return ``{"f1": ...}`` for the predictions and labels in ``eval_pred``.

    The predictions and label ids are first converted to tag-name sequences
    with ``align_predictions``; the score is then computed with ``f1_score``.
    """
    predicted_tags, gold_tags = align_predictions(
        eval_pred.predictions, eval_pred.label_ids
    )
    score = f1_score(gold_tags, predicted_tags)
    return {"f1": score}

In [None]:
def get_f1_score(trainer, dataset):
    """Run ``trainer.predict`` on ``dataset`` and return its ``"test_f1"`` metric."""
    prediction_output = trainer.predict(dataset)
    return prediction_output.metrics["test_f1"]

In [None]:
#dynamically pad the inputs received, as well as the labels
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [None]:
def model_init():
    """Create a new model from ``xlmr_model_name`` / ``xlmr_config`` on ``device``."""
    model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config
    )
    return model.to(device)

In [None]:
%env TOKENIZERS_PARALLELISM=false

In [None]:
!wandb login ...

In [None]:
from transformers import EarlyStoppingCallback

# Early-stopping callback handed to the Trainers below:
# patience of 2, improvement threshold of 0.0.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0,
)

In [None]:
num_epochs = 4
batch_size = 16
model_name = "xlm-roberta-base-finetuned-panx"

# Shared training configuration; later cells adjust output_dir, logging_steps
# and num_train_epochs on this object before each run.
# Evaluation and checkpointing both happen once per epoch, so no step-based
# save interval is passed (save_steps only applies to save_strategy="steps").
training_args = TrainingArguments(
    output_dir=model_name,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    save_only_model=True,
    # Reload the checkpoint with the best "f1" once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=True,
)


# Finetune on each language

In [None]:
from transformers import Trainer

def train_model(dataset):
    """Fine-tune a freshly initialised model on ``dataset`` and score it.

    Args:
        dataset: mapping with ``"train"``, ``"validation"`` and ``"test"``
            splits. Training uses the first, per-epoch evaluation the second,
            and the reported score the third.

    Returns:
        A one-row DataFrame with columns ``num_samples`` (size of the training
        split) and ``f1_score`` (the ``test_f1`` metric on the test split).

    Side effects:
        Overwrites ``training_args.logging_steps`` and, when
        ``training_args.push_to_hub`` is set, pushes the trained model.
    """
    train_ds = dataset["train"]
    valid_ds = dataset["validation"]
    test_ds = dataset["test"]
    # One log entry per len(train)/batch_size steps; never let the interval
    # drop to 0 when the training split is smaller than a single batch.
    training_args.logging_steps = max(1, len(train_ds) // batch_size)

    trainer = Trainer(model_init=model_init, args=training_args,
        data_collator=data_collator, compute_metrics=compute_metrics,
        train_dataset=train_ds, eval_dataset=valid_ds, tokenizer=xlmr_tokenizer,
        callbacks=[early_stopping_callback])
    trainer.train()
    if training_args.push_to_hub:
        trainer.push_to_hub(commit_message="Training completed!")

    # Named test_f1 so the imported seqeval f1_score function is not shadowed.
    test_f1 = get_f1_score(trainer, test_ds)
    return pd.DataFrame.from_dict(
        {"num_samples": [len(train_ds)], "f1_score": [test_f1]})

In [None]:
langs = ['en', 'vi', 'fr', 'nl', 'zh']

In [None]:
import gc

# Collect unreachable Python objects first, then ask PyTorch to release the
# GPU memory it is caching but no longer using. No autograd context is needed
# here because no tensor operation takes place.
gc.collect()
torch.cuda.empty_cache()


In [None]:
f1_scores = defaultdict(dict)

In [None]:
# for lang in langs:
#     training_args.output_dir = f"xlm-roberta-base-finetuned-panx-{lang}"
#     # Fine-tune on monolingual corpus
#     ds_encoded = encode_panx_dataset(panx_ch[lang])
#     #metrics = train_model(ds_encoded)
#     # Collect F1-scores in common dict
#     f1_scores[lang][lang] = metrics["f1_score"][0]


## Finetune on all languages

In [None]:
# Encoded corpus for every language, in the same order as `langs`.
corpora = [encode_panx_dataset(panx_ch[lang]) for lang in langs]


In [None]:
from datasets import concatenate_datasets

def concatenate_splits(corpora):
    """Merge several corpora split by split into one shuffled DatasetDict.

    The split names are taken from the first corpus. For each of them the
    corresponding splits of all corpora are concatenated and shuffled with
    seed 42.
    """
    split_names = corpora[0].keys()
    merged = {
        name: concatenate_datasets(
            [corpus[name] for corpus in corpora]).shuffle(seed=42)
        for name in split_names
    }
    return DatasetDict(merged)

In [None]:
corpora_encoded = concatenate_splits(corpora)

In [None]:
# Point the shared training arguments at the multilingual run.
training_args.output_dir = "xlm-roberta-base-finetuned-panx-all"
training_args.num_train_epochs = 100
training_args.logging_steps = len(corpora_encoded["train"]) // batch_size

# Training starts from the weights stored under this checkpoint name.
modelall = XLMRobertaForTokenClassification.from_pretrained(
    'ladoza03/xlm-roberta-base-finetuned-panx-all')

trainer = Trainer(
    model=modelall,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=xlmr_tokenizer,
    train_dataset=corpora_encoded["train"],
    eval_dataset=corpora_encoded["validation"],
    callbacks=[early_stopping_callback],
)

trainer.train()
trainer.push_to_hub(commit_message="Training completed!")

# Evaluate

In [None]:
modelall = XLMRobertaForTokenClassification.from_pretrained('ladoza03/xlm-roberta-base-finetuned-panx-all')

In [None]:
# Trainer wrapping the multilingual model. No train/eval datasets are passed;
# it is used further down only through get_f1_score (i.e. trainer.predict).
trainer_all = Trainer(model=modelall, args=training_args,
    data_collator=data_collator, compute_metrics=compute_metrics,
    tokenizer=xlmr_tokenizer)


In [None]:
# Load one fine-tuned checkpoint per language (en, fr, vi, nl, zh); each one
# is wrapped in its own Trainer in the next cell.
model_en = XLMRobertaForTokenClassification.from_pretrained('ladoza03/xlm-roberta-base-finetuned-panx-en')
model_fr = XLMRobertaForTokenClassification.from_pretrained('ladoza03/xlm-roberta-base-finetuned-panx-fr')
model_vi = XLMRobertaForTokenClassification.from_pretrained('ladoza03/xlm-roberta-base-finetuned-panx-vi')
model_nl = XLMRobertaForTokenClassification.from_pretrained('ladoza03/xlm-roberta-base-finetuned-panx-nl')
model_zh = XLMRobertaForTokenClassification.from_pretrained('ladoza03/xlm-roberta-base-finetuned-panx-zh')

In [None]:
def _build_eval_trainer(model):
    """Wrap ``model`` in a Trainer sharing the notebook's args, collator, metrics and tokenizer."""
    return Trainer(model=model, args=training_args,
                   data_collator=data_collator, compute_metrics=compute_metrics,
                   tokenizer=xlmr_tokenizer)


# One Trainer per monolingual model, built identically so the scores are
# comparable. No datasets are attached; they are only used for prediction.
trainer_en = _build_eval_trainer(model_en)
trainer_vi = _build_eval_trainer(model_vi)
trainer_fr = _build_eval_trainer(model_fr)
trainer_zh = _build_eval_trainer(model_zh)
trainer_nl = _build_eval_trainer(model_nl)



In [None]:
def get_f1_score(trainer, dataset):
    """Return the ``"test_f1"`` metric of ``trainer.predict(dataset)``.

    Note: this re-declares a helper of the same name and behaviour that is
    already defined earlier in this notebook.
    """
    prediction_output = trainer.predict(dataset)
    return prediction_output.metrics["test_f1"]

In [None]:
def evaluate_lang_performance(lang, trainer):
    """Encode the ``lang`` corpus and return ``trainer``'s F1 on its test split."""
    encoded_corpus = encode_panx_dataset(panx_ch[lang])
    test_split = encoded_corpus["test"]
    return get_f1_score(trainer, test_split)

# Zero shot

In [None]:
f1_scores = defaultdict(dict)

In [None]:
f1_scores["en"]["en"] = evaluate_lang_performance("en", trainer_en)
print(f"F1-score of [en] model on [en] dataset: {f1_scores['en']['en']:.3f}")

In [None]:
f1_scores["en"]["vi"] = evaluate_lang_performance("vi", trainer_en)
print(f"F1-score of [en] model on [vi] dataset: {f1_scores['en']['vi']:.3f}")

In [None]:
f1_scores["en"]["nl"] = evaluate_lang_performance("nl", trainer_en)
print(f"F1-score of [en] model on [nl] dataset: {f1_scores['en']['nl']:.3f}")

In [None]:
f1_scores["en"]["fr"] = evaluate_lang_performance("fr", trainer_en)
print(f"F1-score of [en] model on [fr] dataset: {f1_scores['en']['fr']:.3f}")

In [None]:
f1_scores["en"]["zh"] = evaluate_lang_performance("zh", trainer_en)
print(f"F1-score of [en] model on [zh] dataset: {f1_scores['en']['zh']:.3f}")

# Monolingual and Multilingual

In [None]:
f1_scores["vi"]["vi"] = evaluate_lang_performance("vi", trainer_vi)
print(f"F1-score of [vi] model on [vi] dataset: {f1_scores['vi']['vi']:.3f}")

In [None]:
f1_scores["fr"]["fr"] = evaluate_lang_performance("fr", trainer_fr)
print(f"F1-score of [fr] model on [fr] dataset: {f1_scores['fr']['fr']:.3f}")

In [None]:
f1_scores["nl"]["nl"] = evaluate_lang_performance("nl", trainer_nl)
print(f"F1-score of [nl] model on [nl] dataset: {f1_scores['nl']['nl']:.3f}")

In [None]:
f1_scores["zh"]["zh"] = evaluate_lang_performance("zh", trainer_zh)
print(f"F1-score of [zh] model on [zh] dataset: {f1_scores['zh']['zh']:.3f}")

In [None]:
# Score the multilingual model on each language's test split; `corpora` was
# built in the same order as `langs`, so the two can be walked in parallel.
for lang, encoded_corpus in zip(langs, corpora):
    f1_scores["all"][lang] = get_f1_score(trainer_all, encoded_corpus["test"])

In [None]:
# Table with one row ("all") and one column per evaluated language.
scores_data = {"all": f1_scores["all"]}
f1_scores_df = (
    pd.DataFrame(scores_data)
    .T
    .round(4)
    .rename_axis(index="Fine-tune on", columns="Evaluated on")
)
f1_scores_df

In [None]:
# Rows: English-only model, per-language models (each on its own language),
# and the multilingual model. Columns: the language evaluated on.
monolingual_scores = {lang: f1_scores[lang][lang] for lang in langs}
scores_data = {
    "en": f1_scores["en"],
    "each": monolingual_scores,
    "all": f1_scores["all"],
}
f1_scores_df = (
    pd.DataFrame(scores_data)
    .T
    .round(4)
    .rename_axis(index="Fine-tune on", columns="Evaluated on")
)
f1_scores_df

## When is zero-shot transfer better than a monolingual model?

In [None]:
def train_on_subset(dataset, num_samples):
    """Fine-tune a fresh model on a random subset of the training split.

    Args:
        dataset: mapping with ``"train"``, ``"validation"`` and ``"test"``
            splits.
        num_samples: number of training examples to keep; they are the first
            ``num_samples`` rows after shuffling the split with seed 42.

    Returns:
        A one-row DataFrame with columns ``num_samples`` (size of the subset)
        and ``f1_score`` (the ``test_f1`` metric on the full test split).

    Side effects:
        Overwrites ``training_args.logging_steps`` and, when
        ``training_args.push_to_hub`` is set, pushes the trained model.
    """
    train_ds = dataset["train"].shuffle(seed=42).select(range(num_samples))
    valid_ds = dataset["validation"]
    test_ds = dataset["test"]
    # One log entry per len(train)/batch_size steps; never let the interval
    # drop to 0 when the subset is smaller than a single batch.
    training_args.logging_steps = max(1, len(train_ds) // batch_size)

    trainer = Trainer(model_init=model_init, args=training_args,
        data_collator=data_collator, compute_metrics=compute_metrics,
        train_dataset=train_ds, eval_dataset=valid_ds, tokenizer=xlmr_tokenizer)
    trainer.train()
    if training_args.push_to_hub:
        trainer.push_to_hub(commit_message="Training completed!")

    # Named test_f1 so the imported seqeval f1_score function is not shadowed.
    test_f1 = get_f1_score(trainer, test_ds)
    return pd.DataFrame.from_dict(
        {"num_samples": [len(train_ds)], "f1_score": [test_f1]})

In [None]:
# NOTE: despite the "nl" in its name, this variable holds the encoded French
# ("fr") corpus; the subset experiment below is run on French.
panx_nl_encoded = encode_panx_dataset(panx_ch["fr"])

In [None]:
from transformers import Trainer

# Give the subset runs their own output directory. At this point
# training_args.output_dir still names the "...-panx-all" run, so with
# push_to_hub enabled every subset model would otherwise be written over that
# run's checkpoints and pushed under its repository name.
training_args.output_dir = "xlm-roberta-base-finetuned-panx-fr-subset"
training_args.num_train_epochs = 3

# Train on increasingly large subsets and combine the one-row result frames in
# a single concat, rather than appending to an initially empty frame in a loop.
subset_sizes = [250, 500, 1000, 2000, 4000]
subset_results = [train_on_subset(panx_nl_encoded, num_samples)
                  for num_samples in subset_sizes]
metrics_df = pd.concat(subset_results, ignore_index=True)


In [None]:
fig, ax = plt.subplots()
# Zero-shot baseline: F1 of the English-only model on the French test set,
# taken from the score computed earlier in this notebook instead of a
# hard-coded number that can go stale between runs.
ax.axhline(f1_scores["en"]["fr"], ls="--", color="r")
metrics_df.set_index("num_samples").plot(ax=ax)
ax.legend(["Zero-shot from English", "Fine-tuned on French"], loc="lower right")
ax.set_ylim((0, 1))
ax.set_xlabel("Number of Training Samples")
ax.set_ylabel("F1 Score")
ax.set_title("Zero-shot transfer vs. fine-tuning on French subsets")
plt.show()