In [2]:
from transformers import AutoModelForMaskedLM

# Checkpoint identifier; the same name is reused further down to load the tokenizer.
model_checkpoint = "xlm-roberta-base"
# Load the checkpoint through the masked-LM auto class (the log below reports
# the pooler weights of the checkpoint as unused by this head).
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Report the size of the loaded model, expressed in millions of parameters.
xlmr_params = model.num_parameters()
params_in_millions = xlmr_params / 1_000_000
print(f"XLM-R Base has {params_in_millions}M parameters ")

XLM-R Base has 278.295186M parameters 


In [4]:
# Probe sentence for the fill-mask demo. The literal "<mask>" has to equal
# tokenizer.mask_token, because the prediction cell locates it via
# tokenizer.mask_token_id and substitutes it with text.replace(tokenizer.mask_token, ...).
text = "I am a <mask>"

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint name that was used for the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
import torch

# Fill-mask demo: print the probe sentence with each of the 5 highest-scoring
# candidate tokens substituted for the mask.
inputs = tokenizer(text, return_tensors="pt")

# Inference only: disable autograd so no computation graph is built or kept
# alive alongside the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits

# torch.where on the 2-D comparison returns (row indices, column indices);
# element [1] is the position of the mask token along the second axis.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
# Logits of the first (only) sequence at the mask position(s), over the last axis.
mask_token_logits = token_logits[0, mask_token_index, :]

# Ids of the 5 largest logits for the first mask position.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> I am a woman'
'>>> I am a girl'
'>>> I am a blogger'
'>>> I am a female'
'>>> I am a lesbian'


In [7]:
from datasets import load_dataset, Dataset, DatasetDict, Features, Value
import os, csv
import pandas as pd

# Root folder of the corpus; expected layout is <data_dir>/<SPLIT>/<LANG>_<SPLIT>.tsv.
data_dir = "../data"
langs = ["NR", "SS", "XH", "ZU"]
datasets = {}

for lang in langs:
    # Build both splits from the single `data_dir` setting instead of repeating
    # a hard-coded path per split. QUOTE_NONE makes pandas treat quote
    # characters in the TSV as ordinary text rather than field delimiters.
    lang_set = {
        split: Dataset.from_pandas(
            pd.read_csv(
                f"{data_dir}/{split}/{lang}_{split}.tsv",
                delimiter="\t",
                quoting=csv.QUOTE_NONE,
            )
        )
        for split in ("TRAIN", "TEST")
    }

    datasets[lang] = DatasetDict(lang_set)

# Display the per-language DatasetDict summary as the cell output.
datasets

{'NR': DatasetDict({
     TRAIN: Dataset({
         features: ['word', 'seg', 'parse'],
         num_rows: 44663
     })
     TEST: Dataset({
         features: ['word', 'seg', 'parse'],
         num_rows: 5026
     })
 }),
 'SS': DatasetDict({
     TRAIN: Dataset({
         features: ['word', 'seg', 'parse'],
         num_rows: 42596
     })
     TEST: Dataset({
         features: ['word', 'seg', 'parse'],
         num_rows: 4789
     })
 }),
 'XH': DatasetDict({
     TRAIN: Dataset({
         features: ['word', 'seg', 'parse'],
         num_rows: 43825
     })
     TEST: Dataset({
         features: ['word', 'seg', 'parse'],
         num_rows: 4910
     })
 }),
 'ZU': DatasetDict({
     TRAIN: Dataset({
         features: ['word', 'seg', 'parse'],
         num_rows: 44142
     })
     TEST: Dataset({
         features: ['word', 'seg', 'parse'],
         num_rows: 4955
     })
 })}

In [28]:
def tokenize_function(examples):
    """Tokenize the ``seg`` entry of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping that provides a ``"seg"`` key; its value is handed
            to the tokenizer unchanged (presumably a batch of strings when
            used with ``Dataset.map(..., batched=True)`` -- confirm at the
            call site).

    Returns:
        The tokenizer output. When the tokenizer reports ``is_fast``, a
        ``"word_ids"`` entry is added, holding ``word_ids(i)`` for every
        index ``i`` of ``"input_ids"``.
    """
    encoded = tokenizer(examples["seg"])
    if not tokenizer.is_fast:
        # No word-id lookup is attempted for non-fast tokenizers.
        return encoded

    n_sequences = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(idx) for idx in range(n_sequences)]
    return encoded


# Sanity check: encode one entry of the "word" column and decode it back.
# Index the row first and the column second, so only that single row is read
# instead of materialising the entire "word" column to pick one element.
x = tokenizer.encode(datasets["NR"]["TRAIN"][1]["word"])
print(x)
print(tokenizer.decode(x))

# Disabled for now: tokenizing the full NR dataset with tokenize_function.
# Use batched=True to activate fast multithreading!
# tokenized_datasets = datasets["NR"].map(
#     tokenize_function, batched=True, remove_columns=["word", "seg", "parse"]
# )
# tokenized_datasets

[0, 6, 4, 2]
<s>,</s>
