In [2]:
from datasets import get_dataset_config_names

# List every configuration name registered under the "xtreme" dataset and
# report how many there are.
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [3]:
# Keep only the configurations whose name begins with "PAN" and preview the first three
panx_subsets = list(filter(lambda name: name.startswith("PAN"), xtreme_subsets))
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [4]:
from datasets import load_dataset

# Load the German (.de) PAN-X configuration of XTREME. The result is not
# assigned; as the last expression of the cell it is only displayed.
load_dataset("xtreme", name = "PAN-X.de")

Downloading and preparing dataset xtreme/PAN-X.de (download: 223.17 MiB, generated: 9.08 MiB, post-processed: Unknown size, total: 232.25 MiB) to C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Downloading data: 100%|██████████| 234M/234M [01:10<00:00, 3.31MB/s] 
                                                                                          

Dataset xtreme downloaded and prepared to C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 227.77it/s]


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [6]:
# Build a "Swiss" corpus by downsampling the de/fr/it/en PAN-X corpora

from collections import defaultdict
from datasets import DatasetDict

langs = ["de", "fr", "it", "en"]
# Fraction of each corpus to keep, in the same order as `langs`
# (the proportion of speakers of each language in Switzerland)
fracs = [0.629, 0.229, 0.084, 0.059]
# An unseen language key is filled with an empty DatasetDict on first access
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Monolingual PAN-X corpus for this language
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split in ds:
        # Shuffle with a fixed seed, then keep the first frac * num_rows rows
        n_keep = int(frac * ds[split].num_rows)
        shuffled = ds[split].shuffle(seed=0)
        panx_ch[lang][split] = shuffled.select(range(n_keep))


Reusing dataset xtreme (C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)
100%|██████████| 3/3 [00:00<00:00, 376.88it/s]
Loading cached shuffled indices for dataset at C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.de\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4\cache-e5ddf09f1ae095ec.arrow


Downloading and preparing dataset xtreme/PAN-X.fr (download: 223.17 MiB, generated: 6.37 MiB, post-processed: Unknown size, total: 229.53 MiB) to C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.fr\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


                                                                                          

Dataset xtreme downloaded and prepared to C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.fr\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 364.25it/s]


Downloading and preparing dataset xtreme/PAN-X.it (download: 223.17 MiB, generated: 7.35 MiB, post-processed: Unknown size, total: 230.52 MiB) to C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.it\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


                                                                                          

Dataset xtreme downloaded and prepared to C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.it\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 252.92it/s]


Downloading and preparing dataset xtreme/PAN-X.en (download: 223.17 MiB, generated: 7.30 MiB, post-processed: Unknown size, total: 230.47 MiB) to C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.en\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


                                                                                          

Dataset xtreme downloaded and prepared to C:\Users\61417\.cache\huggingface\datasets\xtreme\PAN-X.en\1.0.0\29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 433.59it/s]


In [9]:
import pandas as pd

# One single-element column per language holding its training-set row count
train_sizes = {}
for lang in langs:
    train_sizes[lang] = [panx_ch[lang]["train"].num_rows]

pd.DataFrame(train_sizes, index=["Number of training examples"])

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


In [13]:
# First example of the German training split
element = panx_ch["de"]["train"][0]

# Print each field of the example as "name: value"
for k in element:
    v = element[k]
    print(f"{k}: {v}")

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [14]:
# Print the declared feature type of every column in the German training split
de_train_features = panx_ch["de"]["train"].features
for k in de_train_features:
    v = de_train_features[k]
    print(f"{k},{v}")


tokens,Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags,Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs,Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [16]:
# Feature describing a single NER tag (the inner feature of the `ner_tags` sequence)
tags = panx_ch["de"]["train"].features["ner_tags"].feature


def create_tag_names(batch):
    """Return a new `ner_tags_str` column with the tag name for each tag id.

    NOTE(review): despite the parameter name, `map` is called below without
    `batched=True`, so `batch` is presumably a single example whose
    `ner_tags` is a flat list of integer ids — confirm.
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}


# Apply the conversion to every split of the German corpus
panx_de = panx_ch["de"].map(create_tag_names)
print(panx_de)

100%|██████████| 12580/12580 [00:02<00:00, 5331.24ex/s]
100%|██████████| 6290/6290 [00:01<00:00, 5050.93ex/s]
100%|██████████| 6290/6290 [00:01<00:00, 5450.34ex/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 6290
    })
})





In [17]:
# Show the tokens of the first German training example above their tag names
de_example = panx_de["train"][0]
pd.DataFrame(
    data=[de_example["tokens"], de_example["ner_tags_str"]],
    index=["Tokens", "Tags"],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [18]:
from collections import Counter

# For each split, count how many entities of each type begin in it
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    # A tag starting with "B" opens an entity; the text after "-" is its type
    entity_types = (
        tag.split("-")[1]
        for row in dataset["ner_tags_str"]
        for tag in row
        if tag.startswith("B")
    )
    for entity_type in entity_types:
        split2freqs[split][entity_type] += 1

pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


In [19]:
from transformers import AutoTokenizer

# Checkpoint names for the two tokenizers being compared
bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"

# Load both tokenizers, BERT first and then XLM-R
bert_tokenizer, xlmr_tokenizer = map(
    AutoTokenizer.from_pretrained,
    (bert_model_name, xlmr_model_name),
)

Downloading tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 9.72kB/s]
Downloading config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 
Downloading vocab.txt: 100%|██████████| 208k/208k [00:00<00:00, 231kB/s]  
Downloading tokenizer.json: 100%|██████████| 426k/426k [00:01<00:00, 365kB/s]  
Downloading config.json: 100%|██████████| 615/615 [00:00<00:00, 619kB/s]
Downloading sentencepiece.bpe.model: 100%|██████████| 4.83M/4.83M [00:06<00:00, 742kB/s] 
Downloading tokenizer.json: 100%|██████████| 8.68M/8.68M [00:11<00:00, 800kB/s] 


In [23]:
text = "Alan Tudge loves Sydney!"

# Tokenize the same sentence with both tokenizers and show the results row by row
bert_tokens, xlmr_tokens = (
    tokenizer(text).tokens() for tokenizer in (bert_tokenizer, xlmr_tokenizer)
)
pd.DataFrame(data=[bert_tokens, xlmr_tokens])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,[CLS],Alan,Tu,##dge,loves,Sydney,!,[SEP],
1,<s>,▁Alan,▁Tud,ge,▁love,s,▁Sydney,!,</s>


In [30]:
# NOTE(review): `test` is not assigned in any cell visible in this notebook and
# the execution count is out of sequence with the cells above — presumably
# leftover debugging state; confirm and remove. The displayed value carries a
# `session_token` field, so clear this cell's output before sharing.
test

{'success': True, 'session_token': '<redacted>'}