In [27]:
import pandas as pd
import json
from datasets import load_dataset, load_from_disk, DatasetDict
import os

- get data:
    - `./download_data.sh --task all`

In [3]:
# Directory handed to `load_dataset(..., cache_dir=...)` in the cells below.
CACHE_DIR = ".cache"

### 1. parsing

In [4]:
# Training treebanks for the parsing task (one entry per line for easier diffing).
train_langs = [
    "UD_Armenian-ArmTDP",
    "UD_Norwegian-Nynorsk",
    "UD_Portuguese-Bosque",
    "UD_Italian-PoSTWITA",
    "UD_Old_French-SRCMF",
    "UD_North_Sami-Giella",
    "UD_Norwegian-Bokmaal",
    "UD_French-ParisStories",
    "UD_Italian-MarkIT",
    "UD_Chinese-GSDSimp",
    "UD_English-EWT",
    "UD_French-Rhapsodie",
    "UD_French-ParTUT",
    "UD_Classical_Chinese-Kyoto",
    "UD_Norwegian-NynorskLIA",
    "UD_Arabic-NYUAD",
    "UD_Portuguese-PetroGold",
    "UD_Italian-TWITTIRO",
    "UD_Turkish_German-SAGT",
    "UD_Maghrebi_Arabic_French-Arabizi",
    "UD_Portuguese-CINTIL",
    "UD_Ligurian-GLT",
    "UD_Dutch-Alpino",
    "UD_Western_Armenian-ArmTDP",
    "UD_Portuguese-GSD",
    "singlish",
    "UD_Arabic-PADT",
]

In [5]:
# Display the parsing training treebanks defined above.
print(train_langs)

['UD_Armenian-ArmTDP', 'UD_Norwegian-Nynorsk', 'UD_Portuguese-Bosque', 'UD_Italian-PoSTWITA', 'UD_Old_French-SRCMF', 'UD_North_Sami-Giella', 'UD_Norwegian-Bokmaal', 'UD_French-ParisStories', 'UD_Italian-MarkIT', 'UD_Chinese-GSDSimp', 'UD_English-EWT', 'UD_French-Rhapsodie', 'UD_French-ParTUT', 'UD_Classical_Chinese-Kyoto', 'UD_Norwegian-NynorskLIA', 'UD_Arabic-NYUAD', 'UD_Portuguese-PetroGold', 'UD_Italian-TWITTIRO', 'UD_Turkish_German-SAGT', 'UD_Maghrebi_Arabic_French-Arabizi', 'UD_Portuguese-CINTIL', 'UD_Ligurian-GLT', 'UD_Dutch-Alpino', 'UD_Western_Armenian-ArmTDP', 'UD_Portuguese-GSD', 'singlish', 'UD_Arabic-PADT']


In [6]:
# Load the metadata for the parsing task from JSON.
# The context manager closes the file even when json.load raises,
# which the manual open()/close() pair did not guarantee.
with open('metadata/udp_metadata.json') as f:
    metadata = json.load(f)

- we fine-tune on each language for which a train split is available
- if no train split is available, we perform zero-shot transfer from UD_English-EWT

In [7]:
# Transpose so that every top-level key of the metadata JSON becomes one row.
all_lang = pd.DataFrame(metadata).transpose()
all_lang.head(10)

Unnamed: 0,lang,code,desc,langgroup,split
singlish,singlish,eng-sing,,English,"[dev, test, train]"
UD_Armenian-ArmTDP,Armenian,hye-east,,Armenian,"[train, test, dev]"
UD_French-ParTUT,French,fre-multigenre,,French,"[train, dev, test]"
UD_English-EWT,English,eng,,English,"[dev, test, train]"
UD_Ligurian-GLT,Ligurian,lij,,Italian,"[train, test]"
UD_Gheg-GPS,Gheg,aln,,Albanian,[test]
UD_Norwegian-Nynorsk,Norwegian,nor-nynorsk,,Norwegian,"[test, dev, train]"
UD_Albanian-TSA,Albanian,alb,,Albanian,[test]
UD_Italian-PUD,Italian,ita-trans,,Italian,[test]
UD_Portuguese-Bosque,Portuguese,por-euro-bra,,Portuguese,"[test, dev, train]"


In [8]:
#explore single lang
lang='UD_Armenian-ArmTDP'
dataset = load_dataset("scripts/universal_dependencies.py", lang,
            cache_dir=CACHE_DIR)

Reusing dataset universal_dependencies (.cache/universal_dependencies/UD_Armenian-ArmTDP/2.7.0/f5558234d16160340d9861389cbd2b7a134edb3ab5a8cb1173a1715c8b095b06)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 506.93it/s]


In [9]:
# Show the splits, feature names and row counts of the loaded DatasetDict.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 1974
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 249
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 277
    })
})


- parsing uses the `tokens` and `deprel` columns

In [10]:
# First five training sentences: the tokens, a blank separator line, then the deprel column.
print(dataset["train"]["tokens"][:5], end="\n\n")
print(dataset["train"]["deprel"][:5])

[['Մտածում', 'եմ', '՝', 'Ադամի', 'ու', 'Եվայի', 'վտարումը', 'Եդեմական', 'այգուց', '(', 'դրախտից', ')', ',', 'նրանց', 'տեղափոխումն', 'այլ', 'վայր', ',', 'ուր', 'շրջակայքն', 'այլևս', 'բարեկամական', 'չէր', ',', 'այլ', 'խիստ', 'թշնամական', ',', 'ուր', 'իրենց', 'հացը', 'պիտի', 'տանջանքով', 'վաստակեին', ',', 'նույն', 'մոլորակի', 'սահմաններում', 'չէր', ',', 'որ', 'կատարվեց', ':'], ['Եդեմն', 'այլ', 'մոլորակ', 'էր', ',', 'աքսորավայրը', '՝', 'այլ', ',', 'այսինքն', '՝', 'այս', ',', 'ուր', 'այժմ', 'էլ', 'բնակվում', 'ենք', ',', 'բայց', 'միշտ', 'նայում', 'ենք', 'երկինք', '՝', 'բնազդում', 'դրոշմված', 'հիշողությամբ', 'Եդեմը', 'որոնելով', '։'], ['Իսկ', 'այն', 'չկա', ',', 'Տերը', 'պայթեցրել', 'է', 'կամ', 'գուցե', 'ամայացրել', ',', 'բնակության', 'համար', 'անպիտան', 'դարձրել', ',', 'կամ', 'էլ', 'կա', 'ու', 'ախտահանվում', 'է', '՝', 'նոր', 'բնակիչներ', 'ընդունելու', 'համար', '։'], ['Մի', 'խոսքով', '՝', 'մենք', 'դրա', 'հետ', 'էլ', 'գործ', 'չունենք', ',', 'մերը', 'չէ', 'այլևս', ',', 'մերը', 'սա', 'է', '՝', 'դ

### 2. pos tagging

In [11]:
# Training treebanks for the POS-tagging task (rebinds `train_langs` from the parsing section).
train_langs = [
    "UD_Armenian-ArmTDP",
    "UD_Norwegian-Nynorsk",
    "UD_Portuguese-Bosque",
    "UD_Italian-PoSTWITA",
    "UD_Old_French-SRCMF",
    "UD_North_Sami-Giella",
    "UD_Norwegian-Bokmaal",
    "UD_French-ParisStories",
    "UD_Italian-MarkIT",
    "UD_Chinese-GSDSimp",
    "UD_English-EWT",
    "UD_French-Rhapsodie",
    "UD_French-ParTUT",
    "UD_Classical_Chinese-Kyoto",
    "UD_Norwegian-NynorskLIA",
    "UD_Arabic-NYUAD",
    "UD_Portuguese-PetroGold",
    "UD_Italian-TWITTIRO",
    "UD_Turkish_German-SAGT",
    "UD_Maghrebi_Arabic_French-Arabizi",
    "UD_Portuguese-CINTIL",
    "UD_Ligurian-GLT",
    "UD_Dutch-Alpino",
    "UD_Western_Armenian-ArmTDP",
    "UD_Portuguese-GSD",
    "singlish",
    "UD_Arabic-PADT",
    "UD_French-GSD",
    "UD_Catalan-AnCora",
    "UD_Estonian-EDT",
    "UD_Finnish-TDT",
    "UD_Spanish-AnCora",
]

In [12]:
# Display the POS-tagging training treebanks defined above.
print(train_langs)

['UD_Armenian-ArmTDP', 'UD_Norwegian-Nynorsk', 'UD_Portuguese-Bosque', 'UD_Italian-PoSTWITA', 'UD_Old_French-SRCMF', 'UD_North_Sami-Giella', 'UD_Norwegian-Bokmaal', 'UD_French-ParisStories', 'UD_Italian-MarkIT', 'UD_Chinese-GSDSimp', 'UD_English-EWT', 'UD_French-Rhapsodie', 'UD_French-ParTUT', 'UD_Classical_Chinese-Kyoto', 'UD_Norwegian-NynorskLIA', 'UD_Arabic-NYUAD', 'UD_Portuguese-PetroGold', 'UD_Italian-TWITTIRO', 'UD_Turkish_German-SAGT', 'UD_Maghrebi_Arabic_French-Arabizi', 'UD_Portuguese-CINTIL', 'UD_Ligurian-GLT', 'UD_Dutch-Alpino', 'UD_Western_Armenian-ArmTDP', 'UD_Portuguese-GSD', 'singlish', 'UD_Arabic-PADT', 'UD_French-GSD', 'UD_Catalan-AnCora', 'UD_Estonian-EDT', 'UD_Finnish-TDT', 'UD_Spanish-AnCora']


- we fine-tune on each language for which a train split is available
- if no train split is available, we perform zero-shot transfer from UD_English-EWT

In [13]:
# Load the metadata for the POS-tagging task from JSON.
# The context manager closes the file even when json.load raises,
# which the manual open()/close() pair did not guarantee.
with open('metadata/pos_metadata.json') as f:
    metadata = json.load(f)

In [14]:
# Transpose so that every top-level key of the metadata JSON becomes one row.
all_lang = pd.DataFrame(metadata).transpose()
all_lang.head(10)

Unnamed: 0,lang,code,desc,langgroup,split,dataset
singlish,singlish,eng-sing,,English,"[dev, test, train]",ud
ROci,occitan,ROci,,French-occ,[test],noisy
UD_Armenian-ArmTDP,Armenian,hye-east,,Armenian,"[train, test, dev]",ud
UD_French-ParTUT,French,fre-multigenre,,French,"[train, dev, test]",ud
UD_English-EWT,English,eng,,English,"[dev, test, train]",ud
UD_Ligurian-GLT,Ligurian,lij,,Italian,"[train, test]",ud
UD_Gheg-GPS,Gheg,aln,,Albanian,[test],ud
UD_Norwegian-Nynorsk,Norwegian,nor-nynorsk,,Norwegian,"[test, dev, train]",ud
UD_Albanian-TSA,Albanian,alb,,Albanian,[test],ud
UD_Italian-PUD,Italian,ita-trans,,Italian,[test],ud


#### use different dataset loading script depending on the dataset

In [15]:
# Case metadata[lang]['dataset'] == 'ud': use the Universal Dependencies loading script.
script = "scripts/universal_dependencies.py"
lang = "singlish"
predict_dataset = load_dataset(script, lang, cache_dir=CACHE_DIR)

print(predict_dataset)
# First two test sentences, then their UPOS label ids.
for column in ("tokens", "upos"):
    print(predict_dataset["test"][column][:2])

Reusing dataset universal_dependencies (.cache/universal_dependencies/singlish/2.7.0/f5558234d16160340d9861389cbd2b7a134edb3ab5a8cb1173a1715c8b095b06)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 565.80it/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 2465
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 286
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 299
    })
})
[['bt', 'still', 'okie', 'la', 'if', 'go', 'hong', 'kong', '.', '.'], ['Semb', 'Corp', '3.28', 'coming', '.']]
[[10, 14, 6, 15, 5, 16, 10, 10, 1, 1], [10, 10, 3, 16, 1]]





In [16]:
# Case metadata[lang]['dataset'] == 'noisy' (ROci is listed as 'noisy' in the table above):
# use the noisy-dialect loading script, which also needs the local data directory.
script = "scripts/pos_tagging/noisy_dialect.py"
data_dir = "data/pos_tagging"
lang = "ROci"
predict_dataset = load_dataset(
    script,
    lang,
    data_dir=data_dir,
    cache_dir=CACHE_DIR,
)

print(predict_dataset)
# First two test sentences, then their UPOS label ids.
for column in ("tokens", "upos"):
    print(predict_dataset["test"][column][:2])

Using custom data configuration ROci-data_dir=data%2Fpos_tagging


Downloading and preparing dataset noisy_dialect/ROci to .cache/noisy_dialect/ROci-data_dir=data%2Fpos_tagging/1.1.0/a0e4190d7e72716271d4c2496d5dcf596262a4082468db5160758db6f295efe5...


                                                       

Dataset noisy_dialect downloaded and prepared to .cache/noisy_dialect/ROci-data_dir=data%2Fpos_tagging/1.1.0/a0e4190d7e72716271d4c2496d5dcf596262a4082468db5160758db6f295efe5. Subsequent calls will reuse this data.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 790.63it/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'tokens', 'upos'],
        num_rows: 874
    })
})
[['Puei', ',', 'sabètz', ',', 'se', 'fins_a', 'cinc', 'ans', 'avètz', 'gaire', 'de', 'sovenirs', ',', 'après', 'un', 'pichòt', 'saup', 'chifrar', 'e', 'pòu', 'jutjar', 'de', 'son', 'sicut', '.'], ['Me', 'rèstan', 'pasmens', 'quauquei', 'sovenirs', "d'", 'avans', 'cinc', 'ans', '.']]
[[14, 1, 16, 1, 5, 2, 3, 0, 16, 14, 2, 0, 1, 2, 8, 0, 16, 16, 9, 16, 16, 2, 8, 0, 1], [11, 16, 14, 11, 0, 2, 2, 3, 0, 1]]





### 3. ner

- we train one language/dialect per language/dialect group and evaluate on the whole group

In [17]:
# WikiANN language codes used for training.
wikiann_train_langs = [
    "ar", "az", "ku", "tr", "hsb",
    "nl", "fr", "zh", "en", "mhr",
    "it", "de", "pa", "es", "hr",
    "lv", "hi", "ro", "el", "bn",
]

In [18]:
# Display the WikiANN training language codes defined above.
print(wikiann_train_langs)

['ar', 'az', 'ku', 'tr', 'hsb', 'nl', 'fr', 'zh', 'en', 'mhr', 'it', 'de', 'pa', 'es', 'hr', 'lv', 'hi', 'ro', 'el', 'bn']


In [19]:
# Configurations of the Norwegian NER dataset used for training.
norwegian_train_langs = [
    "bokmaal",
    "nynorsk",
    "samnorsk",
]

In [20]:
# Display the Norwegian NER training configurations defined above.
print(norwegian_train_langs)

['bokmaal', 'nynorsk', 'samnorsk']


In [21]:
# Load the metadata for the NER task from JSON.
# The context manager closes the file even when json.load raises,
# which the manual open()/close() pair did not guarantee.
with open('metadata/ner_metadata.json') as f:
    metadata = json.load(f)

- langgroup corresponds to the training language per group

In [22]:
# Transpose so that every top-level key of the metadata JSON becomes one row.
all_lang = pd.DataFrame(metadata).transpose()
all_lang.head(10)

Unnamed: 0,lang,code,langgroup,huggingface,dataset,region,train_lang
en,english,en,english,True,wikiann,english,en
ar,arabic,ar,arabic,True,wikiann,arabic,ar
arz,egyptian arabic,arz,arabic,True,wikiann,arabic,ar
kab,kabyle,kab,arabic,False,wikiann,arabic,ar
kbd,kabardian,kbd,adyghe,False,wikiann,adyghe,en
ady,adyghe,ady,adyghe,False,wikiann,adyghe,en
az,azerbaijani,az,azerbaijani,True,wikiann,azerbaijani,az
azb,south azerbaijani,azb,azerbaijani,False,wikiann,azerbaijani,az
ckb,central kurdish,ckb,kurdish,True,wikiann,kurdish,ku
ku,kurdish,ku,kurdish,True,wikiann,kurdish,ku


- different dataset loading script for different datasets

In [23]:
## for langs not present in huggingface wikiann [huggingface=True]
lang='ar' 
script="wikiann"
predict_dataset = load_dataset(script, lang, cache_dir=CACHE_DIR)
print(predict_dataset)
print(predict_dataset['test']['tokens'][:2])
print(predict_dataset['test']['ner_tags'][:2])

Reusing dataset wikiann (.cache/wikiann/ar/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 988.45it/s]

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})
[['تعلم', 'في', 'جامعة', 'نورث', 'وسترن', 'في', '.'], ['تحويل', 'ده\u200cشهر', '(', 'مقاطعة', 'كلاردشت', ')']]
[[0, 0, 3, 4, 4, 4, 0], [0, 5, 6, 6, 6, 6]]





In [24]:
## for test langs not present in huggingface wikiann [huggingface=False]
lang='kab' 
script="scripts/ner/wikiann_og.py"
predict_dataset = load_dataset(script, lang, cache_dir=CACHE_DIR)
print(predict_dataset)
print(predict_dataset['test']['tokens'][:2])
print(predict_dataset['test']['ner_tags'][:2])

Downloading and preparing dataset wikiann_og/kab to .cache/wikiann_og/kab/1.1.0/bd5069ae42633af8e332aa82fb4f866eafd1dffc876e6bae94c18978db74e5d7...
<datasets.download.download_manager.ArchiveIterable object at 0x7fa042433d90>


                                                                 

Dataset wikiann_og downloaded and prepared to .cache/wikiann_og/kab/1.1.0/bd5069ae42633af8e332aa82fb4f866eafd1dffc876e6bae94c18978db74e5d7. Subsequent calls will reuse this data.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 792.28it/s]

DatasetDict({
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3004
    })
})
[['Aṣqif', 'n', 'Ṭmana'], ['Tizi', 'Wezzu']]
[[5, 6, 6], [5, 6]]





In [26]:
## for NORWEGIAN_NER all_lang[all_lang['dataset']=='norwegian_ner']
lang='bokmaal' 
script="scripts/ner/norwegian_ner.py"
predict_dataset = load_dataset(script, lang, cache_dir=CACHE_DIR)
print(predict_dataset)
print(predict_dataset['test']['tokens'][:2])
print(predict_dataset['test']['ner_tags'][:2])

Downloading and preparing dataset norwegian_ner/bokmaal to .cache/norwegian_ner/bokmaal/1.0.0/d326c5669fdd52e99c78eaa6c9c8a8758a48f7494bf59ed17e12814eb349e443...


Downloading data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1908.24it/s]
Extracting data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1174.22it/s]
                                                                      

Dataset norwegian_ner downloaded and prepared to .cache/norwegian_ner/bokmaal/1.0.0/d326c5669fdd52e99c78eaa6c9c8a8758a48f7494bf59ed17e12814eb349e443. Subsequent calls will reuse this data.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 516.71it/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'pos_tags', 'ner_tags'],
        num_rows: 15696
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'pos_tags', 'ner_tags'],
        num_rows: 2410
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'pos_tags', 'ner_tags'],
        num_rows: 1939
    })
})
[['Honnørordene', 'er', '"', 'dristig', 'formspråk', '"', ',', '"', 'nyskapning', '"', 'og', '"', 'livgivende', 'kontrast', '"', '.'], ['Jeg', 'ser', 'et', 'landskap', 'som', 'er', 'såret', 'og', 'i', 'tilbaketrekning', '.']]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]





### 4. dialect identification

In [42]:
# Label distribution of every file in the Arabic MADAR corpus directory.
arabic = 'data/dialect-identification/arabic/MADAR/MADAR_Corpus'
print('arabic')
for f in os.listdir(arabic):
    print(f)
    df = pd.read_csv(os.path.join(arabic, f))
    print(dict(df['label'].value_counts()), end='\n\n')

# Same report for every other folder directly under the task root.
# 'arabic' is skipped because its CSVs live in a deeper directory (handled above);
# entries whose name starts with '.' are ignored.
root = 'data/dialect-identification/'
for f in os.listdir(root):
    if f == 'arabic' or str(f).startswith('.'):
        continue
    print(f)
    for f1 in os.listdir(os.path.join(root, f)):
        print(f1)
        df = pd.read_csv(os.path.join(root, f, f1))
        print(dict(df['label'].value_counts()), end='\n\n')
    print()

arabic
test.csv
{'SAN': 200, 'JER': 200, 'BEI': 200, 'SAL': 200, 'AMM': 200, 'BAG': 200, 'RIY': 200, 'ALE': 200, 'MUS': 200, 'SFX': 200, 'ALG': 200, 'BAS': 200, 'JED': 200, 'TRI': 200, 'MOS': 200, 'ALX': 200, 'FES': 200, 'CAI': 200, 'TUN': 200, 'RAB': 200, 'MSA': 200, 'KHA': 200, 'DOH': 200, 'ASW': 200, 'DAM': 200, 'BEN': 200}

dev.csv
{'ASW': 200, 'BAS': 200, 'JER': 200, 'AMM': 200, 'TUN': 200, 'JED': 200, 'RAB': 200, 'SFX': 200, 'MSA': 200, 'DAM': 200, 'ALG': 200, 'BEI': 200, 'TRI': 200, 'KHA': 200, 'SAL': 200, 'RIY': 200, 'ALE': 200, 'MUS': 200, 'BEN': 200, 'MOS': 200, 'BAG': 200, 'FES': 200, 'CAI': 200, 'DOH': 200, 'SAN': 200, 'ALX': 200}

train.csv
{'KHA': 1600, 'RAB': 1600, 'DOH': 1600, 'DAM': 1600, 'ALX': 1600, 'AMM': 1600, 'SAL': 1600, 'JER': 1600, 'TUN': 1600, 'MUS': 1600, 'BAS': 1600, 'FES': 1600, 'TRI': 1600, 'ASW': 1600, 'MSA': 1600, 'BEI': 1600, 'RIY': 1600, 'BAG': 1600, 'BEN': 1600, 'SFX': 1600, 'ALE': 1600, 'MOS': 1600, 'CAI': 1600, 'JED': 1600, 'ALG': 1600, 'SAN': 1600}

### 5. question-answering

In [44]:
# Train/dev/test files of the SDQA gold task.
train_datapath = "data/Question-Answering/SDQA-gold-task/sdqa-train-all.json"
dev_datapath = "data/Question-Answering/SDQA-gold-task/sdqa-dev-all.json"
test_datapath = "data/Question-Answering/SDQA-gold-task/sdqa-test-all.json"

# Layout of each example's `id` field, e.g. "english-6037841464917965779-nga".
language_dialect_identifier = "{lang}-id-{dialect}"

### 6. sentiment analysis

In [45]:
# Label distribution of every CSV in the Arabic sentiment-analysis directory.
arabic = 'data/sentiment_analysis/arabic'
print('arabic')
for f in os.listdir(arabic):
    print(f)
    df = pd.read_csv(os.path.join(arabic, f))
    print(dict(df['label'].value_counts()), end='\n\n')

arabic
aeb_Latn-train.csv
{'positive': 36942, 'negative': 30589, 'neutral': 2468}

arb_arab-train.csv
{'positive': 4797, 'neutral': 1923, 'negative': 1783, 'mixed': 324}

arq_arab-test.csv
{'negative': 7512, 'positive': 7448}

ar-lb_arab-test.csv
{5: 665, 4: 232, 3: 142, 1: 99, 2: 37}

aeb_Arab-test.csv
{'positive': 1701, 'negative': 1701}

sau_arab-train.csv
{'negative': 6080, 'positive': 1487, 'neutral': 866}

jor_arab-test.csv
{'positive': 295, 'negative': 245}

ary_arab-test.csv
{'positive': 914, 'neutral': 389, 'negative': 279, 'mixed': 58}

jor_arab-train.csv
{'negative': 655, 'positive': 605}

aeb_Latn-test.csv
{'positive': 15769, 'negative': 13178, 'neutral': 1054}

ar-lb_arab-train.csv
{5: 1648, 4: 502, 3: 276, 1: 204, 2: 111}

arz_arab-test.csv
{'objective': 2028, 'negative': 486, 'neutral': 254, 'positive': 235}

arz_arab-train.csv
{'objective': 4663, 'negative': 1198, 'neutral': 578, 'positive': 564}

aeb_Arab-train.csv
{'positive': 7155, 'negative': 6516}

arb_arab-test.cs

### 7. topic classification

- we train one lang per group (identifier: langgroup) and evaluate on all other

#### we only consider the following languages from the sib-200 dataset

In [46]:
# Load the metadata for the topic-classification task from JSON.
# The context manager closes the file even when json.load raises,
# which the manual open()/close() pair did not guarantee.
with open('metadata/topic_metadata.json') as f:
    metadata = json.load(f)

In [47]:
# Transpose so that every top-level key of the metadata JSON becomes one row.
all_lang = pd.DataFrame(metadata).transpose()
all_lang.head(10)

Unnamed: 0,lang,code,langgroup
lmo_Latn,lombard,lmo_Latn,italian
eng_Latn,English,eng_Latn,English
ita_Latn,italian,ita_Latn,italian
fur_Latn,friulian,fur_Latn,italian
scn_Latn,sicilian,scn_Latn,italian
srd_Latn,sardinian,srd_Latn,italian
vec_Latn,venetian,vec_Latn,italian
azb_Arab,south,azb_Arab,azarbaijani
azj_Latn,north,azj_Latn,azarbaijani
tur_Latn,Turkish,tur_Latn,azarbaijani


In [49]:
# Label distribution of each split file for one language of the topic-classification data.
datapath = 'data/topic_class'
lang = 'lmo_Latn'

for f in os.listdir(os.path.join(datapath, lang)):
    # labels.txt is skipped; every other file is read as a CSV with a `label` column.
    if f == 'labels.txt':
        continue
    print(f)
    df = pd.read_csv(os.path.join(datapath, lang, f))
    print(dict(df['label'].value_counts()), end='\n\n')

test.csv
{'science/technology': 51, 'travel': 40, 'politics': 30, 'sports': 25, 'health': 22, 'entertainment': 19, 'geography': 17}

dev.csv
{'science/technology': 25, 'travel': 20, 'politics': 14, 'sports': 12, 'health': 11, 'entertainment': 9, 'geography': 8}

train.csv
{'science/technology': 176, 'travel': 138, 'politics': 102, 'sports': 85, 'health': 77, 'entertainment': 65, 'geography': 58}



### 8. reading comprehension

- we only consider the following evaluation languages from Belebele dataset

In [51]:
# Load the metadata for the reading-comprehension task from JSON.
# The context manager closes the file even when json.load raises,
# which the manual open()/close() pair did not guarantee.
with open('metadata/rcmc_metadata.json') as f:
    metadata = json.load(f)

In [52]:
# Transpose so that every top-level key of the metadata JSON becomes one row.
all_lang = pd.DataFrame(metadata).transpose()
all_lang.head(10)

Unnamed: 0,lang,code,langgroup
zho_Hans,Chinese (Simplified),zho_Hans,chinese
zho_Hant,Chinese (Traditional),zho_Hant,chinese
eng_Latn,English,eng_Latn,English
nso_Latn,Northern Sotho,nso_Latn,sotho
sot_Latn,Southern Sotho,sot_Latn,sotho
acm_Arab,Mesopotamian Arabic,acm_Arab,arabic
apc_Arab,North Levantine Arabic,apc_Arab,arabic
arb_Arab,MSA (Arabic),arb_Arab,arabic
ars_Arab,Najdi Arabic,ars_Arab,arabic
ary_Arab,Moroccan Arabic,ary_Arab,arabic


In [54]:
# NOTE(review): every other data path in this notebook starts with 'data/';
# confirm that 'datapath/' is really the intended root directory here.
datapath='datapath/reading-comprehension/Belebele'
sample_test_file='arb_Arab.jsonl'
# Example record in the Belebele .jsonl format.
# The raw JSON escapes slashes as "\/". Pasted into a Python string literal that is an
# invalid escape sequence (SyntaxWarning on Python 3.12+) and leaves stray backslashes
# in the URL, so the link is written with plain slashes, i.e. the value a JSON parser yields.
first_example={
    "link":"https://en.wikibooks.org/wiki/Accordion/Right_hand",
    "question_number":1,
    "flores_passage":"Make sure your hand is as relaxed as possible while still hitting all the notes correctly - also try not to make much extraneous motion with your fingers. This way, you will tire yourself out as little as possible. Remember there's no need to hit the keys with a lot of force for extra volume like on the piano. On the accordion, to get extra volume, you use the bellows with more pressure or speed.",
    "question":"According to the passage, what would not be considered an accurate tip for successfully playing the accordion?",
    "mc_answer1":"For additional volume, increase the force with which you hit the keys",
    "mc_answer2":"Keep unnecessary movement to a minimum in order to preserve your stamina",
    "mc_answer3":"Be mindful of hitting the notes while maintaining a relaxed hand",
    "mc_answer4":"Increase the speed with which you operate the bellows to achieve extra volume",
    "correct_answer_num":"1",
    "dialect":"eng_Latn",
    "ds":"2023-05-03",
}
print(first_example)

{'link': 'https:\\/\\/en.wikibooks.org\\/wiki\\/Accordion\\/Right_hand', 'question_number': 1, 'flores_passage': "Make sure your hand is as relaxed as possible while still hitting all the notes correctly - also try not to make much extraneous motion with your fingers. This way, you will tire yourself out as little as possible. Remember there's no need to hit the keys with a lot of force for extra volume like on the piano. On the accordion, to get extra volume, you use the bellows with more pressure or speed.", 'question': 'According to the passage, what would not be considered an accurate tip for successfully playing the accordion?', 'mc_answer1': 'For additional volume, increase the force with which you hit the keys', 'mc_answer2': 'Keep unnecessary movement to a minimum in order to preserve your stamina', 'mc_answer3': 'Be mindful of hitting the notes while maintaining a relaxed hand', 'mc_answer4': 'Increase the speed with which you operate the bellows to achieve extra volume', 'cor

In [55]:
# Path of the combined Belebele training file.
# Only `combined_train_file` is bound here: `datapath` keeps pointing at the Belebele
# directory instead of being silently rebound to this file by a chained assignment.
combined_train_file='datapath/reading-comprehension/Belebele/train.jsonl'

### 9. Natural language Inference

- we perform cross-lingual transfer on translate-test langs from eng_Latn

In [70]:
# Load the metadata for the NLI task from JSON.
# The context manager closes the file even when json.load raises,
# which the manual open()/close() pair did not guarantee.
with open('metadata/nli_metadata.json') as f:
    metadata = json.load(f)

In [71]:
# Transpose so that every top-level key of the metadata JSON becomes one row.
all_lang = pd.DataFrame(metadata).transpose()
all_lang.head(10)

Unnamed: 0,lang,code,langgroup
lmo_Latn,lombard,lmo_Latn,italian
eng_Latn,English,eng_Latn,English
ita_Latn,italian,ita_Latn,italian
fur_Latn,friulian,fur_Latn,italian
scn_Latn,sicilian,scn_Latn,italian
srd_Latn,sardinian,srd_Latn,italian
vec_Latn,venetian,vec_Latn,italian
azb_Arab,south,azb_Arab,azarbaijani
azj_Latn,north,azj_Latn,azarbaijani
tur_Latn,Turkish,tur_Latn,azarbaijani


In [59]:
# Language codes evaluated for NLI (order preserved from the original single-line list).
all_eval_langs = [
    "eng_Latn", "lmo_Latn", "ita_Latn", "fur_Latn", "scn_Latn", "srd_Latn",
    "vec_Latn", "azb_Arab", "azj_Latn", "tur_Latn", "kmr_Latn", "ckb_Arab",
    "nno_Latn", "nob_Latn", "lim_Latn", "ltz_Latn", "nld_Latn", "lvs_Latn",
    "ltg_Latn", "acm_Arab", "acq_Arab", "aeb_Arab", "ajp_Arab", "apc_Arab",
    "arb_Arab", "ars_Arab", "ary_Arab", "arz_Arab", "kab_Latn", "asm_Beng",
    "ben_Beng", "lij_Latn", "oci_Latn", "yue_Hant", "zho_Hans", "zho_Hant",
    "glg_Latn", "spa_Latn", "por_Latn", "nso_Latn", "sot_Latn",
]

In [60]:
# Display the NLI evaluation language codes defined above.
print(all_eval_langs)

['eng_Latn', 'lmo_Latn', 'ita_Latn', 'fur_Latn', 'scn_Latn', 'srd_Latn', 'vec_Latn', 'azb_Arab', 'azj_Latn', 'tur_Latn', 'kmr_Latn', 'ckb_Arab', 'nno_Latn', 'nob_Latn', 'lim_Latn', 'ltz_Latn', 'nld_Latn', 'lvs_Latn', 'ltg_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'ajp_Arab', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'kab_Latn', 'asm_Beng', 'ben_Beng', 'lij_Latn', 'oci_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'glg_Latn', 'spa_Latn', 'por_Latn', 'nso_Latn', 'sot_Latn']


In [61]:
# Language whose configuration is loaded for training in the next cell.
train_lang = "eng_Latn"

In [67]:
# Load the NLI data for the training language and inspect its splits and label schema.
lang = train_lang
dataset = load_dataset(
    "scripts/nli/dialect_nli.py",
    lang,
    cache_dir=CACHE_DIR,
)
print(dataset)
print(dataset["train"].features)

Reusing dataset dialect_nli (.cache/dialect_nli/eng_Latn/1.1.0/b69815628a902151a9f2b158e6be8fabf359868aa4a25c29c09ff689455041b9)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 363.58it/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 392702
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 5010
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 2490
    })
})
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=3, names=['entailment', 'neutral', 'contradiction'], id=None)}





In [69]:
# Load the NLI data for one evaluation language and inspect the features of its test split.
lang = "lmo_Latn"
dataset = load_dataset(
    "scripts/nli/dialect_nli.py",
    lang,
    cache_dir=CACHE_DIR,
)
print(dataset)
print(dataset["test"].features)

Reusing dataset dialect_nli (.cache/dialect_nli/lmo_Latn/1.1.0/b69815628a902151a9f2b158e6be8fabf359868aa4a25c29c09ff689455041b9)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 963.69it/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 0
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 5010
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 0
    })
})
{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=3, names=['entailment', 'neutral', 'contradiction'], id=None)}





### 10. Machine Translation