### 🧑‍🏫 Instructions

FLORES 200 dataset

1. dev/ and devtest/ contains the sentences in same order for each language.
2. metadata_dev.tsv and metadata_devtest.tsv contains tab separated metadata for each of the sentences. The sentences are in same order as of dev/<lang>.dev and devtest/<lang>.devtest


In [32]:
%pip install -q -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [33]:
import glob
import json
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import os

# Initialize the tokenizer and trainer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()

# Get all the files with .dev extension in the /flores200_dataset/dev directory
files = glob.glob("inputs/flores200_dataset/dev/*.dev")

# Train the tokenizer on all the files
tokenizer.train(files, trainer)

# Save the trained tokenizer
tokenizer.save("results/tokenizer.json")
print("Tokenizer saved to inputs/tokenizer.json")

# Function to extract language code from file name
def get_language_from_filename(filename):
    return os.path.basename(filename).split('.')[0]

language_vocabularies = {}

# Iterate over files to get the vocabulary for each language
for file in files:
    # Extract language code from the file name
    language_code = get_language_from_filename(file)
    
    # Tokenize the content of the file
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    
    encoded = tokenizer.encode(text)
    unique_tokens = set(encoded.tokens)
    
    # Get the global vocabulary of the tokenizer
    global_vocab = tokenizer.get_vocab()
    
    # Filter the global vocabulary to get only the relevant tokens for this file
    vocab = {token: global_vocab[token] for token in unique_tokens if token in global_vocab}
    language_vocabularies[language_code] = vocab

with open("results/language_vocabularies.json", 'w', encoding='utf-8') as f:
    json.dump(language_vocabularies, f, ensure_ascii=False, indent=4)

print("Language vocabularies saved to results/language_vocabularies.json")





Tokenizer saved to inputs/tokenizer.json
Language vocabularies saved to results/language_vocabularies.json


# 📈 Analysis


In [34]:
# Get length of the tokenizer
vocab_size = len(tokenizer.get_vocab())
print("Vocabulary size:", vocab_size)

Vocabulary size: 30000


In [46]:
def get_basic_stats(vocab):
    return {
        "vocab_size": len(vocab),
        "vocab_size_percentage": len(vocab)/vocab_size*100,
        "average_token_length": sum(len(token) for token in vocab)/len(vocab)
    }

In [47]:
# Get only from the languages we want from inputs/FLORES-200.lang_info.csv (first column)

# Read the language info file
with open("inputs/FLORES-200.lang_info.csv", 'r', encoding='utf-8') as f:
    lines = f.readlines()
    
# Get the languages we want to keep (separated by \t, get last column)
languages = [line.split('\t')[-1].strip() for line in lines]
print(languages)

# Filter the language vocabularies to keep only the languages we want
filtered_language_vocabularies = {language: vocab for language, vocab in language_vocabularies.items() if language in languages}

print("Filtered language vocabularies:" , filtered_language_vocabularies)
print("Number of filtered language vocabularies:" , len(filtered_language_vocabularies))

['lang_script', 'eng_Latn', 'nld_Latn', 'ltz_Latn', 'fra_Latn', 'spa_Latn', 'glg_Latn', 'rus_Cyrl', 'pol_Latn', 'mkd_Cyrl']
Filtered language vocabularies: {'nld_Latn': {'Auckland': 21583, 'ginn': 17331, 'wel': 9618, 'condi': 20299, '1958': 15157, 'Indus': 12895, 'leid': 18994, 'ys': 7245, 'ner': 9174, ':': 31, 'Stardust': 13455, 'ety': 20045, 'person': 9403, 'lat': 7763, 'Riomaggiore': 20578, 'Ara': 12621, 'Strabo': 23662, 'cor': 8284, 'hor': 10492, 'Ili': 18571, 'tran': 12220, 'Pek': 29720, 'ise': 7376, 'ct': 10056, 'Co': 8678, 'à': 155, 'Au': 8145, 'bin': 7747, 'jung': 21829, 'hij': 24899, 've': 6921, 'post': 11926, 'Jamai': 26105, 'fin': 7503, 'Venus': 12665, 'vjet': 22793, 'nog': 12279, 'elk': 29867, 'ev': 11612, '²': 113, 'migr': 23923, 'chu': 7238, 'seri': 14106, 'estre': 20191, 'sur': 8670, 'fes': 12416, 'Massachusetts': 21819, 'ang': 6855, 'Palawan': 20718, 'Casey': 20229, 'doka': 16543, 'Kriegsmarine': 18488, 'syn': 18134, 'Skype': 17731, 'Min': 10451, 'Miami': 21708, 'Lake':

In [49]:
# Get average of token length for filtered languages
for language, vocab in filtered_language_vocabularies.items():
    stats = get_basic_stats(vocab)
    print(f"Language: {language}")
    print(f"  Vocabulary size: {stats['vocab_size']}")
    print(f"  Vocabulary size percentage: {stats['vocab_size_percentage']:.2f}%")
    print(f"  Average token length: {stats['average_token_length']:.2f}")
    

Language: nld_Latn
  Vocabulary size: 3094
  Vocabulary size percentage: 10.31%
  Average token length: 4.07
Language: mkd_Cyrl
  Vocabulary size: 1198
  Vocabulary size percentage: 3.99%
  Average token length: 3.02
Language: rus_Cyrl
  Vocabulary size: 1455
  Vocabulary size percentage: 4.85%
  Average token length: 3.18
Language: glg_Latn
  Vocabulary size: 3155
  Vocabulary size percentage: 10.52%
  Average token length: 4.08
Language: pol_Latn
  Vocabulary size: 2921
  Vocabulary size percentage: 9.74%
  Average token length: 3.83
Language: ltz_Latn
  Vocabulary size: 3141
  Vocabulary size percentage: 10.47%
  Average token length: 3.92
Language: spa_Latn
  Vocabulary size: 3145
  Vocabulary size percentage: 10.48%
  Average token length: 4.06
Language: fra_Latn
  Vocabulary size: 2938
  Vocabulary size percentage: 9.79%
  Average token length: 4.02
Language: eng_Latn
  Vocabulary size: 3272
  Vocabulary size percentage: 10.91%
  Average token length: 4.20
