### Data Fetch

In [1]:
import regex as re
import pandas as pd
from datasets import load_dataset

# Language codes kept for the rest of the notebook.
languages = ['ar', 'ko', 'te', 'en']

dataset = load_dataset("coastalcph/tydi_xor_rc")

# Apply the same row filter to both splits: keep a row only when its
# "lang" field is one of the codes listed in `languages`.
train_dataset, val_dataset = (
    dataset[split_name].filter(lambda example: example['lang'] in languages)
    for split_name in ("train", "validation")
)

  from .autonotebook import tqdm as notebook_tqdm


#### Tokenize

In [2]:
def _questions_in(split, lang_code):
    """Return, as a list, the "question" column of the rows in `split` whose "lang" equals `lang_code`."""
    matching_rows = split.filter(lambda x: x["lang"] == lang_code)
    return list(matching_rows["question"])


# Training split: questions are collected per language, while the
# "context" column is taken from every row of the (already filtered) split.
ar_questions = _questions_in(train_dataset, "ar")
ko_questions = _questions_in(train_dataset, "ko")
te_questions = _questions_in(train_dataset, "te")
en_context = list(train_dataset["context"])

# Validation split: same extraction as above.
ar_questions_val = _questions_in(val_dataset, "ar")
ko_questions_val = _questions_in(val_dataset, "ko")
te_questions_val = _questions_in(val_dataset, "te")
en_context_val = list(val_dataset["context"])

def UnfoldSentences(l):
    r"""Tokenize a collection of sentences into lists of words.

    Args:
        l: Iterable of strings, one sentence per item.

    Returns:
        A list with one entry per input sentence, in the same order. Each
        entry is the list of non-overlapping ``\w+`` matches found in that
        sentence, in order of appearance; characters that ``\w`` does not
        match act as separators and are discarded. A sentence with no
        match yields an empty list.
    """
    word_pattern = re.compile(r'\w+')
    tokenized = []
    for sentence in l:
        tokenized.append(word_pattern.findall(sentence))
    return tokenized

# Replace each list of raw strings with its tokenized form (one list of
# word tokens per sentence). The names are rebound, so after this point
# they no longer refer to the raw text.
ar_questions, ko_questions, te_questions, en_context = map(
    UnfoldSentences,
    (ar_questions, ko_questions, te_questions, en_context),
)

ar_questions_val, ko_questions_val, te_questions_val, en_context_val = map(
    UnfoldSentences,
    (ar_questions_val, ko_questions_val, te_questions_val, en_context_val),
)

## Unigram

In [3]:
from collections import Counter

class UnigramModel:
    """Per-language unigram model based on relative word frequencies.

    Attributes:
        languages: The language codes passed to the constructor.
        counters: Maps each language code to a ``Counter`` of word counts.
        probabilities: Maps each language code to a ``{word: probability}``
            dict, filled by :meth:`build`.
    """

    def __init__(self, languages):
        """Create empty count and probability tables for every language in `languages`."""
        self.languages = languages
        # `languages` is iterated exactly once here; the probability table
        # takes its keys from the counters built in that single pass.
        self.counters = {code: Counter() for code in languages}
        self.probabilities = {code: {} for code in self.counters}

    def build(self, tokenized_data):
        """Accumulate word counts and refresh the probability tables.

        Args:
            tokenized_data: Mapping from language code to an iterable of
                token lists. Languages missing from the mapping contribute
                no counts.

        Counts are added to whatever the counters already hold, so calling
        this method again accumulates further data.
        """
        # Phase 1: count words, sentence by sentence, for every language.
        for code in self.languages:
            word_counts = self.counters[code]
            for sentence_tokens in tokenized_data.get(code, []):
                word_counts.update(sentence_tokens)

        # Phase 2: relative frequency = word count / total count of the language.
        for code in self.languages:
            word_counts = self.counters[code]
            total_count = sum(word_counts.values())
            self.probabilities[code].update(
                (word, count / total_count) for word, count in word_counts.items()
            )

    def word_probability(self, lang, word):
        """Return the stored probability of `word` in `lang`, or 0.0 when either is unknown."""
        lang_table = self.probabilities.get(lang)
        if lang_table is None:
            return 0.0
        return lang_table.get(word, 0.0)


In [4]:
# Tokenized corpora keyed by language code.
tokenized_train = dict(
    ar=ar_questions,
    ko=ko_questions,
    te=te_questions,
    en=en_context,
)

tokenized_val = dict(
    ar=ar_questions_val,
    ko=ko_questions_val,
    te=te_questions_val,
    en=en_context_val,
)

# Fit the unigram model on the training corpora only.
unigram = UnigramModel(languages)
unigram.build(tokenized_train)

# Report the five most probable words of each language.
for lang in languages:
    print(f"Top 5 probabilities for {lang}:")
    probs = unigram.probabilities[lang]

    # Stable descending sort on the probability; ties keep insertion order.
    ranked = sorted(probs.items(), key=lambda entry: entry[1], reverse=True)
    top_5 = ranked[:5]
    for word, prob in top_5:
        print(f"{word}: {prob:.4f}")

    print("---------------------------")
    

Top 5 probabilities for ar:
في: 0.0366
من: 0.0362
متى: 0.0331
ما: 0.0273
هو: 0.0216
---------------------------
Top 5 probabilities for ko:
가장: 0.0444
무엇인가: 0.0419
언제: 0.0283
몇: 0.0197
어디인가: 0.0192
---------------------------
Top 5 probabilities for te:
ఎవరు: 0.0356
ఏది: 0.0250
ఎన్ని: 0.0215
ఎప్పుడు: 0.0200
ఏ: 0.0187
---------------------------
Top 5 probabilities for en:
the: 0.0668
of: 0.0411
and: 0.0318
in: 0.0266
to: 0.0177
---------------------------


#### Perplexity

In [5]:
import math

def compute_perplexity(model, tokenized_data, lang, unk_prob=1e-6):
    """Compute the per-word perplexity of `model` on tokenized text.

    Perplexity is ``2 ** (-(1/N) * sum(log2 p(w)))`` over all ``N`` words
    in `tokenized_data`, where ``p(w)`` comes from
    ``model.word_probability(lang, w)``.

    Args:
        model: Object exposing ``word_probability(lang, word)``.
        tokenized_data: Iterable of token sequences (one per sentence).
        lang: Language code forwarded to ``model.word_probability``.
        unk_prob: Probability substituted when the model returns exactly 0
            for a word, so that ``log2`` stays defined. Must lie in
            ``(0, 1]``. Defaults to ``1e-6``.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If `unk_prob` is outside ``(0, 1]`` or if
            `tokenized_data` contains no words at all (perplexity is
            undefined in that case).
    """
    if not 0.0 < unk_prob <= 1.0:
        raise ValueError(f"unk_prob must be in (0, 1], got {unk_prob!r}")

    total_log_prob = 0.0
    total_words = 0

    for tokens in tokenized_data:
        for word in tokens:
            prob = model.word_probability(lang, word)
            if prob == 0:
                # Floor only, not smoothing: the probabilities of seen
                # words are not renormalised to make room for this mass.
                prob = unk_prob
            total_log_prob += math.log2(prob)
            total_words += 1

    # Without this guard an empty corpus fails with a bare
    # ZeroDivisionError at the division below.
    if total_words == 0:
        raise ValueError(
            f"cannot compute perplexity for {lang!r}: tokenized_data contains no words"
        )

    avg_neg_log_prob = -total_log_prob / total_words
    return 2 ** avg_neg_log_prob


In [6]:
print("Unigram")

# Score each language's validation tokens against the model's table for
# that same language. `tokenized_val` maps each code to the corresponding
# `*_val` token list.
perplexities = {
    lang: compute_perplexity(unigram, tokenized_val[lang], lang)
    for lang in ("ar", "ko", "te", "en")
}
perplexity_ar = perplexities["ar"]
perplexity_ko = perplexities["ko"]
perplexity_te = perplexities["te"]
perplexity_en = perplexities["en"]

print("Perplexities:")
for lang in ("ko", "ar", "te", "en"):
    print(f"{lang}: {perplexities[lang]:.6f}")

Unigram
Perplexities:
ko: 4542.269394
ar: 3740.785237
te: 2565.343064
en: 2168.677714
