In [122]:
import regex as re
import pandas as pd
from datasets import load_dataset

dataset = load_dataset("coastalcph/tydi_xor_rc")
languages = ['ar', 'ko', 'te']

## Unigram

In [123]:
from collections import Counter

class UnigramModel:
    def __init__(self, languages):
        self.languages = languages
        self.counters = {}
        self.probabilities = {}

        for lang in languages:
            self.counters[lang] = Counter()
            self.probabilities[lang] = {}

    @staticmethod
    def tokenize(text):
        return re.findall(r'\b\w+\b', text.lower())

    def build(self, dataset):
        # tokenize 
        for split in dataset.keys():
            for lang in self.languages:
                lang_data = dataset[split].filter(lambda x: x['lang'] == lang)
                
                for item in lang_data:
                    tokens = self.tokenize(item['question'])
                    self.counters[lang].update(tokens)

        # probabilities
        for lang in self.languages:
            total_count = sum(self.counters[lang].values())
            for word, count in self.counters[lang].items():
                self.probabilities[lang][word] = count / total_count

    def word_probability(self, lang, word):
        return self.probabilities.get(lang, {}).get(word, 0.0)


In [124]:
unigram = UnigramModel(languages)
unigram.build(dataset)

for lang in languages:
    print(f"Top 5 probabilities for {lang}:")
    probs = unigram.probabilities[lang]

    top_5 = sorted(probs.items(), key=lambda x: x[1], reverse=True)[:5]
    for word, prob in top_5:
        print(f"{word}: {prob:.4f}")
        
    print("---------------------------")
    

Top 5 probabilities for ar:
من: 0.0372
في: 0.0363
متى: 0.0319
ما: 0.0278
هو: 0.0221
---------------------------
Top 5 probabilities for ko:
가장: 0.0436
무엇인가: 0.0421
언제: 0.0279
어디인가: 0.0189
몇: 0.0187
---------------------------
Top 5 probabilities for te:
ఎవరు: 0.0348
ఏది: 0.0268
ఏ: 0.0236
ఎన్ని: 0.0191
ఎప్పుడు: 0.0185
---------------------------


#### Perplexity

In [125]:
import math

def compute_perplexity(model, dataset, lang):
    split_data = dataset['validation'].filter(lambda x: x['lang'] == lang)
    total_log_prob = 0
    total_words = 0

    for item in split_data:
        tokens = model.tokenize(item['question'])
        for word in tokens:
            prob = model.word_probability(lang, word)
            if prob == 0:
                prob = 1e-6
            total_log_prob += math.log2(prob)
            total_words += 1

    avg_neg_log_prob = - total_log_prob / total_words
    perplexity = 2 ** avg_neg_log_prob

    return perplexity


In [126]:
print("Unigram")
for lang in languages:
    perplexity = compute_perplexity(unigram, dataset, lang)
    print(f"Perplexity for {lang}: {perplexity:.6f}")

Unigram
Perplexity for ar: 1207.776167
Perplexity for ko: 1100.799844
Perplexity for te: 699.134852
