### Data Fetch

In [1]:
import pandas as pd
from datasets import load_dataset
from collections import defaultdict
from tqdm import tqdm
import regex as re

dataset = load_dataset("coastalcph/tydi_xor_rc")

languages = ['ar', 'ko', 'te']
train_dataset = dataset["train"].filter(lambda example: example['lang'] in languages)
val_dataset = dataset["validation"].filter(lambda example: example['lang'] in languages)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ko_questions = list(train_dataset.filter(lambda x: x["lang"] == "ko")["question"])
ar_questions =  list(train_dataset.filter(lambda x: x["lang"] == "ar")["question"])
te_questions = list(train_dataset.filter(lambda x: x["lang"] == "te")["question"])
en_context =  list(train_dataset["context"])

ko_questions_val = list(val_dataset.filter(lambda x: x["lang"] == "ko")["question"])
ar_questions_val =  list(val_dataset.filter(lambda x: x["lang"] == "ar")["question"])
te_questions_val = list(val_dataset.filter(lambda x: x["lang"] == "te")["question"])
en_context_val =  list(val_dataset["context"])

def UnfoldSentences(l):
    return [re.findall(r'\w+', sentence) for sentence in l]
    

ko_questions = UnfoldSentences(ko_questions)
ar_questions = UnfoldSentences(ar_questions)
te_questions = UnfoldSentences(te_questions)
en_context= UnfoldSentences(en_context)

ko_questions_val = UnfoldSentences(ko_questions_val)
ar_questions_val = UnfoldSentences(ar_questions_val)
te_questions_val = UnfoldSentences(te_questions_val)
en_context_val = UnfoldSentences(en_context_val)


In [6]:
import math

class Trigram:
    model = {}
    V = set()
    
    def __init__(self, name):
        self.model = {}
        self.name = name

    def Train(self, texts):
        self.V = set()
        for t in tqdm(range(len(texts)), desc=f"Training {self.name}"):
            self.V.add(texts[t][0])
            self.V.add(texts[t][1])
            for i in range(len(texts[t]) - 2):
                contextKey = (texts[t][i], texts[t][i+1])
                followupKey = texts[t][i+2]
                self.V.add(followupKey)
                
                if contextKey in self.model.keys():
                    if followupKey in self.model[contextKey].keys():
                        self.model[contextKey][followupKey] += 1
                    else:
                        self.model[contextKey][followupKey] = 1
                else:
                    self.model[contextKey] = {followupKey : 1}
    
    def Print(self):
        for context in self.model.keys():
            options = self.model[context]
            print(f"{context[0]} {context[1]} : {options}")

    def P(self, context, followup, smoothing="None"):
        if smoothing == "None":
            if context in self.model.keys() and followup in self.model[context].keys():
                    return self.model[context][followup]/sum(self.model[context].values()) 
            else:
                return 0 

        elif smoothing == "Laplace":
            V = len(self.V)
            trigramProb = self.model[context][followup] if context in self.model.keys() and followup in self.model[context] else 0
            bigramProb = sum(self.model[context].values()) if context in self.model.keys() else 0

            return (trigramProb + 1)/(bigramProb + V)
    
    def Perplexity(self, wordset, smoothing="None"):
        Sum = 0
        for i in range(len(wordset) - 2):
            inside = self.P((wordset[i], wordset[i+1]), wordset[i+2], smoothing)
            
            Sum += math.log(inside) if inside > 0 else float("-inf")

        return math.exp((-1/len(wordset)) * Sum)

    def AvgPerplexity(self, sentences, smoothing="None"):
        i = 0
        p = 0
        for sentence in sentences:
            p += self.Perplexity(sentence, smoothing)
            i+=1
        return p/i
    
class Bigram:
    model = {}
    V = set()
    
    def __init__(self, name):
        self.model = {}
        self.name = name

    def Train(self, texts):
        self.V = set()
        for t in tqdm(range(len(texts)), desc=f"Training {self.name}"):
            self.V.add(texts[t][0])
            for i in range(len(texts[t]) - 1):
                contextKey = (texts[t][i],)
                followupKey = texts[t][i+1]
                self.V.add(followupKey)
                if contextKey in self.model.keys():
                    if followupKey in self.model[contextKey].keys():
                        self.model[contextKey][followupKey] += 1
                    else:
                        self.model[contextKey][followupKey] = 1
                else:
                    self.model[contextKey] = {followupKey : 1}
    
    def Print(self):
        for context in self.model.keys():
            options = self.model[context]
            print(f"{context[0]} : {options}")

    def P(self, context, followup, smoothing="None"):
        if smoothing == "None":
            if context in self.model.keys() and followup in self.model[context].keys():
                    return self.model[context][followup]/sum(self.model[context].values()) 
            else:
                return 0 

        elif smoothing == "Laplace":
            V = len(self.V)
            trigramProb = self.model[context][followup] if context in self.model.keys() and followup in self.model[context] else 0
            bigramProb = sum(self.model[context].values()) if context in self.model.keys() else 0

            return (trigramProb + 1)/(bigramProb + V)
    
    def Perplexity(self, wordset, smoothing="None"):
        Sum = 0
        for i in range(len(wordset) - 1):
            inside = self.P((wordset[i]), wordset[i+1], smoothing)
            
            Sum += math.log(inside) if inside > 0 else float("-inf")

        return math.exp((-1/len(wordset)) * Sum)

    def AvgPerplexity(self, sentences, smoothing="None"):
        i = 0
        p = 0
        for sentence in sentences:
            p += self.Perplexity(sentence, smoothing)
            i+=1
        return p/i
    

class Unigram:
    model = {}
    V = set()
    
    def __init__(self, name):
        self.model = {}
        self.name = name

    def Train(self, texts):
        self.V = set()
        for sentence in tqdm(texts, desc=f"Training {self.name}"):
            for w in range(len(sentence)):
                self.V.add(w)
                if sentence[w] in self.model.keys():
                    self.model[sentence[w]] += 1
                else:
                    self.model[sentence[w]] = 1
    
    def Print(self):
        for context in self.model.keys():
            options = self.model[context]
            print(f"{context[0]} : {options}")

    def P(self, followup):
        if followup in self.model.keys():
            return self.model[followup] / sum(self.model.values())
        else:
            return 0
    
    def Perplexity(self, wordset):
        Sum = 0
        for i in range(len(wordset)):
            inside = self.P((wordset[i]))
            
            Sum += math.log(inside) if inside > 0 else float("-inf")

        return math.exp((-1/len(wordset)) * Sum)

    def AvgPerplexity(self, sentences):
        i = 0
        p = 0
        for sentence in sentences:
            p += self.Perplexity(sentence)
            i+=1
        return p/i

In [7]:
ko_trigram = Trigram("Korean Trigram")
ar_trigram = Trigram("Arabic Trigram")
te_trigram = Trigram("Telugu Trigram")
en_trigram = Trigram("English Trigram")

ko_bigram = Bigram("Korean Bigram")
ar_bigram = Bigram("Arabic Bigram")
te_bigram = Bigram("Telugu Bigram")
en_bigram = Bigram("English Bigram")

ko_unigram = Unigram("Korean Unigram")
ar_unigram = Unigram("Arabic Unigram")
te_unigram = Unigram("Telugu Unigram")
en_unigram = Unigram("English Unigram")


ko_trigram.Train(ko_questions)
ar_trigram.Train(ar_questions)
te_trigram.Train(te_questions)
en_trigram.Train(en_context)

print("-"*30)

ko_bigram.Train(ko_questions)
ar_bigram.Train(ar_questions)
te_bigram.Train(te_questions)
en_bigram.Train(en_context)

print("-"*30)

ko_unigram.Train(ko_questions)
ar_unigram.Train(ar_questions)
te_unigram.Train(te_questions)
en_unigram.Train(en_context)

            


Training Korean Trigram: 100%|██████████| 2422/2422 [00:00<00:00, 345695.37it/s]
Training Arabic Trigram: 100%|██████████| 2558/2558 [00:00<00:00, 172906.64it/s]
Training Telugu Trigram: 100%|██████████| 1355/1355 [00:00<00:00, 257243.56it/s]
Training English Trigram: 100%|██████████| 6335/6335 [00:00<00:00, 10484.91it/s]


------------------------------


Training Korean Bigram: 100%|██████████| 2422/2422 [00:00<00:00, 331872.08it/s]
Training Arabic Bigram: 100%|██████████| 2558/2558 [00:00<00:00, 195613.87it/s]
Training Telugu Bigram: 100%|██████████| 1355/1355 [00:00<00:00, 339462.54it/s]
Training English Bigram: 100%|██████████| 6335/6335 [00:00<00:00, 10521.54it/s]


------------------------------


Training Korean Unigram: 100%|██████████| 2422/2422 [00:00<00:00, 431968.55it/s]
Training Arabic Unigram: 100%|██████████| 2558/2558 [00:00<00:00, 336945.85it/s]
Training Telugu Unigram: 100%|██████████| 1355/1355 [00:00<00:00, 442519.81it/s]
Training English Unigram: 100%|██████████| 6335/6335 [00:00<00:00, 29737.55it/s]


In [9]:
def PrintPerplexitySmoothed(model, val):
    print("-"*30)
    print(model.name)
    print(f"Validating w/Smoothing - {model.AvgPerplexity(val, smoothing='Laplace')}")

def PrintPerplexity(model, val):
    print("-"*30)
    print(model.name)
    print(f"Validating - {model.AvgPerplexity(val)}")

# PrintPerplexitySmoothed(ko_trigram, ko_questions_val)
# PrintPerplexitySmoothed(ar_trigram, ar_questions_val)
# PrintPerplexitySmoothed(te_trigram, te_questions_val)
# PrintPerplexitySmoothed(en_trigram, en_context_val)

print("-"*30)

PrintPerplexitySmoothed(ko_bigram, ko_questions_val)
PrintPerplexitySmoothed(ar_bigram, ar_questions_val)
PrintPerplexitySmoothed(te_bigram, te_questions_val)
PrintPerplexitySmoothed(en_bigram, en_context_val)

print("-"*30)

PrintPerplexity(ko_unigram, ko_questions_val)
PrintPerplexity(ar_unigram, ar_questions_val)
PrintPerplexity(te_unigram, te_questions_val)
PrintPerplexity(en_unigram, en_context_val)

#BIGRAM NO SMOOTH
#BIGRAM SMOOTH
#TRIGRAM NO SMOOTH
#TRIGRAM SMOOTH

# print(AvgPerplexity(ko_trigram, ko_questions_val))
# print(AvgPerplexity(ko_trigram, ko_questions_val))

------------------------------
------------------------------
Korean Bigram
Validating w/Smoothing - 775.2594398868778
------------------------------
Arabic Bigram
Validating w/Smoothing - 1323.6237265456155
------------------------------
Telugu Bigram
Validating w/Smoothing - 637.4578359556473
------------------------------
English Bigram
Validating w/Smoothing - 40269.97153511367
------------------------------
------------------------------
Korean Unigram
Validating - inf
------------------------------
Arabic Unigram
Validating - inf
------------------------------
Telugu Unigram
Validating - inf
------------------------------
English Unigram
Validating - inf
