### Data Fetch

In [18]:
import pandas as pd
from datasets import load_dataset
from collections import defaultdict
from tqdm import tqdm
import regex as re

dataset = load_dataset("coastalcph/tydi_xor_rc")

languages = ['ar', 'ko', 'te']
train_dataset = dataset["train"].filter(lambda example: example['lang'] in languages)
val_dataset = dataset["validation"].filter(lambda example: example['lang'] in languages)

In [19]:
ko_questions = list(train_dataset.filter(lambda x: x["lang"] == "ko")["question"])
ar_questions =  list(train_dataset.filter(lambda x: x["lang"] == "ar")["question"])
te_questions = list(train_dataset.filter(lambda x: x["lang"] == "te")["question"])
en_context =  list(train_dataset["context"])

ko_questions_val = list(val_dataset.filter(lambda x: x["lang"] == "ko")["question"])
ar_questions_val =  list(val_dataset.filter(lambda x: x["lang"] == "ar")["question"])
te_questions_val = list(val_dataset.filter(lambda x: x["lang"] == "te")["question"])
en_context_val =  list(val_dataset["context"])

def UnfoldSentences(l):
    return [re.findall(r'\w+', sentence) for sentence in l]
    

ko_questions = UnfoldSentences(ko_questions)
ar_questions = UnfoldSentences(ar_questions)
te_questions = UnfoldSentences(te_questions)
en_context= UnfoldSentences(en_context)

ko_questions_val = UnfoldSentences(ko_questions_val)
ar_questions_val = UnfoldSentences(ar_questions_val)
te_questions_val = UnfoldSentences(te_questions_val)
en_context_val = UnfoldSentences(en_context_val)


In [20]:
import math

class Trigram:
    model = {}
    V = set()
    
    def __init__(self, name):
        self.model = {}
        self.name = name

    def Train(self, texts):
        self.V = set()
        for t in tqdm(range(len(texts))):
            self.V.add(texts[t][0])
            self.V.add(texts[t][1])
            for i in range(len(texts[t]) - 2):
                contextKey = (texts[t][i], texts[t][i+1])
                followupKey = texts[t][i+2]
                self.V.add(followupKey)
                
                if contextKey in self.model.keys():
                    if followupKey in self.model[contextKey].keys():
                        self.model[contextKey][followupKey] += 1
                    else:
                        self.model[contextKey][followupKey] = 1
                else:
                    self.model[contextKey] = {followupKey : 1}
    
    def Print(self):
        for context in self.model.keys():
            options = self.model[context]
            print(f"{context[0]} {context[1]} : {options}")

    def P(self, context, followup, smoothing="None"):
        if smoothing == "None":
            if context in self.model.keys() and followup in self.model[context].keys():
                    return self.model[context][followup]/sum(self.model[context].values()) 
            else:
                return 0 

        elif smoothing == "Laplace":
            V = len(self.V)
            trigramProb = self.model[context][followup] if context in self.model.keys() and followup in self.model[context] else 0
            bigramProb = sum(self.model[context].values()) if context in self.model.keys() else 0

            return (trigramProb + 1)/(bigramProb + V)
    
    def Perplexity(self, wordset, smoothing="None"):
        Sum = 0
        for i in range(len(wordset) - 2):
            inside = self.P((wordset[i], wordset[i+1]), wordset[i+2], smoothing)
            
            Sum += math.log(inside) if inside > 0 else float("-inf")

        return math.exp((-1/len(wordset)) * Sum)

    def AvgPerplexity(self, sentences, smoothing="None"):
        i = 0
        p = 0
        for sentence in sentences:
            p += self.Perplexity(sentence, smoothing)
            i+=1
        return p/i
    
class Bigram:
    model = {}
    V = set()
    
    def __init__(self, name):
        self.model = {}
        self.name = name

    def Train(self, texts):
        self.V = set()
        for t in tqdm(range(len(texts))):
            self.V.add(texts[t][0])
            for i in range(len(texts[t]) - 1):
                contextKey = (texts[t][i],)
                followupKey = texts[t][i+1]
                self.V.add(followupKey)
                if contextKey in self.model.keys():
                    if followupKey in self.model[contextKey].keys():
                        self.model[contextKey][followupKey] += 1
                    else:
                        self.model[contextKey][followupKey] = 1
                else:
                    self.model[contextKey] = {followupKey : 1}
    
    def Print(self):
        for context in self.model.keys():
            options = self.model[context]
            print(f"{context[0]} : {options}")

    def P(self, context, followup, smoothing="None"):
        if smoothing == "None":
            if context in self.model.keys() and followup in self.model[context].keys():
                    return self.model[context][followup]/sum(self.model[context].values()) 
            else:
                return 0 

        elif smoothing == "Laplace":
            V = len(self.V)
            trigramProb = self.model[context][followup] if context in self.model.keys() and followup in self.model[context] else 0
            bigramProb = sum(self.model[context].values()) if context in self.model.keys() else 0

            return (trigramProb + 1)/(bigramProb + V)
    
    def Perplexity(self, wordset, smoothing="None"):
        Sum = 0
        for i in range(len(wordset) - 1):
            inside = self.P((wordset[i]), wordset[i+1], smoothing)
            
            Sum += math.log(inside) if inside > 0 else float("-inf")

        return math.exp((-1/len(wordset)) * Sum)

    def AvgPerplexity(self, sentences, smoothing="None"):
        i = 0
        p = 0
        for sentence in sentences:
            p += self.Perplexity(sentence, smoothing)
            i+=1
        return p/i
    

In [21]:
ko_trigram = Trigram("Korean Trigram")
ar_trigram = Trigram("Arabic Trigram")
te_trigram = Trigram("Telegu Trigram")
en_trigram = Trigram("English Trigram")
ko_bigram = Bigram("Korean Bigram")
ar_bigram = Bigram("Arabic Bigram")
te_bigram = Bigram("Telegu Bigram")
en_bigram = Bigram("English Bigram")


ko_trigram.Train(ko_questions)
ar_trigram.Train(ar_questions)
te_trigram.Train(te_questions)
en_trigram.Train(en_context)

ko_bigram.Train(ko_questions)
ar_bigram.Train(ar_questions)
te_bigram.Train(te_questions)
en_bigram.Train(en_context)

            


100%|██████████| 2422/2422 [00:00<00:00, 688438.89it/s]
100%|██████████| 2558/2558 [00:00<00:00, 649378.38it/s]
100%|██████████| 1355/1355 [00:00<00:00, 552150.19it/s]
100%|██████████| 6335/6335 [00:00<00:00, 23631.72it/s]
100%|██████████| 2422/2422 [00:00<00:00, 678189.75it/s]
100%|██████████| 2558/2558 [00:00<00:00, 603704.12it/s]
100%|██████████| 1355/1355 [00:00<00:00, 549056.32it/s]
100%|██████████| 6335/6335 [00:00<00:00, 31022.27it/s]


In [22]:
def PrintPerplexity(model, val, cont):
    print("-"*30)
    print(model.name)
    print(f"Training - {model.AvgPerplexity(cont, smoothing='None')}")
    print(f"Validation - {model.AvgPerplexity(val, smoothing='None')}")
    print(f"Training w/Smoothing - {model.AvgPerplexity(cont, smoothing='Laplace')}")
    print(f"Validating w/Smoothing - {model.AvgPerplexity(val, smoothing='Laplace')}")

PrintPerplexity(ko_trigram, ko_questions_val, ko_questions)
PrintPerplexity(ar_trigram, ar_questions_val, ar_questions)
PrintPerplexity(te_trigram, te_questions_val, te_questions)
PrintPerplexity(en_trigram, en_context_val, en_context)
print("-------------------------------------------------------------------")
PrintPerplexity(ko_bigram, ko_questions_val, ko_questions)
PrintPerplexity(ar_bigram, ar_questions_val, ar_questions)
PrintPerplexity(te_bigram, te_questions_val, te_questions)
PrintPerplexity(en_bigram, en_context_val, en_context)

#BIGRAM NO SMOOTH
#BIGRAM SMOOTH
#TRIGRAM NO SMOOTH
#TRIGRAM SMOOTH

# print(AvgPerplexity(ko_trigram, ko_questions_val))
# print(AvgPerplexity(ko_trigram, ko_questions_val))

------------------------------
Korean Trigram
Training - 1.3724994620448698
Validation - inf
Training w/Smoothing - 96.5326889937332
Validating w/Smoothing - 144.74971104262917
------------------------------
Arabic Trigram
Training - 1.7720454974292472
Validation - inf
Training w/Smoothing - 212.542771765783
Validating w/Smoothing - 334.5481555198457
------------------------------
Telegu Trigram
Training - 1.2122110131502204
Validation - inf
Training w/Smoothing - 91.6131110954308
Validating w/Smoothing - 155.17222614374944
------------------------------
English Trigram
Training - 3.6279745563611394
Validation - inf
Training w/Smoothing - 12306.720430434365
Validating w/Smoothing - 22515.137953758822
-------------------------------------------------------------------
------------------------------
Korean Bigram
Training - inf
Validation - inf
Training w/Smoothing - 778.4789476176213
Validating w/Smoothing - 775.2594398868778
------------------------------
Arabic Bigram
Training - inf
V