In [1]:
import pandas as pd
import nltk
import numpy as np
import os.path
import math
import re
from tabulate import tabulate
from nltk.tokenize import RegexpTokenizer 

In [2]:
# Read the corpus from a file
with open('C:/Users/megha/OneDrive/Desktop/train.txt', "r", encoding="utf-8") as file:
    corpus = file.read()

In [3]:
len(corpus)

438920

In [4]:
import re

def tokenize_and_preprocess(corpus):
    # Tokenization: Split text into words
    tokens = re.findall(r'\b\w+\b', corpus.lower())  # Using regex to tokenize and convert to lowercase
    
    # Add <s> tagging at the beginning of each sentence
    tagged_tokens = ['<s>']  # Initialize with the start of sentence tag
    for token in tokens:
        tagged_tokens.append(token)
        if token.endswith('.') or token.endswith('!') or token.endswith('?'):
            tagged_tokens.append('<s>')  # Add <s> after sentence-ending punctuation
    
    return tagged_tokens

training_tokens = tokenize_and_preprocess(corpus)
print(training_tokens)




In [5]:
print(len(training_tokens))

80301


In [6]:
def build_unigram_freq_dict(tokens):
    unigram_freq_dict = {}
    for token in tokens:
        if token in unigram_freq_dict:
            unigram_freq_dict[token] += 1
        else:
            unigram_freq_dict[token] = 1
    return unigram_freq_dict

def build_bigram_freq_dict(tokens):
    bigram_freq_dict = {}
    for i in range(len(tokens) - 1):
        bigram = (tokens[i], tokens[i + 1])
        if bigram in bigram_freq_dict:
            bigram_freq_dict[bigram] += 1
        else:
            bigram_freq_dict[bigram] = 1
    return bigram_freq_dict

In [7]:
def calculate_unigram_probabilities(unigram_freq_dict, total_tokens):
    unigram_probs = {}
    for word, freq in unigram_freq_dict.items():
        probability = freq / total_tokens
        unigram_probs[word] = probability
    return unigram_probs

def calculate_bigram_probabilities(bigram_freq_dict, unigram_freq_dict):
    bigram_probs = {}
    for bigram, freq in bigram_freq_dict.items():
        word1, word2 = bigram
        unigram_freq_word1 = unigram_freq_dict[word1]
        if unigram_freq_word1 > 0:
            probability = freq / unigram_freq_word1
            bigram_probs[bigram] = probability
    return bigram_probs


In [8]:
unigram_freq_dict = build_unigram_freq_dict(training_tokens)
bigram_freq_dict = build_bigram_freq_dict(training_tokens)

total_tokens = len(training_tokens)
unigram_probs = calculate_unigram_probabilities(unigram_freq_dict, total_tokens)
bigram_probs = calculate_bigram_probabilities(bigram_freq_dict, unigram_freq_dict)


In [9]:
unigram_table = []
bigram_table = []
for word, freq in unigram_freq_dict.items():
    probability = unigram_probs[word]
    unigram_table.append([word, freq, probability])

# Bigram table
for bigram, freq in bigram_freq_dict.items():
    word1, word2 = bigram
    probability = bigram_probs.get(bigram, 0)  
    bigram_table.append([word1, word2, freq, probability])
    
num_rows_to_display = 10
top_rows_unigram = unigram_table[:num_rows_to_display]
top_rows_bigram = bigram_table[:num_rows_to_display]

In [10]:
print("Unigram Frequencies and Probabilities:")
print(tabulate(top_rows_unigram, headers=["Unigram", "Frequency", "Probability"], tablefmt="grid"))

Unigram Frequencies and Probabilities:
+-----------+-------------+---------------+
| Unigram   |   Frequency |   Probability |
| <s>       |           1 |   1.24531e-05 |
+-----------+-------------+---------------+
| i         |        1722 |   0.0214443   |
+-----------+-------------+---------------+
| booked    |          86 |   0.00107097  |
+-----------+-------------+---------------+
| two       |         129 |   0.00160646  |
+-----------+-------------+---------------+
| rooms     |         203 |   0.00252799  |
+-----------+-------------+---------------+
| four      |          21 |   0.000261516 |
+-----------+-------------+---------------+
| months    |           8 |   9.96252e-05 |
+-----------+-------------+---------------+
| in        |        1318 |   0.0164132   |
+-----------+-------------+---------------+
| advance   |           7 |   8.7172e-05  |
+-----------+-------------+---------------+
| at        |         745 |   0.00927759  |
+-----------+-------------+----------

In [11]:
print("\nTop 10 Bigrams (by Frequency and Probability):")
print(tabulate(top_rows_bigram, headers=["Word1", "Word2", "Frequency", "Probability"], tablefmt="grid"))


Top 10 Bigrams (by Frequency and Probability):
+---------+---------+-------------+---------------+
| Word1   | Word2   |   Frequency |   Probability |
| <s>     | i       |           1 |    1          |
+---------+---------+-------------+---------------+
| i       | booked  |          21 |    0.0121951  |
+---------+---------+-------------+---------------+
| booked  | two     |           1 |    0.0116279  |
+---------+---------+-------------+---------------+
| two     | rooms   |           3 |    0.0232558  |
+---------+---------+-------------+---------------+
| rooms   | four    |           1 |    0.00492611 |
+---------+---------+-------------+---------------+
| four    | months  |           1 |    0.047619   |
+---------+---------+-------------+---------------+
| months  | in      |           2 |    0.25       |
+---------+---------+-------------+---------------+
| in      | advance |           7 |    0.00531108 |
+---------+---------+-------------+---------------+
| advance | at  

### Smoothing and handling unknown values

In [12]:
# Vocabulary size 
v = len(set(training_tokens))
v

5963

In [13]:
k = 2.0
OOV_FREQ = 1e-10

In [14]:
def laplace_smoothed_unigram_prob(word):
    return (unigram_freq_dict.get(word,0) +1 / len(training_tokens) + v)

In [15]:
def laplace_smoothed_bigram_prob(word1, word2):
    bigram = (word1, word2)
    return (bigram_freq_dict.get(bigram, 0) + 1) / (unigram_freq_dict.get(word1, 0) + v)

In [16]:
def add_k_smoothed_unigram_prob(word):
    return (unigram_freq_dict.get(word, 0) + k /len(training_tokens)+v)

In [17]:
def add_k_smoothed_bigram_prob(word1, word2):
    bigram = (word1, word2)
    return (bigram_freq_dict.get(bigram, 0) + k) / (unigram_freq_dict.get(word1, 0) + k* v)

### Perplexity Calculation

In [51]:
def calculate_perplexity_unigram(test_corpus, laplace_smoothing=False, add_k_smoothing=False):
    test_tokens = tokenize_and_preprocess(test_corpus)
    N = len(test_tokens)
    log_prob_sum = 0

    for i in range(1, N):
        bigram = (test_tokens[i - 1], test_tokens[i])
        unigram = test_tokens[i]

        if laplace_smoothing:
            bigram_prob = laplace_smoothed_bigram_prob(bigram[0], bigram[1]) + 1e-10  
            unigram_prob = laplace_smoothed_unigram_prob(unigram) + 1e-10
        elif add_k_smoothing:
            bigram_prob = add_k_smoothed_bigram_prob(bigram[0], bigram[1]) + 1e-10  
            unigram_prob = add_k_smoothed_unigram_prob(unigram) + 1e-10
        else:
            # Handle unknown words
            bigram_prob = bigram_freq_dict.get(bigram, OOV_FREQ) / unigram_freq_dict.get(bigram[0], OOV_FREQ)
            unigram_prob = unigram_freq_dict.get(unigram, OOV_FREQ) / len(training_tokens)

        log_prob_sum += math.log2(unigram_prob if unigram_prob > 0 else bigram_prob)

    avg_log_prob = log_prob_sum / N
    perplexity = 2 ** (-avg_log_prob)
    return perplexity

In [52]:
def calculate_perplexity_Bigram(test_corpus, laplace_smoothing=False, add_k_smoothing=False):
    test_tokens = tokenize_and_preprocess(test_corpus)
    N = len(test_tokens)
    log_prob_sum = 0

    for i in range(1, N):
        bigram = (test_tokens[i - 1], test_tokens[i])
        unigram = test_tokens[i]

        if laplace_smoothing:
            bigram_prob = laplace_smoothed_bigram_prob(bigram[0], bigram[1]) + 1e-10  
            unigram_prob = laplace_smoothed_unigram_prob(unigram) + 1e-10
        elif add_k_smoothing:
            bigram_prob = add_k_smoothed_bigram_prob(bigram[0], bigram[1]) + 1e-10  
            unigram_prob = add_k_smoothed_unigram_prob(unigram) + 1e-10
        else:
            # Handle unknown words
            bigram_prob = bigram_freq_dict.get(bigram, OOV_FREQ) / unigram_freq_dict.get(bigram[0], OOV_FREQ)
            unigram_prob = unigram_freq_dict.get(unigram, OOV_FREQ) / len(training_tokens)

        log_prob_sum += math.log2(bigram_prob if bigram_prob > 0 else unigram_prob)

    avg_log_prob = log_prob_sum / N
    perplexity = 2 ** (-avg_log_prob)
    return perplexity

In [53]:
with open('C:/Users/megha/OneDrive/Desktop/val.txt', "r", encoding="utf-8") as validation_file:
    validation_corpus = validation_file.read()

In [54]:
perplexity_laplace_unigram = calculate_perplexity_unigram(validation_corpus, laplace_smoothing=True)

perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)

print(f"Perplexity (Laplace Smoothing) Unigram: {perplexity_laplace_unigram}")
print(f"Perplexity (Add-K Smoothing) Unigram: {perplexity_add_k_unigram}")

Perplexity (Laplace Smoothing) Unigram: 0.00014940586190818103
Perplexity (Add-K Smoothing) Unigram: 0.00014940586218951008


In [57]:
k = 0.1
perplexity_laplace_unigram = calculate_perplexity_unigram(validation_corpus, laplace_smoothing=True)

perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)

print(f"Perplexity (Laplace Smoothing) Unigram: {perplexity_laplace_unigram}")
print(f"Perplexity (Add-K Smoothing) Unigram: {perplexity_add_k_unigram}")

Perplexity (Laplace Smoothing) Unigram: 0.00014940586190818103
Perplexity (Add-K Smoothing) Unigram: 0.00014940586216138644


In [61]:
k  = 5.0
perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Unigram: for {k} is  {perplexity_add_k_unigram}")
k  = 4.0
perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Unigram: for {k} is  {perplexity_add_k_unigram}")
k  = 3.0
perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Unigram: for {k} is  {perplexity_add_k_unigram}")
k  = 2.0
perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Unigram: for {k} is  {perplexity_add_k_unigram}")
k  = 1.0
perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Unigram: for {k} is  {perplexity_add_k_unigram}")
k  = 0.9
perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Unigram: for {k} is  {perplexity_add_k_unigram}")
k  = 0.8
perplexity_add_k_unigram = calculate_perplexity_unigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Unigram: for {k} is  {perplexity_add_k_unigram}")


Perplexity (Add-K Smoothing) Unigram: for 5.0 is  0.00014940586078274377
Perplexity (Add-K Smoothing) Unigram: for 4.0 is  0.00014940586106409177
Perplexity (Add-K Smoothing) Unigram: for 3.0 is  0.00014940586134542304
Perplexity (Add-K Smoothing) Unigram: for 2.0 is  0.00014940586162678907
Perplexity (Add-K Smoothing) Unigram: for 1.0 is  0.00014940586190818103
Perplexity (Add-K Smoothing) Unigram: for 0.9 is  0.00014940586193631163
Perplexity (Add-K Smoothing) Unigram: for 0.8 is  0.0001494058619644667


In [49]:
perplexity_laplace_bigram = calculate_perplexity_Bigram(validation_corpus, laplace_smoothing=True)

perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)

print(f"Perplexity (Laplace Smoothing) Bigram: {perplexity_laplace_bigram}")
print(f"Perplexity (Add-K Smoothing) Bigram: {perplexity_add_k_bigram}")

Perplexity (Laplace Smoothing) Bigram: 1242.1992498720201
Perplexity (Add-K Smoothing) Bigram: 503.26481488988577


In [50]:
k  = 0.9
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.7
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.6
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.5
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.4
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.3
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.2
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.09
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.01
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.001
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")
k  = 0.0001
perplexity_add_k_bigram = calculate_perplexity_Bigram(validation_corpus, add_k_smoothing=True)
print(f"Perplexity (Add-K Smoothing) Bigram: for {k} is  {perplexity_add_k_bigram}")

Perplexity (Add-K Smoothing) Bigram: for 0.9 is  1185.2910197806507
Perplexity (Add-K Smoothing) Bigram: for 0.7 is  1060.2573232573732
Perplexity (Add-K Smoothing) Bigram: for 0.6 is  990.7939736946917
Perplexity (Add-K Smoothing) Bigram: for 0.5 is  915.4019957078116
Perplexity (Add-K Smoothing) Bigram: for 0.4 is  832.5677747187141
Perplexity (Add-K Smoothing) Bigram: for 0.3 is  739.9168701642853
Perplexity (Add-K Smoothing) Bigram: for 0.2 is  633.2378077780857
Perplexity (Add-K Smoothing) Bigram: for 0.1 is  503.26481488988577
Perplexity (Add-K Smoothing) Bigram: for 0.09 is  488.2278925122054
Perplexity (Add-K Smoothing) Bigram: for 0.01 is  352.9055106971306
Perplexity (Add-K Smoothing) Bigram: for 0.001 is  446.2158770216569
Perplexity (Add-K Smoothing) Bigram: for 0.0001 is  799.259342640971


## Conclusion: 

 - a. Add-k smoothing is giving better perplexity than laplace
 - b. As k value decreases the perplexity is getting better