In [5]:
from collections import defaultdict
import math
import nltk
from nltk.util import ngrams
from collections import Counter

def n_gram_count(data, n):
    n_grams = defaultdict(int)
    for i in range(len(data)-n+1):
        n_gram = tuple(data[i:i+n])
        n_grams[n_gram] += 1
    return n_grams

def add_k_smoothing(data, n, k):
    n_grams = n_gram_count(data, n)
    vocabulary_size = len(set(data))
    total_n_grams = sum(n_grams.values())
    for n_gram in n_grams:
        n_grams[n_gram] = (n_grams[n_gram] + k) / (total_n_grams + k * vocabulary_size ** n)
    return n_grams

def n_gram_smoothing(data, n, k):
    n_grams = add_k_smoothing(data, n, k)
    def probability(n_gram):
        return n_grams[n_gram]
    return probability

In [6]:
data = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
n = 2
k = 1

probability = n_gram_smoothing(data, n, k)

print(probability(("the", "quick")))  # 0.125
print(probability(("quick", "brown")))  # 0.16666666666666666
print(probability(("quick","fox")))  # 0.029411764705882353

0.027777777777777776
0.027777777777777776
0


In [7]:
# Load the text corpus
corpus = "Hi! I am Hritvik Mathur and I am performing experiment of natural language processing"

# Convert the corpus into tokens
tokens = nltk.word_tokenize(corpus)

# Set the value of n for n-gram smoothing
n = 2

# Create n-grams from the tokens
n_grams = list(ngrams(tokens, n))

# Count the frequency of each n-gram
n_gram_freq = Counter(n_grams)

# Count the frequency of each individual token
token_freq = Counter(tokens)

# Calculate the probability of each n-gram using n-gram smoothing
n_gram_prob = {}
for n_gram in n_gram_freq.keys():
    context = n_gram[:-1]
    count = n_gram_freq[n_gram]
    total_count = token_freq[context]
    prob = (count + 1) / (total_count + len(token_freq))
    n_gram_prob[n_gram] = prob

# Print the n-gram probabilities
print("N-gram probabilities:")
for n_gram, prob in n_gram_prob.items():
    print("{} -> {:.2f}".format(n_gram, prob))

N-gram probabilities:
('Hi', '!') -> 0.15
('!', 'I') -> 0.15
('I', 'am') -> 0.23
('am', 'Hritvik') -> 0.15
('Hritvik', 'Mathur') -> 0.15
('Mathur', 'and') -> 0.15
('and', 'I') -> 0.15
('am', 'performing') -> 0.15
('performing', 'experiment') -> 0.15
('experiment', 'of') -> 0.15
('of', 'natural') -> 0.15
('natural', 'language') -> 0.15
('language', 'processing') -> 0.15
