In [23]:
import nltk
import random
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import pandas as pd

In [24]:
nltk.download("punkt")
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [25]:
from nltk.corpus import gutenberg
corpus = gutenberg.sents("austen-emma.txt")[:1000]

In [26]:
flat_corpus = [word.lower() for sentence in corpus for word in sentence if word.isalpha()]

Unigram Model

In [27]:
unigram_counts = Counter(flat_corpus)
total_unigrams = sum(unigram_counts.values())

Bigram Model

In [28]:
bigram_counts = Counter(bigrams(flat_corpus))
total_bigrams = sum(bigram_counts.values())


Trigram Model

In [29]:
trigram_counts = Counter(trigrams(flat_corpus))
total_trigrams = sum(trigram_counts.values())

In [30]:
def predict_unigram():
    next_word = random.choices(list(unigram_counts.keys()), weights=unigram_counts.values())[0]
    return next_word

In [31]:
def predict_bigram(sequence):
    last_word = sequence[-1]
    possible_bigrams = {bigram[1]: count for bigram, count in bigram_counts.items() if bigram[0] == last_word}
    if possible_bigrams:
        next_word = random.choices(list(possible_bigrams.keys()), weights=possible_bigrams.values())[0]
    else:
        next_word = predict_unigram()  # Fallback to unigram if no bigram is available
    return next_word

In [32]:
def predict_trigram(sequence):

    last_bigram = tuple(sequence[-2:])
    possible_trigrams = {trigram[2]: count for trigram, count in trigram_counts.items() if trigram[:2] == last_bigram}
    if possible_trigrams:
        next_word = random.choices(list(possible_trigrams.keys()), weights=possible_trigrams.values())[0]
    else:
        next_word = predict_bigram(sequence)
    return next_word

In [33]:
sample_sequences = [
    ["the"], ["data"], ["machine", "learning"], ["artificial", "intelligence"],
    ["deep", "learning"], ["in", "the"], ["language", "model"], ["the", "future", "of"],
    ["predictive"], ["modeling", "and"]
]

In [34]:
results = []
for sequence in sample_sequences:
    if len(sequence) == 1:
        # Unigram Prediction
        unigram_pred = predict_unigram()
        # Bigram Prediction
        bigram_pred = predict_bigram(sequence)
        # Trigram Prediction - Not applicable for single-word sequences
        trigram_pred = "N/A (requires 2 words)"
    elif len(sequence) == 2:
        unigram_pred = predict_unigram()
        bigram_pred = predict_bigram(sequence)
        trigram_pred = predict_trigram(sequence)
    else:
        unigram_pred, bigram_pred, trigram_pred = "N/A", "N/A", "N/A"
    results.append((sequence, unigram_pred, bigram_pred, trigram_pred))



Display Results in a Table Format

In [35]:
print(f"{'Sequence of Words':<25} {'Unigram Prediction':<20} {'Bigram Prediction':<20} {'Trigram Prediction':<20}")
for seq, uni, bi, tri in results:
    seq_str = " ".join(seq)
    print(f"{seq_str:<25} {uni:<20} {bi:<20} {tri:<20}")


Sequence of Words         Unigram Prediction   Bigram Prediction    Trigram Prediction  
the                       mr                   young                N/A (requires 2 words)
data                      his                  never                N/A (requires 2 words)
machine learning          a                    a                    the                 
artificial intelligence   height               small                it                  
deep learning             light                again                the                 
in the                    and                  other                right               
language model            were                 mr                   mr                  
the future of             N/A                  N/A                  N/A                 
predictive                than                 had                  N/A (requires 2 words)
modeling and              sink                 said                 saying              
