<a href="https://colab.research.google.com/github/govindakolli/I2LLMs/blob/main/NGramModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Write a program to compute unsmoothed unigrams and bigrams.

In [None]:
from collections import Counter
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
#Preprocess
def preprocess_corpus(corpus):
  """Tokenize every sentence of ``corpus`` on whitespace.

  Plain ``str.split`` is used (rather than ``nltk.word_tokenize``) so that the
  boundary markers ``<s>`` and ``</s>`` are each kept as a single token.

  Args:
    corpus: Iterable of sentence strings.

  Returns:
    A list with one list of tokens per input sentence.
  """
  tokenized = []
  for line in corpus:
    tokenized.append(line.split())
  return tokenized

In [None]:
def unigram_compute(sentences):
  """Compute unsmoothed unigram counts and relative frequencies.

  Args:
    sentences: Iterable of token sequences.

  Returns:
    A ``(unigram_counts, unigram_probs)`` tuple: a ``Counter`` of token
    occurrences and a dict mapping each token to ``count / total tokens``.
  """
  # Flatten once; the flat list gives both the counts and the denominator.
  all_tokens = [token for sent in sentences for token in sent]
  unigram_counts = Counter(all_tokens)
  n_tokens = len(all_tokens)

  unigram_probs = {}
  for token, freq in unigram_counts.items():
    unigram_probs[token] = freq / n_tokens
  return unigram_counts, unigram_probs

In [None]:
def bigram_compute(sentences):
  """Compute unsmoothed (maximum-likelihood) bigram counts and probabilities.

  Args:
    sentences: Iterable of token sequences, e.g. the output of
      ``preprocess_corpus``. Bigrams never span two sentences.

  Returns:
    A ``(bigram_counts, bigrams_probs)`` tuple: a ``Counter`` keyed by
    ``(w1, w2)`` and a dict mapping each observed pair to
    ``count(w1, w2) / (number of bigrams that start with w1)``.
  """
  bigram_counts = Counter()
  # How often each word opens a bigram: the denominator of P(w2 | w1).
  context_totals = Counter()

  for sentence in sentences:
    tokens = list(sentence)  # accept any iterable, not only lists
    # Adjacent pairs via zip; sentences with fewer than two tokens yield none.
    for pair in zip(tokens, tokens[1:]):
      bigram_counts[pair] += 1
      context_totals[pair[0]] += 1

  bigrams_probs = {
      pair: count / context_totals[pair[0]]
      for pair, count in bigram_counts.items()
  }
  return bigram_counts, bigrams_probs

In [None]:

# Example corpus: whitespace-separated sentences, each already wrapped in the
# <s> ... </s> sentence-boundary markers.
corpus = [
    "<s> I am Sam </s>",
    "<s> Sam I am </s>",
    "<s> I am Sam </s>",
    "<s> I do not like green eggs and Sam </s>"
]

In [None]:
# Tokenize the example corpus into lists of tokens.
sentences = preprocess_corpus(corpus)

# Compute unsmoothed unigram and bigram counts/probabilities.
# These names are module-level and are rebound by later cells.
unigram_counts, unigram_probs = unigram_compute(sentences)
bigram_counts, bigram_probs = bigram_compute(sentences)

# Print each table with its label.
print("Unigram Counts:", unigram_counts)
print("Unigram Probabilities:", unigram_probs)
print("Bigram Counts:", bigram_counts)
print("Bigram Probabilities:", bigram_probs)

Unigram Counts: Counter({'<s>': 4, 'I': 4, 'Sam': 4, '</s>': 4, 'am': 3, 'do': 1, 'not': 1, 'like': 1, 'green': 1, 'eggs': 1, 'and': 1})
Unigram Probabilities: {'<s>': 0.16, 'I': 0.16, 'am': 0.12, 'Sam': 0.16, '</s>': 0.16, 'do': 0.04, 'not': 0.04, 'like': 0.04, 'green': 0.04, 'eggs': 0.04, 'and': 0.04}
Bigram Counts: Counter({('<s>', 'I'): 3, ('I', 'am'): 3, ('Sam', '</s>'): 3, ('am', 'Sam'): 2, ('<s>', 'Sam'): 1, ('Sam', 'I'): 1, ('am', '</s>'): 1, ('I', 'do'): 1, ('do', 'not'): 1, ('not', 'like'): 1, ('like', 'green'): 1, ('green', 'eggs'): 1, ('eggs', 'and'): 1, ('and', 'Sam'): 1})
Bigram Probabilities: {('<s>', 'I'): 0.75, ('I', 'am'): 0.75, ('am', 'Sam'): 0.6666666666666666, ('Sam', '</s>'): 0.75, ('<s>', 'Sam'): 0.25, ('Sam', 'I'): 0.25, ('am', '</s>'): 0.3333333333333333, ('I', 'do'): 0.25, ('do', 'not'): 1.0, ('not', 'like'): 1.0, ('like', 'green'): 1.0, ('green', 'eggs'): 1.0, ('eggs', 'and'): 1.0, ('and', 'Sam'): 1.0}


In [None]:
# Email-style corpus, with the same <s> ... </s> boundary markers as `corpus`.
corpus_email = [
    "<s> Hi John I hope you are doing well </s>",
    "<s> Please find the attached report </s>",
    "<s> Let me know if you have any questions </s>",
    "<s> Looking forward to your feedback </s>"
]

# News-style corpus, with the same <s> ... </s> boundary markers as `corpus`.
corpus_news = [
    "<s> The government announced a new policy today </s>",
    "<s> Scientists discovered a new species in the rainforest </s>",
    "<s> The stock market showed significant growth this week </s>",
    "<s> Experts predict economic recovery by next year </s>"
]

In [None]:
# Tokenize the email corpus. This rebinds `sentences`, replacing the value
# computed from `corpus` in the earlier cell.
sentences = preprocess_corpus(corpus_email)

# Compute unigram and bigram statistics for the email corpus.
# The count/probability names are rebound as well.
unigram_counts, unigram_probs = unigram_compute(sentences)
bigram_counts, bigram_probs = bigram_compute(sentences)

# Print each table with its label.
print("Unigram Counts:", unigram_counts)
print("Unigram Probabilities:", unigram_probs)
print("Bigram Counts:", bigram_counts)
print("Bigram Probabilities:", bigram_probs)

Unigram Counts: Counter({'<s>': 4, '</s>': 4, 'you': 2, 'Hi': 1, 'John': 1, 'I': 1, 'hope': 1, 'are': 1, 'doing': 1, 'well': 1, 'Please': 1, 'find': 1, 'the': 1, 'attached': 1, 'report': 1, 'Let': 1, 'me': 1, 'know': 1, 'if': 1, 'have': 1, 'any': 1, 'questions': 1, 'Looking': 1, 'forward': 1, 'to': 1, 'your': 1, 'feedback': 1})
Unigram Probabilities: {'<s>': 0.11764705882352941, 'Hi': 0.029411764705882353, 'John': 0.029411764705882353, 'I': 0.029411764705882353, 'hope': 0.029411764705882353, 'you': 0.058823529411764705, 'are': 0.029411764705882353, 'doing': 0.029411764705882353, 'well': 0.029411764705882353, '</s>': 0.11764705882352941, 'Please': 0.029411764705882353, 'find': 0.029411764705882353, 'the': 0.029411764705882353, 'attached': 0.029411764705882353, 'report': 0.029411764705882353, 'Let': 0.029411764705882353, 'me': 0.029411764705882353, 'know': 0.029411764705882353, 'if': 0.029411764705882353, 'have': 0.029411764705882353, 'any': 0.029411764705882353, 'questions': 0.029411764

In [None]:
# Tokenize the news corpus. This rebinds `sentences`, replacing the value
# computed from `corpus_email` in the previous cell.
sentences = preprocess_corpus(corpus_news)

# Compute unigram and bigram statistics for the news corpus.
# The count/probability names are rebound as well.
unigram_counts, unigram_probs = unigram_compute(sentences)
bigram_counts, bigram_probs = bigram_compute(sentences)

# Print each table with its label.
print("Unigram Counts:", unigram_counts)
print("Unigram Probabilities:", unigram_probs)
print("Bigram Counts:", bigram_counts)
print("Bigram Probabilities:", bigram_probs)

Unigram Counts: Counter({'<s>': 4, '</s>': 4, 'The': 2, 'a': 2, 'new': 2, 'government': 1, 'announced': 1, 'policy': 1, 'today': 1, 'Scientists': 1, 'discovered': 1, 'species': 1, 'in': 1, 'the': 1, 'rainforest': 1, 'stock': 1, 'market': 1, 'showed': 1, 'significant': 1, 'growth': 1, 'this': 1, 'week': 1, 'Experts': 1, 'predict': 1, 'economic': 1, 'recovery': 1, 'by': 1, 'next': 1, 'year': 1})
Unigram Probabilities: {'<s>': 0.10526315789473684, 'The': 0.05263157894736842, 'government': 0.02631578947368421, 'announced': 0.02631578947368421, 'a': 0.05263157894736842, 'new': 0.05263157894736842, 'policy': 0.02631578947368421, 'today': 0.02631578947368421, '</s>': 0.10526315789473684, 'Scientists': 0.02631578947368421, 'discovered': 0.02631578947368421, 'species': 0.02631578947368421, 'in': 0.02631578947368421, 'the': 0.02631578947368421, 'rainforest': 0.02631578947368421, 'stock': 0.02631578947368421, 'market': 0.02631578947368421, 'showed': 0.02631578947368421, 'significant': 0.026315789

Add an option to your program to generate random sentences.


In [None]:
import random

In [None]:

def generate_sentence(bigram_probs, start_word="<s>", max_length=10, rng=None):
    """Sample a sentence by walking the bigram model one word at a time.

    Args:
        bigram_probs: Dict mapping ``(w1, w2)`` to P(w2 | w1).
        start_word: Token the walk starts from; it is not part of the
            returned text.
        max_length: Upper bound on the walk length *including*
            ``start_word``, so at most ``max_length - 1`` words are returned.
        rng: Optional object with a ``choices(population, weights=...)``
            method, e.g. a seeded ``random.Random``, for reproducible output.
            Defaults to the module-level ``random`` generator.

    Returns:
        The generated words joined by single spaces. The string is empty when
        ``start_word`` has no successors or ``</s>`` is drawn immediately.
    """
    chooser = random if rng is None else rng

    # Index the successors of every word once, so each step is a dict lookup
    # instead of a scan over the whole bigram table. Insertion order is kept,
    # so the candidate order matches a direct scan of ``bigram_probs``.
    successors = {}
    for (w1, w2), prob in bigram_probs.items():
        words, weights = successors.setdefault(w1, ([], []))
        words.append(w2)
        weights.append(prob)

    sentence = [start_word]
    while len(sentence) < max_length:
        options = successors.get(sentence[-1])
        if options is None:
            break  # dead end: the last word never starts a bigram

        words, weights = options
        next_word = chooser.choices(words, weights=weights)[0]

        if next_word == "</s>":
            break  # end-of-sentence marker terminates the walk

        sentence.append(next_word)

    # Drop the start token from the rendered sentence.
    return " ".join(sentence[1:])


In [None]:
# Tokenize the example corpus again. The model variables were last rebound
# from `corpus_news`, so this re-fit makes `bigram_probs` describe `corpus`
# before it is used for sentence generation below.
sentences = preprocess_corpus(corpus)

# Compute unigram and bigram statistics for the example corpus.
unigram_counts, unigram_probs = unigram_compute(sentences)
bigram_counts, bigram_probs = bigram_compute(sentences)

# Print each table with its label.
print("Unigram Counts:", unigram_counts)
print("Unigram Probabilities:", unigram_probs)
print("Bigram Counts:", bigram_counts)
print("Bigram Probabilities:", bigram_probs)



Unigram Counts: Counter({'<s>': 4, 'I': 4, 'Sam': 4, '</s>': 4, 'am': 3, 'do': 1, 'not': 1, 'like': 1, 'green': 1, 'eggs': 1, 'and': 1})
Unigram Probabilities: {'<s>': 0.16, 'I': 0.16, 'am': 0.12, 'Sam': 0.16, '</s>': 0.16, 'do': 0.04, 'not': 0.04, 'like': 0.04, 'green': 0.04, 'eggs': 0.04, 'and': 0.04}
Bigram Counts: Counter({('<s>', 'I'): 3, ('I', 'am'): 3, ('Sam', '</s>'): 3, ('am', 'Sam'): 2, ('<s>', 'Sam'): 1, ('Sam', 'I'): 1, ('am', '</s>'): 1, ('I', 'do'): 1, ('do', 'not'): 1, ('not', 'like'): 1, ('like', 'green'): 1, ('green', 'eggs'): 1, ('eggs', 'and'): 1, ('and', 'Sam'): 1})
Bigram Probabilities: {('<s>', 'I'): 0.75, ('I', 'am'): 0.75, ('am', 'Sam'): 0.6666666666666666, ('Sam', '</s>'): 0.75, ('<s>', 'Sam'): 0.25, ('Sam', 'I'): 0.25, ('am', '</s>'): 0.3333333333333333, ('I', 'do'): 0.25, ('do', 'not'): 1.0, ('not', 'like'): 1.0, ('like', 'green'): 1.0, ('green', 'eggs'): 1.0, ('eggs', 'and'): 1.0, ('and', 'Sam'): 1.0}


In [None]:
# Generate a random sentence from the most recently fitted `bigram_probs`.
# NOTE(review): no random seed is set in the cells shown, so the printed
# sentence can differ between runs.
print("Random Sentence:", generate_sentence(bigram_probs))

Random Sentence: I do not like green eggs and Sam


Add an option to your program to compute the perplexity of a test set.

In [None]:
import math

In [None]:
def compute_perplexity(test_sentences, bigram_probs, unseen_prob=1e-6):
    """Compute the bigram perplexity of a tokenized test set.

    Perplexity is ``exp(-(1/N) * sum(log P(w2 | w1)))`` over all ``N`` bigrams
    of the test sentences. Bigrams never span two sentences.

    Args:
        test_sentences: Iterable of token sequences.
        bigram_probs: Dict mapping ``(w1, w2)`` to P(w2 | w1).
        unseen_prob: Probability used for bigrams that are missing from
            ``bigram_probs`` (or stored with a non-positive value), so that
            ``log`` stays defined. Must be positive.

    Returns:
        The perplexity as a float, or ``float('inf')`` when the test set
        contains no bigrams.

    Raises:
        ValueError: If ``unseen_prob`` is not positive.
    """
    if unseen_prob <= 0:
        raise ValueError("unseen_prob must be positive")

    log_prob_sum = 0.0
    total_bigrams = 0

    for sentence in test_sentences:
        tokens = list(sentence)  # accept any iterable, not only lists
        for pair in zip(tokens, tokens[1:]):
            prob = bigram_probs.get(pair, unseen_prob)
            if prob <= 0:
                # A stored zero would make math.log raise; floor it like an
                # unseen bigram instead.
                prob = unseen_prob
            log_prob_sum += math.log(prob)
            total_bigrams += 1

    if total_bigrams == 0:
        return float('inf')
    return math.exp(-log_prob_sum / total_bigrams)


In [None]:
# Example test set, using the same <s> ... </s> markers as the training corpora.
test_corpus = ["<s> I am Sam </s>", "<s> Sam I do </s>"]
test_sentences = preprocess_corpus(test_corpus)

# Compute perplexity under the most recently fitted `bigram_probs`.
# Any test bigram missing from that table is scored with the small fallback
# probability inside compute_perplexity, which inflates the result.
print("Perplexity of test set:", compute_perplexity(test_sentences, bigram_probs))


Perplexity of test set: 11.082453821661106


### References
1. Speech and Language Processing. Daniel Jurafsky & James H. Martin.
https://web.stanford.edu/~jurafsky/slp3/3.pdf

2. Medium Blogs

3. ChatGPT (used with the awareness that it can make mistakes)