In [21]:
from collections import defaultdict, Counter
from typing import List, Tuple

In [22]:
# Toy training corpus: one sentence per entry (append more sentences to extend it)
corpus = [
    "I love deep learning",
    "I love machine learning",
    "deep learning is fun",
    "machine learning is powerful",
]

In [23]:
# Lowercase each sentence and split it on whitespace
def tokenize(corpus: List[str]) -> List[List[str]]:
    """Turn raw sentences into lists of lowercase word tokens.

    Args:
        corpus: Sentences to tokenize, one string per sentence.

    Returns:
        One token list per input sentence, in the same order. Tokens are the
        whitespace-separated pieces of the lowercased sentence; punctuation is
        not stripped.
    """
    tokenized_sentences = []
    for text in corpus:
        tokenized_sentences.append(text.lower().split())
    return tokenized_sentences


In [24]:
# Generate n-grams
def generate_ngrams(tokens: List[List[str]], n: int) -> List[Tuple[str, ...]]:
    """Collect every contiguous n-gram from each tokenized sentence.

    N-grams never span sentence boundaries, and sentences shorter than ``n``
    contribute nothing.

    Args:
        tokens: Tokenized sentences (one list of tokens per sentence).
        n: Size of each n-gram; must be at least 1.

    Returns:
        All n-grams as tuples, in sentence order and left-to-right within a
        sentence. Duplicates are kept so the result can be counted.

    Raises:
        ValueError: If ``n`` is less than 1. Without this check ``n == 0``
            would yield empty tuples and a negative ``n`` meaningless slices.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    # range() is empty when a sentence has fewer than n tokens, so no explicit
    # length check is needed.
    return [
        tuple(sentence[start:start + n])
        for sentence in tokens
        for start in range(len(sentence) - n + 1)
    ]

In [25]:
# Frequency-based n-gram model: count how often each n-gram occurs
def build_ngram_model(ngrams: List[Tuple[str, ...]]) -> Counter:
    """Tally the occurrences of each n-gram.

    Args:
        ngrams: N-gram tuples, duplicates included.

    Returns:
        A ``Counter`` mapping each distinct n-gram to its number of occurrences.
    """
    frequencies = Counter()
    frequencies.update(ngrams)
    return frequencies

In [26]:
# Split every corpus sentence into lowercase word tokens
tokenized: List[List[str]] = tokenize(corpus)

In [27]:
# Generate the unigram, bigram and trigram lists from the tokenized corpus
unigrams = generate_ngrams(tokenized, 1)
bigrams = generate_ngrams(tokenized, 2)
trigrams = generate_ngrams(tokenized, 3)

# Show each list under its own label
print("Unigrams:", unigrams)
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)

Unigrams: [('i',), ('love',), ('deep',), ('learning',), ('i',), ('love',), ('machine',), ('learning',), ('deep',), ('learning',), ('is',), ('fun',), ('machine',), ('learning',), ('is',), ('powerful',)]
Unigrams: [('i', 'love'), ('love', 'deep'), ('deep', 'learning'), ('i', 'love'), ('love', 'machine'), ('machine', 'learning'), ('deep', 'learning'), ('learning', 'is'), ('is', 'fun'), ('machine', 'learning'), ('learning', 'is'), ('is', 'powerful')]
Trigrams: [('i', 'love', 'deep'), ('love', 'deep', 'learning'), ('i', 'love', 'machine'), ('love', 'machine', 'learning'), ('deep', 'learning', 'is'), ('learning', 'is', 'fun'), ('machine', 'learning', 'is'), ('learning', 'is', 'powerful')]


In [28]:
# Count n-gram frequencies for each order (1, 2 and 3), then show the counts
unigram_model, bigram_model, trigram_model = [
    build_ngram_model(grams) for grams in (unigrams, bigrams, trigrams)
]

print("Unigram Model:", unigram_model)
print("Bigram Model:", bigram_model)
print("Trigram Model:", trigram_model)



Unigram Model: Counter({('learning',): 4, ('i',): 2, ('love',): 2, ('deep',): 2, ('machine',): 2, ('is',): 2, ('fun',): 1, ('powerful',): 1})
Bigram Model: Counter({('i', 'love'): 2, ('deep', 'learning'): 2, ('machine', 'learning'): 2, ('learning', 'is'): 2, ('love', 'deep'): 1, ('love', 'machine'): 1, ('is', 'fun'): 1, ('is', 'powerful'): 1})
Trigram Model: Counter({('i', 'love', 'deep'): 1, ('love', 'deep', 'learning'): 1, ('i', 'love', 'machine'): 1, ('love', 'machine', 'learning'): 1, ('deep', 'learning', 'is'): 1, ('learning', 'is', 'fun'): 1, ('machine', 'learning', 'is'): 1, ('learning', 'is', 'powerful'): 1})


In [29]:
# Five most frequent entries of each model; the final tuple is the cell's displayed result
top_unigrams, top_bigrams, top_trigrams = [
    model.most_common(5)
    for model in (unigram_model, bigram_model, trigram_model)
]
top_unigrams, top_bigrams, top_trigrams

([(('learning',), 4),
  (('i',), 2),
  (('love',), 2),
  (('deep',), 2),
  (('machine',), 2)],
 [(('i', 'love'), 2),
  (('deep', 'learning'), 2),
  (('machine', 'learning'), 2),
  (('learning', 'is'), 2),
  (('love', 'deep'), 1)],
 [(('i', 'love', 'deep'), 1),
  (('love', 'deep', 'learning'), 1),
  (('i', 'love', 'machine'), 1),
  (('love', 'machine', 'learning'), 1),
  (('deep', 'learning', 'is'), 1)])

In [37]:
# Function to predict next word based on n-gram model
def predict_next_word(ngram_model: Counter, context: Tuple[str, ...], n: int) -> List[Tuple[str, int]]:
    """Rank candidate next words for ``context`` by n-gram frequency.

    An n-gram matches when all of its words except the last equal ``context``;
    its last word is then a candidate with the n-gram's count as its score.

    Args:
        ngram_model: Mapping from n-gram tuples to their counts.
        context: The preceding words. For a model of order k the context must
            hold k - 1 words, otherwise nothing matches. Any sequence of
            strings is accepted and compared as a tuple.
        n: Order of the model being queried. It is accepted for interface
            compatibility but not used: the length of ``context`` alone
            decides which n-grams match.

    Returns:
        ``(word, count)`` pairs sorted by descending count. Ties keep the
        model's iteration order. The list is empty when no n-gram matches.
    """
    # Model keys are tuples; normalising lets a list context match them
    # instead of silently comparing unequal.
    context = tuple(context)

    predictions = [
        (ngram[-1], count)
        for ngram, count in ngram_model.items()
        # Skip empty keys: they have no last word to predict.
        if ngram and ngram[:-1] == context
    ]
    # list.sort is stable, so equally frequent words keep their model order.
    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return predictions


In [41]:
# Ask the bigram model which words follow the one-word context "deep"
context_bigram = ("deep",)
predictions = predict_next_word(ngram_model=bigram_model, context=context_bigram, n=2)
print("Predictions for context", context_bigram, ":", predictions)


Predictions for context ('deep',) : [('learning', 2)]


In [42]:
# Ask the trigram model which words follow the two-word context "machine learning"
context_trigram = ("machine", "learning")
predictions_trigram = predict_next_word(ngram_model=trigram_model, context=context_trigram, n=3)
print("Predictions for context", context_trigram, ":", predictions_trigram)


Predictions for context ('machine', 'learning') : [('is', 1)]


In [44]:
# Example usage of prediction for a single-word context.
# The next word after one word comes from the bigram model, so n is 2
# (the order of the model being queried), not 1.
context_unigram = ("i",)
predictions_unigram = predict_next_word(bigram_model, context_unigram, 2)
print("Predictions for context", context_unigram, ":", predictions_unigram)


Predictions for context ('i',) : [('love', 2)]


In [34]:
# Example usage of prediction for a two-word (bigram) context.
# A two-word context can only equal the first two words of a trigram, so the
# trigram model (n=3) is queried. Bigram keys have a one-word prefix and would
# never match, which always produced an empty prediction list.
context_bigram = ("i", "love")
predictions_bigram = predict_next_word(trigram_model, context_bigram, 3)