In [29]:
import random
import heapq
import re
from collections import defaultdict, Counter

In [14]:
# Load the wikitext-103 corpus into `corpus` as a list of lines.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale (assumes the file is UTF-8 -- TODO confirm).
with open("wikitext_103.txt", "r", encoding="utf-8") as file:
    corpus = file.readlines()

In [15]:
# Get the whole corpus as a single string.
# str.join builds the result in one pass instead of growing a string with
# repeated `+=`, and leaves no stray loop variable in the notebook namespace.
words = "".join(corpus)

In [16]:
# Preprocessing
def preprocess(text):
    """Turn raw text into a list of lowercase tokens.

    Every character that is not an ASCII letter, a digit, or whitespace is
    deleted (not replaced by a space), the remainder is lowercased, and the
    result is split on runs of whitespace.
    """
    # Strip first, then lowercase: the character class is applied to the
    # text exactly as it was passed in.
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return stripped.lower().split()

# Tokenize the corpus. This rebinds `words` from the single joined string
# to the list of tokens returned by preprocess, so the cell expects `words`
# to still be a string when it runs.
words = preprocess(words)

In [17]:
# Build trigram model
def build_trigram_model(words: list[str]):
    """Count trigrams in a token sequence.

    Args:
        words: Tokens in corpus order.

    Returns:
        A pair ``(trigram_counts, bigram_counts)``:
        ``trigram_counts`` maps each two-word prefix tuple to a Counter of
        the words that followed it; ``bigram_counts`` maps the same prefix
        tuple to how many times it was followed by any word.
    """
    followers = defaultdict(Counter)
    prefix_totals = Counter()

    # Slide a three-token window over the sequence; the last window starts
    # two positions before the end.
    for start in range(len(words) - 2):
        first, second, third = words[start:start + 3]
        prefix = (first, second)

        followers[prefix][third] += 1
        prefix_totals[prefix] += 1

    return followers, prefix_totals

In [None]:
# Count every trigram in the token list. Both results are keyed by the
# two-word prefix tuple: trigram_counts holds per-prefix next-word counts,
# bigram_counts holds the per-prefix totals.
trigram_counts, bigram_counts = build_trigram_model(words)

In [27]:
def generate_text(trigram_counts, start_words: str, length=30, top_k=4):
    """Generate text by repeatedly sampling the next word from a trigram model.

    Args:
        trigram_counts: Mapping from a two-word prefix tuple to a Counter of
            the words observed after that prefix.
        start_words: Whitespace-separated seed text; its last two words form
            the first prefix.
        length: Maximum number of words to append to the seed.
        top_k: How many of the most frequent continuations to sample from at
            each step.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces. Generation stops early as soon as the current prefix has no
        recorded continuation.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    text = start_words.split()

    for _ in range(length):
        current_bigram = tuple(text[-2:])  # Last two words (fewer if the seed is short)

        # .get() never inserts a key, even on a defaultdict, and also lets an
        # existing-but-empty Counter end generation instead of crashing
        # random.choices with an empty population.
        next_word_candidates = trigram_counts.get(current_bigram)
        if not next_word_candidates:
            break

        # Rank by raw count. The model's own counts are used rather than a
        # separate global table, so the function works with any model passed in.
        top_candidates = heapq.nlargest(
            top_k, next_word_candidates.items(), key=lambda item: item[1]
        )

        # random.choices takes relative weights, so counts need no
        # normalization to sample in proportion to their probabilities.
        next_word = random.choices(
            [word for word, _ in top_candidates],
            weights=[count for _, count in top_candidates],
        )[0]

        text.append(next_word)

    return " ".join(text)

In [33]:
# Every two-word prefix the model has seen, as a list so one can be picked at random.
starting_bigrams = list(trigram_counts)

In [None]:
# Generate text using the trigram model.
# The seed is a randomly chosen two-word prefix from the model; no random
# seed is set in the cells shown, so the output differs from run to run.
start_seq = " ".join(random.choice(starting_bigrams))
generate_text(trigram_counts, start_seq, 50)

'sandstone game pantry dating from around the world s first season of american idol on may 15 2012 in the united kingdom and the united kingdom and united states the song was written in the early years in the united states the united states and the other side of the year award'