### a) Build a language model based on n-grams using the Laplace smoothing method for the following models

In [14]:
from nltk import word_tokenize
from collections import defaultdict
import nltk
import random
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dungd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
with (open('tedtalk.txt', 'r', encoding='utf-8')) as f:
    corpus=f.read()
tokens=word_tokenize(corpus.lower())

In [8]:
def build_ngram_counts(tokens, n):
    counts = defaultdict(int)
    i = 0
    while i <= len(tokens) - n:
        ngram = tuple(tokens[i:i+n])
        counts[ngram] += 1
        i += 1
    return counts

In [9]:
def prepare_counts(tokens, max_n):
    return {n: build_ngram_counts(tokens, n) for n in range(1, max_n+1)}
counts_by_n = prepare_counts(tokens, 3)

In [10]:
def stupid_backoff_prob(counts_by_n, context, word, lam=0.4):
    n = len(context)+1  # n-gram order
    ngram = context + (word,)

    if n in counts_by_n and counts_by_n[n].get(ngram, 0) > 0:
        numer = counts_by_n[n][ngram]
        denom = sum(v for key, v in counts_by_n[n].items() if key[:-1] == context)
        return numer / denom
    else:
        if len(context) > 0:
            return lam * stupid_backoff_prob(counts_by_n, context[1:], word, lam)
        else:
            # unigram fallback
            unigram_counts = counts_by_n[1]
            total = sum(unigram_counts.values())
            return unigram_counts.get((word,), 0) / total

### b) Compare with the results from In Class Exercise.




In [13]:
def laplace_smoothed_trigram_prob(w1, w2, w3, trigram_counts, bigram_counts, V):
    return (trigram_counts[(w1, w2, w3)] + 1) / (bigram_counts[(w1, w2)] + V)

print("Laplace probability:", laplace_smoothed_trigram_prob("This", "conference", "is", counts_by_n[3], counts_by_n[2], len(set(tokens))))
print("Backoff probability:", stupid_backoff_prob(counts_by_n, ("This","conference"), "is"))


Laplace probability: 1.1822706690469716e-05
Backoff probability: 0.019459459459459462


### c) Use the newly built model to generate the next words for a given word sequence.

In [15]:
def generate_next_word(counts_by_n, context, lam=0.4):
    vocab = [w[0] for w in counts_by_n[1]]  # keys dạng ('word',)
    if not vocab:
        return None

    probs = {w: stupid_backoff_prob(counts_by_n, context, w, lam) for w in vocab}
    max_p = max(probs.values())
    best_words = [w for w, p in probs.items() if p == max_p]
    return random.choice(best_words)


In [16]:
def generate_sequence(counts_by_n, seed, length=10, lam=0.4):
    context = tuple(seed.split())
    sentence = list(context)
    for i in range(length):
        best_word = generate_next_word(counts_by_n, context, lam)
        sentence.append(best_word)

        if len(context) >= 2: 
            context = tuple(sentence[-len(context):])
        else:
            context = (best_word,)
    return ' '.join(sentence)


In [18]:
test_tokens = " The green agenda is probably the most important agenda and issue of the day".split()
test_counts_by_n = prepare_counts(test_tokens, 3)

print(generate_sequence(test_counts_by_n, "the most important", 6))


the most important agenda and issue of the day


### d. Combine with a function that calculates the distance between words to predict the correct word for a misspelled word position. (from difflib import get_close_matches)

In [26]:
from difflib import get_close_matches

def correct_misspelled_word(counts_by_n, context, misspelled, lam=0.4, n_candidates=5):
    # vocab lấy từ unigram keys dạng ('word',)
    vocab = [k[0] for k in counts_by_n[1]]

    candidates = get_close_matches(misspelled, vocab, n=n_candidates, cutoff=0.0)
    print(f"true, {candidates}")
    if not candidates:
        candidates = vocab
        print(f"false, {candidates}")

    probs_dict = {c: stupid_backoff_prob(counts_by_n, context, c, lam) for c in candidates}

    max_p = max(probs_dict.values()) if probs_dict else 0.0
    best = [c for c, p in probs_dict.items() if p == max_p] or vocab
    best_word = random.choice(best)

    return best_word, list(probs_dict.items())


In [25]:
context = ('sustainable',)
predicted_word, probs = correct_misspelled_word(counts_by_n, context, 'enrgey')
print(predicted_word)

true, ['sergey', 'energy', 'henfrey', 'enraged', 'enlarge']
energy
