Assignment 5: N-grams

In [1]:
import nltk
import pandas as pd
import string

nltk.download('brown')
from nltk.corpus import brown

#preprocessing
tokenized_text = brown.words()

def preprocess(text):
    return [word.lower() for word in text if word not in string.punctuation]

cleaned_tokens = preprocess(tokenized_text)

[nltk_data] Downloading package brown to C:\Users\Cristina
[nltk_data]     Ortega\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


In [2]:
#N-grams frequency dataframe
from nltk.util import ngrams
from collections import Counter

def get_ngram_freq_df(tokens, n):
    n_grams = list(ngrams(tokens, n))
    freq_dist = Counter(n_grams)
    df = pd.DataFrame(freq_dist.items(), columns=['ngram', 'frequency'])
    df = df.sort_values(by='frequency', ascending=False).reset_index(drop=True)
    return df

In [3]:
#Top next words
def predict_next_words(seq, ngram_df, k=5):
    seq_tuple = tuple(seq.lower().split())
    seq_len = len(seq_tuple)
 
    filtered = ngram_df[ngram_df['ngram'].apply(lambda x: x[:seq_len] == seq_tuple)]
    
    top_ngrams = filtered.sort_values(by='frequency', ascending=False).head(k)
    
    next_words = [ngram[-1] for ngram in top_ngrams['ngram']]
    
    return next_words

In [4]:
#example
bigram_df = get_ngram_freq_df(cleaned_tokens, 2)

#predicting next word after "the"
print(predict_next_words("the", bigram_df, k=5))

['first', 'same', 'most', 'other', '``']


In [5]:
#example2
trigram_df = get_ngram_freq_df(cleaned_tokens, 3)

#predicting next words after "in the"
print(predict_next_words("in the", trigram_df, k=5))

['world', 'first', 'united', 'same', 'past']


In [8]:
#Split into 90% train, 10% test
split_idx = int(0.9 * len(cleaned_tokens))
train_tokens = cleaned_tokens[:split_idx]
test_tokens = cleaned_tokens[split_idx:]

In [9]:
from nltk.util import ngrams
from collections import Counter
import pandas as pd

def get_ngram_freq_df(tokens, n):
    n_grams = list(ngrams(tokens, n))
    freq_dist = Counter(n_grams)
    df = pd.DataFrame(freq_dist.items(), columns=['ngram', 'frequency'])
    return df

In [10]:
def predict_next_words(seq, ngram_df, k=5):
    seq_tuple = tuple(seq.lower().split())
    seq_len = len(seq_tuple)
    
    filtered = ngram_df[ngram_df['ngram'].apply(lambda x: x[:seq_len] == seq_tuple)]
    top_ngrams = filtered.sort_values(by='frequency', ascending=False).head(k)
    next_words = [ngram[-1] for ngram in top_ngrams['ngram']]
    
    return next_words

In [11]:
def evaluate_accuracy(test_tokens, ngram_df, n, k=5, max_tests=1000):
    correct = 0
    total = 0

    for i in range(len(test_tokens) - n):
        context = test_tokens[i:i+n-1]
        true_next = test_tokens[i+n-1]
        context_str = ' '.join(context)

        predicted = predict_next_words(context_str, ngram_df, k)

        if true_next in predicted:
            correct += 1
        total += 1

        if total >= max_tests:
            break

    accuracy = correct / total if total > 0 else 0
    return accuracy

In [12]:
#accuracy
#Bigram
bigram_df = get_ngram_freq_df(train_tokens, 2)
bigram_accuracy = evaluate_accuracy(test_tokens, bigram_df, n=2, k=5)
print(f"Bigram Top-5 Accuracy: {bigram_accuracy:.2%}")

#Trigram
trigram_df = get_ngram_freq_df(train_tokens, 3)
trigram_accuracy = evaluate_accuracy(test_tokens, trigram_df, n=3, k=5)
print(f"Trigram Top-5 Accuracy: {trigram_accuracy:.2%}")

Bigram Top-5 Accuracy: 23.60%
Trigram Top-5 Accuracy: 14.00%
