In [2]:
import nltk
from nltk.corpus import brown, treebank
from nltk.util import ngrams
from collections import Counter

# Download corpora
nltk.download('brown')
nltk.download('treebank')

# Preprocess and build n-gram model
def build_ngram_model(sentences, n=2):
    sentences = [[word.lower() for word in sent] for sent in sentences]
    ngram_counts, context_counts = Counter(), Counter()
    for sent in sentences:
        sent = ['<s>'] * (n - 1) + sent + ['</s>']
        ngram_counts.update(ngrams(sent, n))
        context_counts.update(ngrams(sent, n - 1))
    return ngram_counts, context_counts, sentences

# Predict next word and find matching sentences
def predict_and_match(ngram_counts, context_counts, sentences, context, phrase_words, top_n=3, sent_n=10):
    context = tuple(context)
    candidates = {ngram[-1]: count for ngram, count in ngram_counts.items() if ngram[:-1] == context}
    if not candidates:
        return [], []
    total_count = context_counts.get(context, 1)
    predictions = sorted([(word, count / total_count) for word, count in candidates.items()], 
                         key=lambda x: x[1], reverse=True)[:top_n]
    top_word = predictions[0][0]
    matches = [' '.join(sent) for sent in sentences 
               if phrase_words in [sent[i:i+len(phrase_words)] for i in range(len(sent)-len(phrase_words)+1)] 
               and top_word in sent[slice(sent.index(phrase_words[-1])+1, len(sent))]][:sent_n]
    return predictions, matches

# Main function
def process_input(user_input):
    words = user_input.lower().strip().split()
    
    # Load and build models
    bigram_brown, bigram_context_brown, brown_sents = build_ngram_model(brown.sents(), 2)
    trigram_brown, trigram_context_brown, _ = build_ngram_model(brown_sents, 3)
    bigram_wsj, bigram_context_wsj, wsj_sents = build_ngram_model(treebank.sents(), 2)
    trigram_wsj, trigram_context_wsj, _ = build_ngram_model(wsj_sents, 3)
    
    # Bigram predictions
    if len(words) >= 1:
        bigram_pred_brown, brown_matches = predict_and_match(bigram_brown, bigram_context_brown, brown_sents, 
                                                            [words[-1]], words)
        bigram_pred_wsj, wsj_matches = predict_and_match(bigram_wsj, bigram_context_wsj, wsj_sents, 
                                                        [words[-1]], words)
        
        print("\nBigram Model Predictions:")
        print("Brown Corpus:")
        for word, prob in bigram_pred_brown:
            print(f"{word}: {prob:.4f}")
        print("WSJ Corpus:")
        for word, prob in bigram_pred_wsj:
            print(f"{word}: {prob:.4f}")
        
        if bigram_pred_brown and bigram_pred_wsj:
            print(f"\nTop 10 Brown Corpus sentences with '{user_input} {bigram_pred_brown[0][0]}':")
            print("\n".join(f"{i}. {sent}" for i, sent in enumerate(brown_matches, 1)) or "No matching sentences found.")
            print(f"\nTop 10 WSJ Corpus sentences with '{user_input} {bigram_pred_wsj[0][0]}':")
            print("\n".join(f"{i}. {sent}" for i, sent in enumerate(wsj_matches, 1)) or "No matching sentences found.")
    
    # Trigram predictions
    if len(words) >= 2:
        trigram_pred_brown, brown_matches = predict_and_match(trigram_brown, trigram_context_brown, brown_sents, 
                                                             words[-2:], words)
        trigram_pred_wsj, wsj_matches = predict_and_match(trigram_wsj, trigram_context_wsj, wsj_sents, 
                                                         words[-2:], words)
        
        print("\nTrigram Model Predictions:")
        print("Brown Corpus:")
        for word, prob in trigram_pred_brown:
            print(f"{word}: {prob:.4f}")
        print("WSJ Corpus:")
        for word, prob in trigram_pred_wsj:
            print(f"{word}: {prob:.4f}")
        
        if trigram_pred_brown and trigram_pred_wsj:
            print(f"\nTop 10 Brown Corpus sentences with '{user_input} {trigram_pred_brown[0][0]}':")
            print("\n".join(f"{i}. {sent}" for i, sent in enumerate(brown_matches, 1)) or "No matching sentences found.")
            print(f"\nTop 10 WSJ Corpus sentences with '{user_input} {trigram_pred_wsj[0][0]}':")
            print("\n".join(f"{i}. {sent}" for i, sent in enumerate(wsj_matches, 1)) or "No matching sentences found.")

# Test
process_input("in the")

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!



Bigram Model Predictions:
Brown Corpus:
first: 0.0095
same: 0.0090
most: 0.0060
WSJ Corpus:
company: 0.0327
u.s.: 0.0239
new: 0.0147

Top 10 Brown Corpus sentences with 'in the first':
1. vandiver likely will mention the $100 million highway bond issue approved earlier in the session as his first priority item .
2. operating budget for the day schools in the five counties of dallas , harris , bexar , tarrant and el paso would be $451,500 , which would be a savings of $157,460 yearly after the first year's capital outlay of $88,000 was absorbed , parkhouse told the senate .
3. officials estimated the annual tax boost for the medical plan would amount to 1.5 billion dollars and that medical benefits paid out would run 1 billion or more in the first year , 1963 .
4. the secretary of state himself , in his first speech , gave some idea of the tremendous march of events inside and outside the united states that has preoccupied the new administration in the past four months .
5. in the 1920