•	Random Word Generation: Selecting words randomly from a predefined word corpus (dictionary) like the NLTK words dataset. 
•	Word Validity Check: Verifying if a word exists in a dictionary corpus. 
•	Generating Words from Characters: Creating possible words by permuting given characters and filtering valid ones. 
•	Adding Prefixes and Suffixes: Modifying words by attaching common prefixes (e.g., "un-", "pre-") or suffixes (e.g., "-ing", "-ness") and checking if they form real words. 
•	Extracting Unique Words & Computing Word Length: Filtering words by removing stopwords (common words like "the", "is") and punctuation, then measuring word length statistics.


In [3]:
import nltk
import random

In [4]:
nltk.download('words')
word_list= nltk.corpus.words.words()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [6]:
n = int(input("Enter the number of random words you want: "))
random_word= random.sample(word_list,n)
print(random_word)

['supercabinet', 'refectorial', 'supererogator', 'omninescience', 'Faliscan', 'supramental', 'craggy', 'noncontributor', 'radiescent', 'possessed']


In [13]:
word= input("Enter a word: ")
if word in word_list:
    print("valid word")
else:
    print("invalid word")

valid word


In [12]:
from itertools import permutations

def generatewords(word,length):
    return [''.join(p)for p in permutations(word,length) if ''.join(p).lower() in set(word_list) ]
word= input("Enter a word: ")
length= int(input("Enter the length of the word: "))
words= generatewords(word,length)
print(set(words))


{'act', 'sec', 'ear', 'ras', 'eta', 'car', 'are', 'rea', 'tar', 'tea', 'arc', 'tra', 'ace', 'tec', 'cat', 'art', 'era', 'set', 'tst', 'sea', 'sac', 'sar', 'ers', 'ate', 'ser', 'tae', 'sat', 'ast', 'aes', 'ase', 'aer', 'rat', 'eat', 'tat', 'ret'}


In [14]:
# Define some common prefixes and suffixes (can be extended)
prefixes = ['un', 're', 'dis', 'in', 'im']
suffixes = ['ed', 'ing', 'es', 'er', 'ly']
word= input("Enter a word: ")
valid= set()

for prefix in prefixes:
    new_word= prefix+ word
    if new_word in word_list:
        valid.add(new_word)
for suffix in suffixes:
    new_word= word+ suffix
    if new_word in word_list:
        valid.add(new_word)
print("Valid words with prefixes/suffixes: ", valid)

Valid words with prefixes/suffixes:  {'comforter', 'recomfort', 'discomfort', 'comforting', 'uncomfort'}


In [15]:

from nltk.corpus import brown, stopwords
import string

# Download required NLTK datasets
nltk.download('brown')
nltk.download('stopwords')

# Define stopwords and punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Extract words from the Brown corpus
words = brown.words()

# Filter words: Remove stopwords and punctuation
filtered_words = [word.lower() for word in words if word.lower() not in stop_words and word not in punctuation]

# Extract unique words
unique_words = set(filtered_words)

# Compute word length statistics
word_lengths = [len(word) for word in unique_words]
average_length = sum(word_lengths) / len(word_lengths) if word_lengths else 0
min_length = min(word_lengths) if word_lengths else 0
max_length = max(word_lengths) if word_lengths else 0

# Output the statistics
print(f"Total unique words: {len(unique_words)}")
print(f"Average word length: {average_length:.2f}")
print(f"Minimum word length: {min_length}")
print(f"Maximum word length: {max_length}")


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total unique words: 49621
Average word length: 8.09
Minimum word length: 1
Maximum word length: 33


In [9]:
import nltk
from nltk.corpus import brown, treebank
from collections import Counter

nltk.download('brown')
nltk.download('treebank')

def prepare(corpus):
    return [[w.lower() for w in s] for s in corpus]

def build_ngram_model(sentences, n):
    ngram_counts, context_counts = Counter(), Counter()
    for sent in sentences:
        sent = ['<s>'] * (n - 1) + sent + ['</s>']
        for i in range(len(sent) - n + 1):
            ngram = tuple(sent[i:i + n])
            context = ngram[:-1]
            ngram_counts[ngram] += 1
            context_counts[context] += 1
    return ngram_counts, context_counts

def predict_next(ngram_counts, context_counts, context):
    context = tuple(context)
    candidates = [(ng[-1], count / context_counts[context])
                  for ng, count in ngram_counts.items() if ng[:-1] == context]
    return sorted(candidates, key=lambda x: -x[1])[:3]

# Load and preprocess
brown_sents = prepare(brown.sents())
wsj_sents = prepare(treebank.sents())

# Build models
big_brown, ctx_big_brown = build_ngram_model(brown_sents, 2)
tri_brown, ctx_tri_brown = build_ngram_model(brown_sents, 3)
big_wsj, ctx_big_wsj = build_ngram_model(wsj_sents, 2)
tri_wsj, ctx_tri_wsj = build_ngram_model(wsj_sents, 3)

# Predict
def run_predict(text):
    words = text.lower().split()
    print(f"Input: {text}\n")
    
    if len(words) >= 1:
        ctx = (words[-1],)
        print("Bigram Predictions:")
        print("Brown:", predict_next(big_brown, ctx_big_brown, ctx))
        print("WSJ  :", predict_next(big_wsj, ctx_big_wsj, ctx))
        
    if len(words) >= 2:
        ctx = (words[-2], words[-1])
        print("\nTrigram Predictions:")
        print("Brown:", predict_next(tri_brown, ctx_tri_brown, ctx))
        print("WSJ  :", predict_next(tri_wsj, ctx_tri_wsj, ctx))

# Example usage
run_predict("in the")


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Input: in the

Bigram Predictions:
Brown: [('first', 0.009461062440153777), ('same', 0.008975146846550713), ('most', 0.005959611839190522)]
WSJ  : [('company', 0.0327455919395466), ('u.s.', 0.02392947103274559), ('new', 0.014693534844668345)]

Trigram Predictions:
Brown: [('world', 0.014937759336099586), ('first', 0.014605809128630706), ('united', 0.012282157676348548)]
WSJ  : [('u.s.', 0.07435897435897436), ('past', 0.041025641025641026), ('first', 0.02564102564102564)]


In [10]:
import nltk
from nltk.corpus import brown, treebank
from collections import Counter

nltk.download('brown')
nltk.download('treebank')

def build_model(corpus, n):
    model, context = Counter(), Counter()
    prepped = []
    for sent in corpus:
        sent = ['<s>']*(n-1) + [w.lower() for w in sent] + ['</s>']
        prepped.append(sent)
        for i in range(len(sent)-n+1):
            ng = tuple(sent[i:i+n])
            model[ng] += 1
            context[ng[:-1]] += 1
    return model, context, prepped

def predict(model, context, ctx):
    ctx = tuple(ctx)
    candidates = [(w[-1], c / context[ctx]) for w, c in model.items() if w[:-1] == ctx]
    return sorted(candidates, key=lambda x: -x[1])[:3]

def find_sentences(corpus, phrase, next_word):
    out = []
    for sent in corpus:
        for i in range(len(sent) - len(phrase)):
            if sent[i:i+len(phrase)] == phrase and i + len(phrase) < len(sent):
                if sent[i + len(phrase)] == next_word:
                    out.append(' '.join(sent))
                    break
        if len(out) == 10: break
    return out

# Build models
b2, cb2, brown2 = build_model(brown.sents(), 2)
b3, cb3, brown3 = build_model(brown.sents(), 3)
w2, cw2, wsj2 = build_model(treebank.sents(), 2)
w3, cw3, wsj3 = build_model(treebank.sents(), 3)

def run(text):
    words = text.lower().split()
    print(f"\nInput: {text}")
    
    if len(words) >= 1:
        ctx = [words[-1]]
        print("\nBigram Predictions:")
        for label, model, ctxs, corpus in [("Brown", b2, cb2, brown2), ("WSJ", w2, cw2, wsj2)]:
            pred = predict(model, ctxs, ctx)
            print(f"{label}:", pred)
            if pred:
                top = pred[0][0]
                matches = find_sentences(corpus, ctx, top)
                print(f"Top 10 {label} sentences with '{text} {top}':")
                for i, s in enumerate(matches, 1): print(f"{i}. {s}")
                if not matches: print("No matching sentences found.")

    if len(words) >= 2:
        ctx = words[-2:]
        print("\nTrigram Predictions:")
        for label, model, ctxs, corpus in [("Brown", b3, cb3, brown3), ("WSJ", w3, cw3, wsj3)]:
            pred = predict(model, ctxs, ctx)
            print(f"{label}:", pred)
            if pred:
                top = pred[0][0]
                matches = find_sentences(corpus, ctx, top)
                print(f"Top 10 {label} sentences with '{text} {top}':")
                for i, s in enumerate(matches, 1): print(f"{i}. {s}")
                if not matches: print("No matching sentences found.")

# Example usage
run("in the")



[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!



Input: in the

Bigram Predictions:
Brown: [('first', 0.009461062440153777), ('same', 0.008975146846550713), ('most', 0.005959611839190522)]
Top 10 Brown sentences with 'in the first':
1. <s> meanwhile , it was learned the state highway department is very near being ready to issue the first $30 million worth of highway reconstruction bonds . </s>
2. <s> operating budget for the day schools in the five counties of dallas , harris , bexar , tarrant and el paso would be $451,500 , which would be a savings of $157,460 yearly after the first year's capital outlay of $88,000 was absorbed , parkhouse told the senate . </s>
3. <s> the president spent much of the week-end at his summer home on cape cod writing the first drafts of portions of the address with the help of white house aids in washington with whom he talked by telephone . </s>
4. <s> the social security payroll tax is now 6 per cent -- 3 per cent on each worker and employer -- on the first $4,800 of pay per year . </s>
5. <s> offic

In [13]:
import nltk
from nltk.corpus import brown
from collections import defaultdict

# Download necessary data
nltk.download('brown')
nltk.download('universal_tagset')

# Load tagged sentences and initialize counts
tagged_sentences = brown.tagged_sents(tagset='universal')
emission_counts = defaultdict(int)
transition_counts = defaultdict(lambda: defaultdict(int))
tag_counts = defaultdict(int)

# Count emissions and transitions
for sentence in tagged_sentences:
    prev_tag = None
    for word, tag in sentence:
        emission_counts[(word.lower(), tag)] += 1
        tag_counts[tag] += 1
        if prev_tag:
            transition_counts[prev_tag][tag] += 1
        prev_tag = tag

# Normalize counts to probabilities
def normalize(counts, total_counts=None):
    probs = {}
    for k, v in counts.items():
        total = total_counts.get(k, sum(v.values())) if total_counts else sum(v.values())
        probs[k] = {k2: c / total for k2, c in v.items()} if isinstance(v, defaultdict) else v / total
    return probs

emission_probs = normalize(emission_counts)
transition_probs = normalize(transition_counts, tag_counts)

# POS tagging for user input sentence
def tag_sentence(sentence):
    words = sentence.split()
    tags, prev_tag = [], 'NOUN'
    for word in words:
        word_lower = word.lower()
        emission_prob = emission_probs.get((word_lower, 'NOUN'), 1e-6)
        transition_prob = transition_probs.get(prev_tag, {}).get('NOUN', 1e-6)
        best_tag = max(emission_probs.get(word_lower, {}), key=lambda tag: emission_prob * transition_prob)
        tags.append((word, best_tag))
        prev_tag = best_tag
    return " ".join([f"{word}/{tag}" for word, tag in tags])

# Example usage
user_input = input("Enter a sentence: ")
print(tag_sentence(user_input))


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


AttributeError: 'int' object has no attribute 'values'