In [None]:
!pip install streamlit nltk



In [None]:
import nltk
import nltk.data # add this line to import data
from nltk.corpus import brown
from collections import defaultdict, Counter
import pickle

# Fetch the Brown corpus and the universal tagset mapping
# (the download only needs to happen once per environment).
nltk.download('brown')
nltk.download('universal_tagset')

# Build the training data: every sentence becomes a list of
# (lowercased word, universal tag) pairs.
tagged_sents = [
    [(token.lower(), pos) for token, pos in sentence]
    for sentence in brown.tagged_sents(tagset='universal')
]

# Count tables filled in by the pass below:
#   tags              - every tag seen in the corpus
#   start_counts      - how often each tag opens a sentence
#   emission_counts   - tag -> word -> occurrences
#   transition_counts - previous tag -> tag -> occurrences
tags = set()
start_counts = Counter()
emission_counts = defaultdict(Counter)
transition_counts = defaultdict(Counter)

# Single pass over the corpus. prev_tag is reset for every sentence, so
# transitions are never counted across sentence boundaries.
for sent in tagged_sents:
    prev_tag = None
    for word, tag in sent:
        tags.add(tag)
        emission_counts[tag][word] += 1
        if prev_tag is None:
            # No predecessor yet: this is the sentence-initial token.
            # (Assumes a tag is never None, which holds for string tags.)
            start_counts[tag] += 1
        else:
            transition_counts[prev_tag][tag] += 1
        prev_tag = tag

# Helper that rescales a table of counts into relative frequencies
def normalize(counter):
    """Divide every count in ``counter`` by the sum of all counts.

    Args:
        counter: Mapping from key to numeric count (for example a ``Counter``).

    Returns:
        A plain ``dict`` with the same keys in the same order, each value
        being that key's count divided by the total. An empty mapping yields
        an empty dict.
    """
    denominator = sum(counter.values())
    probabilities = {}
    for key, count in counter.items():
        probabilities[key] = count / denominator
    return probabilities

# Turn the raw counts into probability tables. The transition and emission
# tables are nested: the outer key is the conditioning tag and the inner dict
# maps the following tag (or the emitted word) to its relative frequency.
start_probs = normalize(start_counts)
transition_probs = {tag: normalize(counts) for tag, counts in transition_counts.items()}
emission_probs = {tag: normalize(counts) for tag, counts in emission_counts.items()}

# Save everything to a pickle file.
# `tags` is a set of strings, whose iteration order can differ from one
# interpreter run to the next; sorting gives the saved tag list a stable order
# so the pickled model is reproducible.
model = {
    "start_probs": start_probs,
    "transition_probs": transition_probs,
    "emission_probs": emission_probs,
    "tags": sorted(tags)
}

# NOTE(review): a later cell in this notebook writes the same file name but
# stores a 4-tuple instead of this dict, so running every cell replaces this
# file. Confirm which layout the consumer of hmm_model.pkl expects.
with open("hmm_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("✅ Model trained and saved to hmm_model.pkl")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


✅ Model trained and saved to hmm_model.pkl


In [None]:

import nltk
from nltk.corpus import brown
from collections import defaultdict, Counter
import pickle

# Make sure the corpus and the universal tagset mapping are available locally.
nltk.download('brown')
nltk.download('universal_tagset')

# Prepare the dataset: sentences as lists of (lowercased word, universal tag).
tagged_sents = [
    [(token.lower(), pos) for token, pos in sentence]
    for sentence in brown.tagged_sents(tagset='universal')
]

# Inventory of every tag that occurs anywhere in the corpus.
tags = {pos for sentence in tagged_sents for _, pos in sentence}

# Count occurrences
#   start_counts      - tag of the first token of each sentence
#   emission_counts   - tag -> word -> occurrences
#   transition_counts - previous tag -> tag -> occurrences
start_counts = Counter()
emission_counts = defaultdict(Counter)
transition_counts = defaultdict(Counter)

for sent in tagged_sents:
    # Reset per sentence so no transition is counted across a sentence boundary.
    prev_tag = None
    for i, (word, tag) in enumerate(sent):
        emission_counts[tag][word] += 1
        if i:
            # Any token after the first has a predecessor within the sentence.
            # (Assumes a tag is never None, which holds for string tags.)
            transition_counts[prev_tag][tag] += 1
        else:
            start_counts[tag] += 1
        prev_tag = tag

# Convert counts to probabilities
def normalize(counter):
    """Rescale a mapping of counts into relative frequencies.

    NOTE(review): an earlier cell of this notebook defines a function with
    the same name; this definition replaces it once this cell has run.

    Args:
        counter: Mapping from key to numeric count (for example a ``Counter``).

    Returns:
        A plain ``dict`` with the same keys in the same order, each value
        being the count divided by the (float) sum of all counts. An empty
        mapping yields an empty dict.
    """
    grand_total = float(sum(counter.values()))
    scaled = ((key, val / grand_total) for key, val in counter.items())
    return dict(scaled)

# Convert each count table into relative frequencies. The transition and
# emission tables are nested: the outer key is the conditioning tag and the
# inner dict maps the following tag (or the emitted word) to its probability.
transition_probs = {tag: normalize(counts) for tag, counts in transition_counts.items()}
emission_probs = {tag: normalize(counts) for tag, counts in emission_counts.items()}
start_probs = normalize(start_counts)

# Save the model as a 4-tuple:
#   (transition_probs, emission_probs, start_probs, tag list)
# `tags` is a set of strings, whose iteration order can differ from one
# interpreter run to the next; sorting gives the saved tag list a stable order
# so the pickled model is reproducible.
# NOTE(review): an earlier cell in this notebook writes the same file name as a
# dict keyed by "start_probs"/"transition_probs"/"emission_probs"/"tags"; this
# cell overwrites it with the tuple layout. Confirm which layout the consumer
# of hmm_model.pkl expects.
with open("hmm_model.pkl", "wb") as f:
    pickle.dump((transition_probs, emission_probs, start_probs, sorted(tags)), f)

print("Model saved as hmm_model.pkl")


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Model saved as hmm_model.pkl
