In [6]:
import nltk
from collections import Counter

nltk.download('brown')
nltk.download('universal_tagset')

def build_hmm():
    sents = nltk.corpus.brown.tagged_sents(tagset='universal')
    tags = Counter()
    emits = Counter()
    trans = Counter()
    vocab = set()
    
    for sent in sents:
        prev = '<s>'
        for w, t in sent:
            w = w.lower()
            vocab.add(w)
            tags[t] += 1
            emits[(w, t)] += 1
            trans[(prev, t)] += 1
            prev = t
        trans[(prev, '</s>')] += 1
        tags['</s>'] += 1
    
    V, T = len(vocab), len(tags)
    emit_p = {(w, t): (c + 1) / (tags[t] + V) for (w, t), c in emits.items()}
    trans_p = {(t1, t2): (c + 1) / (tags[t1] + T) for (t1, t2), c in trans.items()}
    return emit_p, trans_p, tags, tags.most_common(1)[0][0]

def tag_viterbi(text, emit_p, trans_p, tags, default):
    words = text.lower().split()
    V, back = [{}], [{}]
    ts = [t for t in tags if t not in ['<s>', '</s>', '.', 'X']]  # Exclude punctuation and X
    
    for t in ts:
        ep = emit_p.get((words[0], t), 1 / (tags[t] + len(ts)))
        tp = trans_p.get(('<s>', t), 1 / (tags['<s>'] + len(ts)))
        V[0][t] = ep * tp
        back[0][t] = (None, ep, tp)
    
    for i in range(1, len(words)):
        V.append({})
        back.append({})
        for t in ts:
            max_score, best_prev, best_ep, best_tp = 0, None, 0, 0
            for p in ts:
                ep = emit_p.get((words[i], t), 1 / (tags[t] + len(ts)))
                tp = trans_p.get((p, t), 1 / (tags[p] + len(ts)))
                score = V[i-1][p] * ep * tp
                if score > max_score:
                    max_score, best_prev, best_ep, best_tp = score, p, ep, tp
            V[i][t] = max_score
            back[i][t] = (best_prev, best_ep, best_tp)
    
    t = max(V[-1], key=V[-1].get)
    result = [(words[-1], t, back[-1][t][1], back[-1][t][2])]
    for i in range(len(words)-1, 0, -1):
        t, ep, tp = back[i][t]
        result.append((words[i-1], t, ep, tp))
    return result[::-1]

emit_p, trans_p, tags, default = build_hmm()
print("Viterbi POS Tagging:")
for w, t, ep, tp in tag_viterbi("The sun shines bright", emit_p, trans_p, tags, default):
    print(f"{w}: {t} (Emission: {ep:.4e}, Transition: {tp:.4e})")

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Viterbi POS Tagging:
the: DET (Emission: 3.4422e-04, Transition: 6.2642e-01)
sun: NOUN (Emission: 2.1499e-05, Transition: 1.5881e-01)
shines: VERB (Emission: 6.1407e-04, Transition: 5.7512e-02)
bright: ADJ (Emission: 6.1407e-04, Transition: 5.7512e-02)
