In [9]:
import nltk
# nltk utils
nltk.download('treebank') # download Penn treebank -> POS tagging
nltk.download('punkt_tab') # punkt_tab -> tokenize, preprocess sentences

# nltk libs
import nltk
from nltk.probability import (
    LaplaceProbDist,
    WittenBellProbDist,
    LidstoneProbDist,
    SimpleGoodTuringProbDist,
)
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.tag import UnigramTagger
import numpy as np

import re

def normalize_corpus(corpus):
    def normalize_token(token):
        if re.fullmatch(r'\d+', token):
            return '<NUM>'
        return token.lower()
    
    return [
        [normalize_token(word) for word in sentence]
        for sentence in corpus
    ]




[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\gbaon\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gbaon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
tagged_sents = nltk.corpus.treebank.tagged_sents()
tagged_sents = list(tagged_sents)
import random
random.seed(42)
random.shuffle(tagged_sents)

split = int(len(tagged_sents) * 0.8)


train_sents = tagged_sents[:split]
test_sents = tagged_sents[split:]

traintext = [[word for word, tag in sent] for sent in train_sents]
trainlabel = [[tag for word, tag in sent] for sent in train_sents]
testtext = [[word for word, tag in sent] for sent in test_sents]
testlabel = [[tag for word, tag in sent] for sent in test_sents]

traintext = normalize_corpus(traintext)
testtext = normalize_corpus(testtext)


In [11]:
hmm = HiddenMarkovModelTrainer().train_supervised(
    train_sents, estimator=WittenBellProbDist
)

acc = hmm.accuracy(test_sents)
print(f"Evaluation accuracy: {acc}")

Evaluation accuracy: 0.9282625286186721


In [12]:
from nltk.probability import LidstoneProbDist

def lidstone_estimator(freqdist, bins):
    return LidstoneProbDist(freqdist, 0.01, bins=bins)

# def run(traintext, trainlabel, testtext):
#     train_sents = [[(word, tag) for word, tag in zip(traintext[i], trainlabel[i])] for i in range(len(traintext))]
#     print(train_sents[0])

#     hmm = HiddenMarkovModelTrainer().train_supervised(train_sents, estimator=LaplaceProbDist)

#     results = []
#     for sent in testtext:
#         tagged_sent = hmm.tag(sent)
#         words = [t[0] for t in tagged_sent]
#         tags = [t[1] for t in tagged_sent]
#         results.append(tags)
#     return results

from btth3_22520109 import run

# def run(traintext, trainlabel, testtext):
#     train_sents = [
#         [(word, tag) for word, tag in zip(traintext[i], trainlabel[i])]
#         for i in range(len(traintext))
#     ]

#     unigram_tagger = nltk.UnigramTagger(train_sents)

#     results = [

#         [tag for _, tag in unigram_tagger.tag(sent)]
#         for sent in testtext
#     ]

#     return results
testpred = run(traintext, trainlabel, testtext)



In [13]:
hmm_testpred = hmm.tag_sents(testtext)
hmm_testpred = [[tag[1] for tag in sent] for sent in hmm_testpred]
hmm_testpred[0]

['NNS', 'VBD', 'RB', 'VBN', '-NONE-', '.']

In [14]:
# calculate accuracy 
def calculate_accuracy(predictions, labels):
    correct = 0
    total = 0
    for pred, label in zip(predictions, labels):
        total += len(label)
        correct += sum(p == l for p, l in zip(pred, label))
    return correct / total if total > 0 else 0

# acc for hmm_testpred
accuracy = calculate_accuracy(hmm_testpred, testlabel)
print(f"Acc for hmm_testpred: {accuracy:.4f}")

# acc for testpred
accuracy = calculate_accuracy(testpred, testlabel)
print(f"Acc for testpred: {accuracy:.4f}")



Acc for hmm_testpred: 0.8430
Acc for testpred: 0.9012


In [15]:
from nltk.corpus import treebank
vocab = set(w.lower() for w, t in treebank.tagged_words())
print(len(vocab))
tags = set(t for w, t in treebank.tagged_words())
print(len(tags), tags)

11387
46 {'PRP$', 'RP', 'VBD', 'WRB', 'SYM', ',', 'RB', 'WDT', '-RRB-', 'WP', 'WP$', 'FW', 'UH', 'JJS', 'VBP', 'NNPS', '-LRB-', 'EX', 'PRP', '.', '$', ':', 'MD', 'VBN', 'CC', '-NONE-', "''", 'TO', 'RBS', 'LS', 'CD', 'VBG', 'VBZ', 'VB', 'NNS', 'JJR', 'PDT', 'NNP', 'DT', 'JJ', 'NN', 'POS', '#', '``', 'RBR', 'IN'}


In [16]:
hmm.tag(['The', 'dog', 'barked', 'loudly', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])

[('The', 'DT'),
 ('dog', 'NNP'),
 ('barked', 'NNP'),
 ('loudly', 'NNP'),
 ('1', 'CD'),
 ('2', 'CD'),
 ('3', 'CD'),
 ('4', 'CD'),
 ('5', 'CD'),
 ('6', 'CD'),
 ('7', 'CD'),
 ('8', 'CD'),
 ('9', 'CD'),
 ('10', 'CD')]