<b>Simple script to train a Hidden Markov Model for part-of-speech tagging using NLTK</b>

In [None]:
#import the models
import nltk
from nltk import HiddenMarkovModelTagger as hmm # do not use nltk.tag.hmm
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
import warnings
import dill


warnings.filterwarnings('ignore')

<b>Download the data. Run only once</b>

In [None]:
# One-time downloads of the NLTK resources used below; uncomment and run once per environment.
# 'brown' is the corpus that is actually loaded, 'punkt' backs word_tokenize.
# NOTE(review): 'treebank' is not referenced by any visible cell - confirm it is still needed.
# NOTE(review): tagset='universal' presumably also needs nltk.download('universal_tagset') - confirm.
#nltk.download('treebank')
#nltk.download('punkt')
#nltk.download('brown')

<b>Prepare the data. We'll use the Brown corpus, an English corpus that includes part-of-speech tags. We split the data into training and testing sets. Try changing the size of the training set and see how the accuracy changes.</b>

In [None]:
# Number of leading sentences used for training; the remainder is the test set.
# Keeping the split index in one place guarantees the two slices never overlap or leave a gap.
TRAIN_SIZE = 50000

# Build the tagged-sentence view once and reuse it for the count and both slices.
tagged_sents = brown.tagged_sents(tagset='universal')

print(f'The number of tagged examples in the dataset is: {len(tagged_sents)}')
train_data = tagged_sents[:TRAIN_SIZE]
test_data = tagged_sents[TRAIN_SIZE:]

print(f'len of training data is {len(train_data)}')
print(f'len of testing data is {len(test_data)}')
# Show one training sentence: a list of (word, tag) pairs
print(train_data[0])

# Every distinct tag that occurs in the training split
unique_tags = {tag for sent in train_data for _, tag in sent}

print(unique_tags)

<b>Define the trainer and train the model</b>

In [None]:
# Fit the HMM tagger on the labelled training sentences (lists of (word, tag) pairs).
tagger = hmm.train(train_data, verbose=True)

In [None]:
# Evaluate the model's accuracy on the test data.
# The score is printed rounded to two decimal places.
accuracy = tagger.accuracy(test_data)
print(f"Accuracy: {accuracy:.2f}")

<b>Generate the list of true tags and the model's predictions to get more detailed stats on where the model performed well and where it did not</b>

In [None]:
# Gold tags, flattened across all test sentences
true_tags = [tag for sent in test_data for _, tag in sent]

# Strip the gold tags, let the HMM re-tag the bare words, then flatten its
# output the same way so both lists line up token by token.
untagged_test_sents = [[word for word, _ in sent] for sent in test_data]
hmm_tagged_sents = tagger.tag_sents(untagged_test_sents)
predicted_tags = [tag for sent in hmm_tagged_sents for _, tag in sent]

In [None]:
# Compute accuracy for each label
from collections import Counter

# Sorted so the report (and anything that reuses `labels`) has the same order
# on every run; iterating a bare set of strings is not stable across kernels.
labels = sorted(set(true_tags))

# One pass over the data instead of two full passes per label.
total_by_label = Counter(true_tags)
correct_by_label = Counter(t for t, p in zip(true_tags, predicted_tags) if t == p)

for label in labels:
    # "Correct" means the gold tag is `label` and the model also predicted `label`,
    # so the ratio below is the fraction of gold `label` tokens recovered.
    correct_predictions = correct_by_label[label]
    total_predictions = total_by_label[label]
    wrong_predictions = total_predictions - correct_predictions
    label_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

    print(f"Label: {label}")
    print(f"Correct Predictions: {correct_predictions}")
    print(f"Wrong Predictions: {wrong_predictions}")
    print(f"Accuracy: {label_accuracy:.2f}\n")

In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of gold vs. HMM-predicted tags, rows and columns ordered by `labels`
conf_matrix = confusion_matrix(true_tags, predicted_tags, labels=labels)

# Wrap it in a DataFrame so rows (true) and columns (predicted) carry the tag names
conf_matrix_df = pd.DataFrame(conf_matrix, index=labels, columns=labels)

print("Confusion Matrix:")
print(conf_matrix_df)

# Heatmap of the same table, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix_df, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

<b>If I'm happy with the model, I can save it for later usage</b>

In [None]:
# Save the trained model to a file
# Serialise the fitted HMM tagger to disk with dill
with open('hmm_tagger.pkl', 'wb') as model_file:
    dill.dump(tagger, model_file)

<b>You can load the model at any time to use it for tagging sentences</b>

In [None]:
# Load the trained model from the file
# Read the serialised HMM tagger back from 'hmm_tagger.pkl'.
# SECURITY: unpickling can execute arbitrary code - only load model files you trust.
with open('hmm_tagger.pkl', 'rb') as f:
    loaded_tagger = dill.load(f)

In [None]:
sentence = 'I took the train from Zurich to Italy last night'

# Split the raw string into word tokens, then let the reloaded HMM tag them
tokens = word_tokenize(sentence)
tagged_sentence = loaded_tagger.tag(tokens)

print(tagged_sentence)

<b> Let us try another model for the same task. How about a CRF (Conditional Random Field)? Will it perform better on a token classification task?</b>

In [None]:
from sklearn_crfsuite import metrics
import sklearn_crfsuite

In [None]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items are indexable with the word at index 0
    (e.g. ``(word, tag)`` pairs); only the words are read.

    The result always contains a constant bias, the lower-cased word, its last
    three and last two characters, and upper/title/digit flags. Case-based
    features of the previous and next word are added when those neighbours
    exist; otherwise ``'BOS'`` / ``'EOS'`` is set to ``True`` instead.
    """
    token = sent[i][0]

    # Features of the current token
    features = {'bias': 1.0}
    features['word.lower()'] = token.lower()
    features['word[-3:]'] = token[-3:]
    features['word[-2:]'] = token[-2:]
    features['word.isupper()'] = token.isupper()
    features['word.istitle()'] = token.istitle()
    features['word.isdigit()'] = token.isdigit()

    # Left context, or a beginning-of-sentence marker
    if i <= 0:
        features['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        features['-1:word.lower()'] = prev_token.lower()
        features['-1:word.istitle()'] = prev_token.istitle()
        features['-1:word.isupper()'] = prev_token.isupper()

    # Right context, or an end-of-sentence marker
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        features['+1:word.lower()'] = next_token.lower()
        features['+1:word.istitle()'] = next_token.istitle()
        features['+1:word.isupper()'] = next_token.isupper()

    return features

def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the second element (the label) of every ``(token, label)`` pair in ``sent``."""
    labels_out = []
    for _token, tag in sent:
        labels_out.append(tag)
    return labels_out

def sent2tokens(sent):
    """Return the first element (the token) of every ``(token, label)`` pair in ``sent``."""
    tokens_out = []
    for word, _tag in sent:
        tokens_out.append(word)
    return tokens_out


In [None]:
# Per-sentence feature dicts (X) and gold tag sequences (y) for the training split
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))

# Same transformation for the held-out split
X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

In [None]:
# Peek at the first training sentence: its feature dicts, then its tags
print(X_train[0], y_train[0], sep='\n')

In [None]:
# CRF trained with L-BFGS; c1 and c2 are the L1 and L2 regularisation
# coefficients, and training stops after at most 100 iterations.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
try:
    crf.fit(X_train, y_train)
except AttributeError as fit_error:
    # NOTE(review): presumably this guards against a sklearn-crfsuite /
    # scikit-learn version mismatch that raises AttributeError around fit() -
    # confirm against the installed versions.
    # Stay best-effort (do not abort the notebook) but surface the error instead
    # of swallowing it. print() is used because the notebook silences warnings.
    print(f"Warning: crf.fit raised AttributeError, continuing anyway: {fit_error!r}")

In [None]:
# Tag the held-out sentences with the CRF and report token-level accuracy
y_pred = crf.predict(X_test)
crf_accuracy = metrics.flat_accuracy_score(y_test, y_pred)
print(crf_accuracy)

In [None]:
# Serialise the fitted CRF to disk with dill
with open("crf_model.pkl", "wb") as model_file:
    dill.dump(crf, model_file)

In [None]:
# Read the serialised CRF back from "crf_model.pkl", rebinding `crf` to the loaded object.
# SECURITY: unpickling can execute arbitrary code - only load model files you trust.
with open("crf_model.pkl", "rb") as in_file:
    crf = dill.load(in_file)

In [None]:
def pos_tag(sentence):
    """Tag a raw sentence string with the module-level ``crf`` model.

    The string is tokenised with ``word_tokenize``, each token is paired with a
    ``None`` placeholder label so ``sent2features`` can consume it, and the CRF
    predicts one tag per token.

    Returns a list of ``(word, tag)`` tuples in sentence order.
    """
    tokens = word_tokenize(sentence)
    placeholder_pairs = [(token, None) for token in tokens]
    predicted = crf.predict_single(sent2features(placeholder_pairs))
    return list(zip(tokens, predicted))

# Quick check of the CRF tagger on a hand-written sentence
sentence = "This is a test sentence."
crf_tagged_sentence = pos_tag(sentence)
print(crf_tagged_sentence)

In [None]:
import seaborn as sns; sns.set()  # for plot styling


# Flatten the test and predicted labels lists
y_test_flat = [label for sentence in y_test for label in sentence]
y_pred_flat = [label for sentence in y_pred for label in sentence]

# Fix the tag order once, sorted, so the matrix rows/columns and the tick labels
# are guaranteed to agree and the figure looks the same on every run.
# The tags come from `unique_tags`; any label outside that set is left out of the matrix.
tag_order = sorted(unique_tags)

# Generate the confusion matrix (rows: actual tags, columns: predicted tags)
conf_matrix = confusion_matrix(y_test_flat, y_pred_flat, labels=tag_order)

# Plotting the confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=tag_order, yticklabels=tag_order)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')

plt.show()
