**Code Implementation of HMM for POS tagging**

In [2]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [4]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
import nltk
from nltk.corpus import brown
from nltk import FreqDist
from nltk.probability import ConditionalFreqDist

# Training the HMM POS Tagger
def train_hmm_tagger():
    # Get the tagged sentences from the Brown corpus
    tagged_sentences = brown.tagged_sents(tagset='universal')

    # Splitting into train and test datasets
    split_size = int(0.9 * len(tagged_sentences))
    train_sents = tagged_sentences[:split_size]
    test_sents = tagged_sentences[split_size:]

    # Create the emission and transition probability distributions
    emissions = ConditionalFreqDist()
    transitions = ConditionalFreqDist()

    for sentence in train_sents:
        previous_tag = None
        for (word, tag) in sentence:
            # Train emission probabilities
            emissions[tag][word] += 1

            # Train transition probabilities
            if previous_tag:
                transitions[previous_tag][tag] += 1
            previous_tag = tag

    # Calculate probabilities based on frequency distributions
    emission_probabilities = ConditionalFreqDist()
    transition_probabilities = ConditionalFreqDist()

    for tag in emissions.conditions():
        total_count = sum(emissions[tag].values())
        for word in emissions[tag]:
            emission_probabilities[tag][word] = emissions[tag][word] / total_count

    for tag in transitions.conditions():
        total_count = sum(transitions[tag].values())
        for next_tag in transitions[tag]:
            transition_probabilities[tag][next_tag] = transitions[tag][next_tag] / total_count

    return emission_probabilities, transition_probabilities

# Implement the HMM POS tagger
def hmm_pos_tagger(emission_probs, transition_probs, sentence):
    tags = list(emission_probs.keys())
    tagged_sequence = []

    for word in sentence:
        max_prob = 0
        assigned_tag = ''
        for tag in tags:
            if word in emission_probs[tag]:
                emission_prob = emission_probs[tag][word]
            else:
                emission_prob = 0.0001  # Assign a small probability for unknown words

            transition_prob = transition_probs[tag].freq(tag)
            current_prob = emission_prob * transition_prob

            if current_prob > max_prob:
                max_prob = current_prob
                assigned_tag = tag

        tagged_sequence.append((word, assigned_tag))

    return tagged_sequence

# Train the HMM POS tagger
emission_probabilities, transition_probabilities = train_hmm_tagger()

# Test the HMM POS tagger
test_sentence = "The quick brown fox jumps over the lazy dog"
tokenized_test_sentence = nltk.word_tokenize(test_sentence.lower())  # Lowercasing for simplicity
tagged_sequence = hmm_pos_tagger(emission_probabilities, transition_probabilities, tokenized_test_sentence)
print(tagged_sequence)


[('the', 'DET'), ('quick', 'X'), ('brown', 'X'), ('fox', 'X'), ('jumps', 'X'), ('over', 'PRT'), ('the', 'DET'), ('lazy', 'X'), ('dog', 'X')]
