In [2]:
%pip install numpy
import numpy as np
from collections import defaultdict, Counter

class HMMPoSTagger:
    """Bigram hidden-Markov-model part-of-speech tagger.

    The model is estimated from tagged sentences by relative frequency and
    stored as natural-log probabilities:

    * ``transition_probs[prev_tag][tag]`` -- log P(tag | prev_tag), where the
      first tag of every sentence follows the pseudo-tag ``'<START>'``.
    * ``emission_probs[tag][word]`` -- log P(word | tag).

    Only events observed during training are stored in those tables. At
    decode time any missing transition or emission is scored with
    ``unseen_log_prob`` (a finite floor) instead of ``-inf``, so sentences
    containing unknown words or unseen tag bigrams still receive a complete
    tag sequence.
    """

    START_TAG = '<START>'

    def __init__(self, unseen_prob=1e-10):
        """Create an untrained tagger.

        Args:
            unseen_prob: Probability assigned to any transition or emission
                that never occurred in the training data. Must satisfy
                ``0 < unseen_prob <= 1``.

        Raises:
            ValueError: If ``unseen_prob`` is outside ``(0, 1]``.
        """
        if not 0 < unseen_prob <= 1:
            raise ValueError("unseen_prob must be in the interval (0, 1]")

        self.transition_probs = defaultdict(lambda: defaultdict(float))
        self.emission_probs = defaultdict(lambda: defaultdict(float))
        self.tag_counts = defaultdict(int)
        self.tags = set()
        self.unseen_log_prob = float(np.log(unseen_prob))

        # Raw counts live apart from the log-probability tables so that
        # train() can be called repeatedly without adding counts to logs.
        self._transition_counts = defaultdict(Counter)
        self._emission_counts = defaultdict(Counter)

    def train(self, tagged_sentences):
        """Update the model with tagged sentences.

        Counts accumulate across calls, so calling ``train`` again extends
        the model with the new data; the log-probability tables are then
        recomputed from the accumulated counts.

        Args:
            tagged_sentences: Iterable of sentences, each an iterable of
                ``(word, tag)`` pairs.
        """
        for sentence in tagged_sentences:
            prev_tag = self.START_TAG
            for word, tag in sentence:
                self._transition_counts[prev_tag][tag] += 1
                self._emission_counts[tag][word] += 1
                self.tag_counts[tag] += 1
                self.tags.add(tag)
                prev_tag = tag

        # Maximum-likelihood estimates in log space. Counts only ever grow,
        # so overwriting every counted entry refreshes the whole table.
        for prev_tag, followers in self._transition_counts.items():
            total = sum(followers.values())
            row = self.transition_probs[prev_tag]
            for tag, count in followers.items():
                row[tag] = np.log(count / total)

        for tag, word_counts in self._emission_counts.items():
            total = self.tag_counts[tag]
            row = self.emission_probs[tag]
            for word, count in word_counts.items():
                row[word] = np.log(count / total)

    def _log_transition(self, prev_tag, tag):
        """Return log P(tag | prev_tag), floored for unseen transitions."""
        # .get() rather than indexing: indexing a defaultdict would insert
        # empty rows into the model while decoding.
        row = self.transition_probs.get(prev_tag)
        if row is None:
            return self.unseen_log_prob
        return row.get(tag, self.unseen_log_prob)

    def _log_emission(self, tag, word):
        """Return log P(word | tag), floored for unseen emissions."""
        row = self.emission_probs.get(tag)
        if row is None:
            return self.unseen_log_prob
        return row.get(word, self.unseen_log_prob)

    def viterbi(self, words):
        """Find the most probable tag sequence with the Viterbi algorithm.

        Args:
            words: Iterable of tokens to tag.

        Returns:
            A list of ``(word, tag)`` pairs, one per input word; an empty
            list when ``words`` is empty.

        Raises:
            ValueError: If ``words`` is non-empty and the tagger has not
                been trained (no tags are known).
        """
        words = list(words)
        if not words:
            return []
        if not self.tags:
            raise ValueError("cannot tag: the tagger has not been trained")

        # A fixed tag order makes tie-breaking reproducible; iterating the
        # set directly would depend on string hash randomization.
        tags = sorted(self.tags, key=str)

        # scores[t][tag]: best log-probability of any path ending in `tag`
        # at position t. pointers[t][tag]: predecessor tag on that path.
        scores = [{
            tag: (self._log_transition(self.START_TAG, tag)
                  + self._log_emission(tag, words[0]))
            for tag in tags
        }]
        pointers = [{}]

        for word in words[1:]:
            previous = scores[-1]
            current = {}
            back = {}
            for tag in tags:
                emit = self._log_emission(tag, word)
                best_prev = tags[0]
                best_score = (previous[best_prev]
                              + self._log_transition(best_prev, tag) + emit)
                for prev_tag in tags[1:]:
                    score = (previous[prev_tag]
                             + self._log_transition(prev_tag, tag) + emit)
                    if score > best_score:
                        best_score = score
                        best_prev = prev_tag
                current[tag] = best_score
                back[tag] = best_prev
            scores.append(current)
            pointers.append(back)

        # Backtrack from the best final tag to the start of the sentence.
        final_scores = scores[-1]
        best_path = [max(tags, key=lambda tag: final_scores[tag])]
        for back in reversed(pointers[1:]):
            best_path.append(back[best_path[-1]])
        best_path.reverse()

        return list(zip(words, best_path))

# Toy training corpus: every sentence is a list of (word, tag) pairs
sample_sentences = [
    [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')],
    [('A', 'DT'), ('dog', 'NN'), ('runs', 'VBZ')],
]

# Build the tagger and fit it on the toy corpus
tagger = HMMPoSTagger()
tagger.train(sample_sentences)

# Decode a new combination of known words and show the tagging
test_words = ['The', 'dog', 'sat']
result = tagger.viterbi(test_words)
print("Tagged result:", result)

Collecting numpy
  Downloading numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Downloading numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m26.4 MB/s[0m  [33m0:00:00[0mm0:00:01[0m0:01[0m
[?25hInstalling collected packages: numpy
[0mSuccessfully installed numpy-2.4.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Tagged result: [('The', 'DT'), ('dog', 'NN'), ('sat', 'VBD')]
