In [9]:
# Download required NLTK data
import nltk
import ssl

# Handle SSL certificate issues if any
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download required resources
nltk.download('punkt', quiet=False)
nltk.download('stopwords', quiet=False)
nltk.download('averaged_perceptron_tagger', quiet=False)

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 

stop_words = set(stopwords.words('english')) 
txt = "The Natural Language Toolkit NLTK is a platform used for building programs for text analysis. One of the more powerful aspects of the NLTK module is the Part of Speech tagging." 
tokenized = sent_tokenize(txt) 
for i in tokenized: 
    wordsList = nltk.word_tokenize(i) 
    wordsList = [w for w in wordsList if not w in stop_words] 
    tagged = nltk.pos_tag(wordsList) 
    print(tagged)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/codespace/nltk_data'
    - '/usr/local/python/3.12.1/nltk_data'
    - '/usr/local/python/3.12.1/share/nltk_data'
    - '/usr/local/python/3.12.1/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [10]:
import numpy as np
from collections import defaultdict, Counter

class HMMPoSTagger:
    def __init__(self):
        self.transition_probs = defaultdict(lambda: defaultdict(float))
        self.emission_probs = defaultdict(lambda: defaultdict(float))
        self.tag_counts = defaultdict(int)
        self.tags = set()
        
    def train(self, tagged_sentences):
        """Train HMM tagger on tagged sentences"""
        # Count transitions and emissions
        for sentence in tagged_sentences:
            prev_tag = '<START>'
            for word, tag in sentence:
                # Transition probabilities
                self.transition_probs[prev_tag][tag] += 1
                # Emission probabilities
                self.emission_probs[tag][word] += 1
                self.tag_counts[tag] += 1
                self.tags.add(tag)
                prev_tag = tag
        
        # Convert counts to probabilities with smoothing
        for prev_tag in self.transition_probs:
            total = sum(self.transition_probs[prev_tag].values())
            for tag in self.transition_probs[prev_tag]:
                self.transition_probs[prev_tag][tag] = np.log(self.transition_probs[prev_tag][tag] / total)
        
        for tag in self.emission_probs:
            total = self.tag_counts[tag]
            for word in self.emission_probs[tag]:
                self.emission_probs[tag][word] = np.log(self.emission_probs[tag][word] / total)
    
    def viterbi(self, words):
        """Viterbi decoding algorithm"""
        n = len(words)
        viterbi = defaultdict(lambda: defaultdict(float))
        backpointer = defaultdict(lambda: defaultdict(str))
        
        # Initialize
        for tag in self.tags:
            emit_prob = self.emission_probs[tag].get(words[0], -float('inf'))
            trans_prob = self.transition_probs['<START>'].get(tag, -float('inf'))
            viterbi[0][tag] = trans_prob + emit_prob
        
        # Forward pass
        for t in range(1, n):
            for tag in self.tags:
                emit_prob = self.emission_probs[tag].get(words[t], -float('inf'))
                max_prob = -float('inf')
                best_prev_tag = None
                
                for prev_tag in self.tags:
                    trans_prob = self.transition_probs[prev_tag].get(tag, -float('inf'))
                    prob = viterbi[t-1][prev_tag] + trans_prob + emit_prob
                    if prob > max_prob:
                        max_prob = prob
                        best_prev_tag = prev_tag
                
                viterbi[t][tag] = max_prob
                backpointer[t][tag] = best_prev_tag
        
        # Backtrack
        best_path = []
        last_tag = max(self.tags, key=lambda tag: viterbi[n-1][tag])
        best_path.append(last_tag)
        
        for t in range(n-1, 0, -1):
            last_tag = backpointer[t][last_tag]
            best_path.append(last_tag)
        
        best_path.reverse()
        return list(zip(words, best_path))

# Create and train the tagger (example with sample data)
tagger = HMMPoSTagger()

# Sample tagged training data
sample_sentences = [
    [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')],
    [('A', 'DT'), ('dog', 'NN'), ('runs', 'VBZ')]
]

tagger.train(sample_sentences)

# Tag new sentence
test_words = ['The', 'dog', 'sat']
result = tagger.viterbi(test_words)
print("Tagged result:", result)

ModuleNotFoundError: No module named 'numpy'