<a href="https://colab.research.google.com/github/explorer-Sanjita/MiniProjects_LP5_LP6/blob/main/NLP_POS_Taggers_hindi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install nltk indic-nlp-library

import nltk
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.tag import UnigramTagger, RegexpTagger
from nltk.corpus import indian

# Download required corpora
nltk.download('indian')
nltk.download('punkt')

# Load tagged Hindi corpus
corpus = indian.tagged_sents('hindi.pos')

# Split into training and testing
train_data = corpus[:int(0.9 * len(corpus))]
test_data = corpus[int(0.9 * len(corpus)):]

# Rule-based pattern tagger
patterns = [
    (r'.*ता$', 'NN'),
    (r'.*ाएंगे$', 'VM'),
    (r'.*कर$', 'VM'),
    (r'.*ने$', 'PSP'),
    (r'^\d+$', 'CD'),
    (r'.*', 'NN')
]

regexp_tagger = RegexpTagger(patterns)

# Unigram tagger with fallback
unigram_tagger = UnigramTagger(train_data, backoff=regexp_tagger)

# HMM-based tagger
hmm_trainer = HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(train_data)

# Evaluate accuracy
print("Rule-Based (Unigram + Regexp) Accuracy:", unigram_tagger.evaluate(test_data))
print("HMM Tagger Accuracy:", hmm_tagger.evaluate(test_data))

# Use simple tokenizer for Hindi
example_sentence = "तुम्हारी मेहनत और ईमानदारी ही तुम्हारे भविष्य को उज्जवल बनाएगी।"
tokens = example_sentence.strip().split()

print("\nTagged by Rule-Based Tagger:")
print(unigram_tagger.tag(tokens))

print("\nTagged by HMM Tagger:")
print(hmm_tagger.tag(tokens))




[nltk_data] Downloading package indian to /root/nltk_data...
[nltk_data]   Package indian is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Rule-Based (Unigram + Regexp) Accuracy:", unigram_tagger.evaluate(test_data))
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("HMM Tagger Accuracy:", hmm_tagger.evaluate(test_data))


Rule-Based (Unigram + Regexp) Accuracy: 0.8099891422366993
HMM Tagger Accuracy: 0.26058631921824105

Tagged by Rule-Based Tagger:
[('तुम्हारी', 'NN'), ('मेहनत', 'NN'), ('और', 'CC'), ('ईमानदारी', 'NN'), ('ही', 'RP'), ('तुम्हारे', 'NN'), ('भविष्य', 'NN'), ('को', 'PREP'), ('उज्जवल', 'NN'), ('बनाएगी।', 'NN')]

Tagged by HMM Tagger:
[('तुम्हारी', 'JJ'), ('मेहनत', 'JJ'), ('और', 'JJ'), ('ईमानदारी', 'JJ'), ('ही', 'JJ'), ('तुम्हारे', 'JJ'), ('भविष्य', 'JJ'), ('को', 'JJ'), ('उज्जवल', 'JJ'), ('बनाएगी।', 'JJ')]
