In [1]:
import nltk
from nltk.corpus import indian
from nltk.tag import tnt
import re

# Fetch the NLTK resources this cell asks for. "indian" provides the
# corpus reader imported above.
# NOTE(review): "punkt" is not referenced by the code visible in this cell —
# confirm whether it is still needed.
for _resource in ("indian", "punkt"):
    nltk.download(_resource)

def heuristic_tagger(word):
    """Guess a POS tag for a single token using surface-form rules.

    Rules are tried in order and the first match wins:

    1. the tokens "।" and "|"               -> "SYM"
    2. suffix "ना", "ता", "ती" or "ते"       -> "VM"
    3. a fixed list of auxiliary forms      -> "VAUX"
    4. a fixed list of conjunctions         -> "CC"
    5. only ASCII or Devanagari digits      -> "QC"
    6. anything else                        -> "NN"

    Args:
        word: The token to classify.

    Returns:
        The tag string chosen by the first matching rule.
    """
    if word in ("।", "|"):
        return "SYM"

    # The suffix test runs before the closed-class lists, as in the rule
    # order documented above.
    if word.endswith(("ना", "ता", "ती", "ते")):
        return "VM"

    if word in ("है", "थे", "था", "थी", "रहा", "रही", "रहे"):
        return "VAUX"

    if word in ("और", "या", "लेकिन"):
        return "CC"

    if re.match(r"^[0-9०-९]+$", word):
        return "QC"

    return "NN"

# Token pattern, alternatives tried left to right:
#   1. a run of Devanagari code points, *excluding* the danda (U+0964) and
#      double danda (U+0965) so that they are not glued onto the preceding
#      word;
#   2. a run of digits;
#   3. any single character that is neither a word character nor whitespace
#      (this is where "।", "॥", "|" and "," end up).
_TOKEN_PATTERN = re.compile(r"[\u0900-\u0963\u0966-\u097F]+|\d+|[^\w\s]")


def custom_tokenize(text):
    """Split text into Devanagari words, digit runs and punctuation marks.

    The danda "।" and double danda "॥" live inside the Devanagari Unicode
    block, so a plain ``[\\u0900-\\u097F]+`` word class would swallow them
    ("गया।" would be one token). They are excluded from the word class here
    and are therefore returned as separate one-character tokens.

    Word characters that match none of the alternatives (for example Latin
    letters or "_") are skipped, because ``re.findall`` only returns matches.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance; empty for empty or
        whitespace-only input.
    """
    return _TOKEN_PATTERN.findall(text)

def train():
    """Train a TnT tagger on the "hindi.pos" tagged sentences.

    The sentences are split by position, without shuffling: the first 90%
    are used for training and the rest are held out. The sizes of both
    parts and the tagger's accuracy on the held-out part are printed.

    Returns:
        A ``(pos_tagger, test_data)`` tuple: the trained ``tnt.TnT``
        instance and the held-out sentences.
    """
    corpus_sents = indian.tagged_sents("hindi.pos")
    split_at = int(0.9 * len(corpus_sents))
    train_data, test_data = corpus_sents[:split_at], corpus_sents[split_at:]

    print("Training dataset length:", len(train_data))
    print("Testing dataset length:", len(test_data))

    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)

    baseline_accuracy = pos_tagger.accuracy(test_data)
    print("TnT Accuracy (no fallback):", baseline_accuracy)

    return pos_tagger, test_data

def tagger(pos_tagger, sentence):
    """Tag a raw sentence, replacing 'Unk' tags with heuristic guesses.

    The sentence is tokenized with ``custom_tokenize`` and passed to
    ``pos_tagger.tag``. Every token that comes back tagged ``'Unk'`` is
    re-tagged with ``heuristic_tagger``; all other tags are kept.

    Args:
        pos_tagger: Object whose ``tag(tokens)`` returns ``(word, tag)`` pairs.
        sentence: The raw sentence string.

    Returns:
        A list of ``(word, tag)`` tuples in token order.
    """
    return [
        (token, heuristic_tagger(token) if label == 'Unk' else label)
        for token, label in pos_tagger.tag(custom_tokenize(sentence))
    ]

def evaluate_with_fallback(pos_tagger, test_data):
    """Compute token-level accuracy with the heuristic fallback applied.

    Each sentence's words are tagged with ``pos_tagger.tag``. Any token
    tagged ``'Unk'`` is re-tagged with ``heuristic_tagger`` before being
    compared with the gold tag.

    Args:
        pos_tagger: Object whose ``tag(words)`` returns ``(word, tag)`` pairs.
        test_data: Iterable of sentences, each a sequence of
            ``(word, gold_tag)`` pairs.

    Returns:
        The fraction of tokens whose predicted tag equals the gold tag, or
        ``0.0`` when ``test_data`` contains no tokens (instead of raising
        ``ZeroDivisionError``).
    """
    total = 0
    correct = 0

    for sent in test_data:
        words = [w for w, _ in sent]
        gold_tags = [t for _, t in sent]

        predicted_tags = [
            heuristic_tagger(word) if tag == 'Unk' else tag
            for word, tag in pos_tagger.tag(words)
        ]

        total += len(gold_tags)
        correct += sum(
            1 for gold, predicted in zip(gold_tags, predicted_tags)
            if gold == predicted
        )

    # No tokens to score: report 0.0 rather than dividing by zero.
    if total == 0:
        return 0.0

    return correct / total

# Script entry point: train the tagger, tag one demo sentence, then report
# accuracy on the held-out sentences with the heuristic fallback applied.
if __name__ == "__main__":
    model, held_out = train()

    # Demo sentence
    demo_sentence = "भारत वह महान देश है जहां अनेक प्रकार के योद्धा, देश प्रेमी जन्म लिए और अपनी आखिरी सांस तक देश के लिए लड़ते रहे|"
    tagged_tokens = tagger(model, demo_sentence)

    print("\nTagged Sentence:")
    for token, label in tagged_tokens:
        print(token, "->", label)

    fallback_accuracy = evaluate_with_fallback(model, held_out)
    print("\nFinal Accuracy with fallback:", round(fallback_accuracy, 4))


[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package indian is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training dataset length: 486
Testing dataset length: 54
TnT Accuracy (no fallback): 0.8110749185667753

Tagged Sentence:
भारत -> NNP
वह -> PRP
महान -> JJ
देश -> NN
है -> VFM
जहां -> NLOC
अनेक -> QF
प्रकार -> NN
के -> PREP
योद्धा -> NN
, -> PUNC
देश -> NN
प्रेमी -> NN
जन्म -> NN
लिए -> PREP
और -> CC
अपनी -> PRP
आखिरी -> NN
सांस -> NN
तक -> PREP
देश -> NN
के -> PREP
लिए -> PREP
लड़ते -> VM
रहे -> VAUX
| -> SYM

Final Accuracy with fallback: 0.8317
