In [2]:
import nltk
from nltk.corpus import indian
from nltk.tag import tnt
import string

# Downloading Indian Languages Corpora which consists Hindi, Bangla, Marathi and Telugu corpus respectively
nltk.download("indian")
nltk.download("punkt")

# Training the POS Tagger Model using Hindi dataset
def train():
    taggedSet = "hindi.pos"
    wordSet = indian.sents(taggedSet)
    count = 0
    
    # Joining dataset words to form a sentence
    for sen in wordSet:
        count += 1
        sen = " ".join([
            " " + i if not i.startswith("\"") and i not in string.punctuation else i
            for i in sen
        ]).strip()
    
    # Total Sentence Count
    print("Total sentences in the tagged file are", count)
    
    # Splitting dataset into Training Data and Test Data
    trainPerc = 0.9
    trainRows = int(trainPerc * count)
    testRows = trainRows + 1
    
    # Slicing the corpus
    data = indian.tagged_sents(taggedSet)
    train_data = data[:trainRows]
    test_data = data[testRows:]
    
    # Stats
    print("Training dataset length: ", len(train_data))
    print("Testing dataset length: ", len(test_data))
    
    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    print("Accuracy: ", pos_tagger.accuracy(test_data))
    
    return pos_tagger

# Tagging function to tag all words in a sentence
def tagger(pos_tagger, sentenceToBeTagged):
    tokenized = nltk.word_tokenize(sentenceToBeTagged)
    return pos_tagger.tag(tokenized)

# Main Driving Module
if __name__ == "__main__":
    pos_tagger = train()
    sentence_to_be_tagged = "भारत वह महान देश है जहां अनेक प्रकार के योद्धा, देश प्रेमी जन्म लिए और अपनी आखिरी सांस तक देश के लिए लड़ते रहे।"
    print(tagger(pos_tagger, sentence_to_be_tagged))

[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package indian is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total sentences in the tagged file are 540
Training dataset length:  486
Testing dataset length:  53
Accuracy:  0.8111964873765093
[('भारत', 'NNP'), ('वह', 'PRP'), ('महान', 'JJ'), ('देश', 'NN'), ('है', 'VFM'), ('जहां', 'NLOC'), ('अनेक', 'QF'), ('प्रकार', 'NN'), ('के', 'PREP'), ('योद्धा', 'Unk'), (',', 'PUNC'), ('देश', 'NN'), ('प्रेमी', 'Unk'), ('जन्म', 'Unk'), ('लिए', 'PREP'), ('और', 'CC'), ('अपनी', 'PRP'), ('आखिरी', 'Unk'), ('सांस', 'Unk'), ('तक', 'PREP'), ('देश', 'NN'), ('के', 'PREP'), ('लिए', 'PREP'), ('लड़ते', 'Unk'), ('रहे।', 'Unk')]
