## a. Part of Speech (POS) tagging

In [46]:
text = "The curious cat quietly watched the small bird hop across the garden fence."

# tag-code defnitions for averaged_preceptron_tagger for clean output print
penn_treebank_tags = {
    'CC': 'Coordinating conjunction', 'CD': 'Cardinal number', 'DT': 'Determiner', 'EX': 'Existential there',
    'FW': 'Foreign word', 'IN': 'Preposition or subordinating conjunction', 'JJ': 'Adjective', 'JJR': 'Adjective, comparative',
    'JJS': 'Adjective, superlative', 'LS': 'List item marker', 'MD': 'Modal', 'NN': 'Noun, singular or mass',
    'NNS': 'Noun, plural', 'NNP': 'Proper noun, singular', 'NNPS': 'Proper noun, plural', 'PDT': 'Predeterminer',
    'POS': 'Possessive ending', 'PRP': 'Personal pronoun', 'PRP$': 'Possessive pronoun', 'RB': 'Adverb',
    'RBR': 'Adverb, comparative', 'RBS': 'Adverb, superlative', 'RP': 'Particle', 'SYM': 'Symbol', 'TO': 'to',
    'UH': 'Interjection', 'VB': 'Verb, base form', 'VBD': 'Verb, past tense', 'VBG': 'Verb, gerund or present participle',
    'VBN': 'Verb, past participle', 'VBP': 'Verb, non-3rd person singular present', 'VBZ': 'Verb, 3rd person singular present',
    'WDT': 'Wh-determiner', 'WP': 'Wh-pronoun', 'WP$': 'Possessive wh-pronoun', 'WRB': 'Wh-adverb',
    '.': 'Punctuation mark, sentence closer', ',': 'Punctuation mark, comma', ':': 'Punctuation mark, colon or ellipsis',
    '(': 'Punctuation mark, opening parenthesis', ')': 'Punctuation mark, closing parenthesis', '"': 'Quotation mark',
    "''": 'Closing quotation mark', "``": 'Opening quotation mark', '#': 'Symbol, number sign', '$': 'Symbol, dollar sign',
}

# dinctonary for user-defined mannual tagging
tagger_dict = {
    "Determinant":{"the","an", "a"},
    "Adjective":{"curious", "wise", "small"},
    "Noun":{"cat", "bird", "graden", "fence"},
    "Adverb":{"quietly", "fastly"},
    "Verb":{"watched", "hop"},
    "Preposition":{"across","on"},
    "Punctuation":{".", ","}
}

In [60]:
# Predefined Library: PerceptronTagger
import nltk


# TOKENIZATION
# download the package
nltk.download("punkt_tab")

# import the tokeniser function
from nltk.tokenize import word_tokenize

# finally tokenize the text
tokens = word_tokenize(text)


# POS TAGGING
# Note: dont do tagging after stemming, stemming might remove tagginf hint to tagger model
# download the package
nltk.download("averaged_perceptron_tagger")

# imports
from nltk.tag.perceptron import PerceptronTagger

# instantiate and tag
tagged = PerceptronTagger().tag(tokens)


# print output
print(f"Text:{text}\n")
print(f"{"Words":8s}{"POS":5s}{"Meaning"}\n{'='*20}")
for tag in tagged:
    print(f"{tag[0]:8s}{tag[1]:5s}{penn_treebank_tags[tag[1]]}")

[nltk_data] Downloading package punkt_tab to /home/div/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/div/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Text:The curious cat quietly watched the small bird hop across the garden fence.

Words   POS  Meaning
The     DT   Determiner
curious JJ   Adjective
cat     NN   Noun, singular or mass
quietly RB   Adverb
watched VBD  Verb, past tense
the     DT   Determiner
small   JJ   Adjective
bird    NN   Noun, singular or mass
hop     NN   Noun, singular or mass
across  IN   Preposition or subordinating conjunction
the     DT   Determiner
garden  NN   Noun, singular or mass
fence   NN   Noun, singular or mass
.       .    Punctuation mark, sentence closer


In [63]:
# User defined methods for POS Tagging
def get_tag(token):
    for pos in tagger_dict.keys():
        if token.lower() in tagger_dict[pos]:
            return pos

# Use tokenised text from above cell
print(f"{"Words":8s}{"POS"}\n{"="*20}")
for token in tokens:
    print(f"{token:8s}{get_tag(token)}")

Words   POS
The     Determinant
curious Adjective
cat     Noun
quietly Adverb
watched Verb
the     Determinant
small   Adjective
bird    Noun
hop     Verb
across  Preposition
the     Determinant
garden  None
fence   Noun
.       Punctuation


## b. Lemmatization

In [64]:
text = ""

In [None]:
import nltk

# TOKENIZATION

# 