# Experiment: 2

## a. Part of Speech (POS) tagging

In [3]:
text1 = "The curious cat quietly watched the small bird hop across the garden fence."

# tag-code defnitions for averaged_preceptron_tagger for clean output print
penn_treebank_tags = {
    'CC': 'Coordinating conjunction', 'CD': 'Cardinal number', 'DT': 'Determiner', 'EX': 'Existential there',
    'FW': 'Foreign word', 'IN': 'Preposition or subordinating conjunction', 'JJ': 'Adjective', 'JJR': 'Adjective, comparative',
    'JJS': 'Adjective, superlative', 'LS': 'List item marker', 'MD': 'Modal', 'NN': 'Noun, singular or mass',
    'NNS': 'Noun, plural', 'NNP': 'Proper noun, singular', 'NNPS': 'Proper noun, plural', 'PDT': 'Predeterminer',
    'POS': 'Possessive ending', 'PRP': 'Personal pronoun', 'PRP$': 'Possessive pronoun', 'RB': 'Adverb',
    'RBR': 'Adverb, comparative', 'RBS': 'Adverb, superlative', 'RP': 'Particle', 'SYM': 'Symbol', 'TO': 'to',
    'UH': 'Interjection', 'VB': 'Verb, base form', 'VBD': 'Verb, past tense', 'VBG': 'Verb, gerund or present participle',
    'VBN': 'Verb, past participle', 'VBP': 'Verb, non-3rd person singular present', 'VBZ': 'Verb, 3rd person singular present',
    'WDT': 'Wh-determiner', 'WP': 'Wh-pronoun', 'WP$': 'Possessive wh-pronoun', 'WRB': 'Wh-adverb',
    '.': 'Punctuation mark, sentence closer', ',': 'Punctuation mark, comma', ':': 'Punctuation mark, colon or ellipsis',
    '(': 'Punctuation mark, opening parenthesis', ')': 'Punctuation mark, closing parenthesis', '"': 'Quotation mark',
    "''": 'Closing quotation mark', "``": 'Opening quotation mark', '#': 'Symbol, number sign', '$': 'Symbol, dollar sign',
}

# dinctonary for user-defined mannual tagging
tagger_dict = {
    "Determinant":{"the","an", "a"},
    "Adjective":{"curious", "wise", "small"},
    "Noun":{"cat", "bird", "graden", "fence"},
    "Adverb":{"quietly", "fastly"},
    "Verb":{"watched", "hop"},
    "Preposition":{"across","on"},
    "Punctuation":{".", ","}
}

In [6]:
# Predefined Library: PerceptronTagger
import nltk


# TOKENIZATION
# download the package
nltk.download("punkt_tab")

# import the tokeniser function
from nltk.tokenize import word_tokenize

from nltk.tokenize import PunktTokenizer

# finally tokenize the text
tokens = word_tokenize(text1)


# POS TAGGING
# Note: dont do tagging after stemming, stemming might remove tagginf hint to tagger model
# download the package
nltk.download("averaged_perceptron_tagger")

# imports
from nltk.tag.perceptron import PerceptronTagger

# instantiate and tag
tagged = nltk.pos_tag(tokens)#PerceptronTagger().tag(tokens)
# print output
print(f"Original Text:{text1}\n")
print(f"{"Words":8s}{"POS":5s}{"Meaning"}\n{'='*20}")
for tag in tagged:
    print(f"{tag[0]:8s}{tag[1]:5s}{penn_treebank_tags[tag[1]]}")

[nltk_data] Downloading package punkt_tab to /home/div/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/div/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Original Text:The curious cat quietly watched the small bird hop across the garden fence.

Words   POS  Meaning
The     DT   Determiner
curious JJ   Adjective
cat     NN   Noun, singular or mass
quietly RB   Adverb
watched VBD  Verb, past tense
the     DT   Determiner
small   JJ   Adjective
bird    NN   Noun, singular or mass
hop     NN   Noun, singular or mass
across  IN   Preposition or subordinating conjunction
the     DT   Determiner
garden  NN   Noun, singular or mass
fence   NN   Noun, singular or mass
.       .    Punctuation mark, sentence closer


In [3]:
# User defined methods for POS Tagging
def get_tag(token):
    for pos in tagger_dict.keys():
        if token.lower() in tagger_dict[pos]:
            return pos

# Use tokenised text from above cell
print(f"{"Words":8s}{"POS"}\n{"="*20}")
for token in tokens:
    print(f"{token:8s}{get_tag(token)}")

Words   POS
The     Determinant
curious Adjective
cat     Noun
quietly Adverb
watched Verb
the     Determinant
small   Adjective
bird    Noun
hop     Verb
across  Preposition
the     Determinant
garden  None
fence   Noun
.       Punctuation


## b. Lemmatization

In [4]:
text2 = "The researchers were analyzing datasets and discovered that the trained models performed better on cleaned data."

# dictionary & function to help classify pos tag
pos_dic = {
    "a":{"better"}, #adjective
    "v":{"were", "analyzing", "discovered", "trained", "performed", "cleaned"}, #verb
}

def get_pos_tag(token):
    for pos in pos_dic.keys():
        if token in pos_dic[pos]:
            return pos
    return 'n'

In [5]:
import nltk

# TOKENIZATION

# download package
nltk.download("punkt_tab")
# import the tokenizer function
from nltk.tokenize import word_tokenize

# finally tokenize the text
tokens = word_tokenize(text2)

# LEMMATIZATION
# imports
from nltk.stem import WordNetLemmatizer

# instantiate lemmatizer and lemmatize the tokens
wnl = WordNetLemmatizer()
lemmatized = [wnl.lemmatize(token, pos=get_pos_tag(token.lower())) for token in tokens]
print(f"Original Text: {text2}")
print(f"{"Word":13s}{"Lemma"}\n{"="*20}")

for word, lemma in zip(tokens, lemmatized):
    print(f"{word:13s}{lemma}")

[nltk_data] Downloading package punkt_tab to /home/div/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Original Text: The researchers were analyzing datasets and discovered that the trained models performed better on cleaned data.
Word         Lemma
The          The
researchers  researcher
were         be
analyzing    analyze
datasets     datasets
and          and
discovered   discover
that         that
the          the
trained      train
models       model
performed    perform
better       good
on           on
cleaned      clean
data         data
.            .


# Exercise

### 1. Study and use the Stanford Part of speech tagger on a suitable corpus available freely. The corpus should be of decent size. (Use spaCy and stanza).

In [8]:
import nltk

nltk.download('brown')


from nltk.corpus import brown

# Join sentences into text (use first 20k sentences for decent size)
sentences = brown.sents()[:2]
text = [" ".join(sent) for sent in sentences]

[nltk_data] Downloading package brown to /home/div/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [9]:
import stanza

# Download English model
stanza.download('en')

# Initialize pipeline
nlp_stanza = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos',
    use_gpu=False
)

stanza_tags = []

for doc_text in text:
    doc = nlp_stanza(doc_text)
    for sent in doc.sentences:
        for word in sent.words:
            stanza_tags.append((word.text, word.upos))

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 439kB [00:00, 20.1MB/s]                                                                 
2026-02-21 14:56:02 INFO: Downloaded file to /home/div/stanza_resources/resources.json
2026-02-21 14:56:02 INFO: Downloading default packages for language: en (English) ...
2026-02-21 14:56:04 INFO: File exists: /home/div/stanza_resources/en/default.zip
2026-02-21 14:56:14 INFO: Finished downloading models and saved to /home/div/stanza_resources
2026-02-21 14:56:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 439kB [00:00, 14.5MB/s]                                                                 
2026-02-21 14:56:14 INFO: Downloaded file to /home/div

In [10]:
import spacy

nlp_spacy = spacy.load("en_core_web_sm")

spacy_tags = []

for doc_text in text:
    doc = nlp_spacy(doc_text)
    for token in doc:
        spacy_tags.append((token.text, token.pos_))

In [11]:
print("Stanza sample:", stanza_tags[:10])
print("spaCy sample:", spacy_tags[:10])

Stanza sample: [('The', 'DET'), ('Fulton', 'PROPN'), ('County', 'PROPN'), ('Grand', 'ADJ'), ('Jury', 'PROPN'), ('said', 'VERB'), ('Friday', 'PROPN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP')]
spaCy sample: [('The', 'DET'), ('Fulton', 'PROPN'), ('County', 'PROPN'), ('Grand', 'PROPN'), ('Jury', 'PROPN'), ('said', 'VERB'), ('Friday', 'PROPN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP')]


### 2. Write a python program for lemmatization using spaCy and stanza.

In [16]:
import nltk
from nltk.corpus import brown

nltk.download('brown')

# Use a reasonably sized subset
sentences = brown.sents()[:2]
text = [" ".join(sent) for sent in sentences]

[nltk_data] Downloading package brown to /home/div/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [17]:
import spacy

nlp_spacy = spacy.load("en_core_web_sm")

spacy_lemmas = []

for doc_text in text:
    doc = nlp_spacy(doc_text)
    for token in doc:
        if token.is_alpha:
            spacy_lemmas.append((token.text, token.lemma_))

In [18]:
import stanza

stanza.download('en')

nlp_stanza = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,lemma',
    use_gpu=False
)

stanza_lemmas = []

for doc_text in text:
    doc = nlp_stanza(doc_text)
    for sent in doc.sentences:
        for word in sent.words:
            if word.text.isalpha():
                stanza_lemmas.append((word.text, word.lemma))

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 439kB [00:00, 22.3MB/s]                                                                 
2026-02-21 14:59:55 INFO: Downloaded file to /home/div/stanza_resources/resources.json
2026-02-21 14:59:55 INFO: Downloading default packages for language: en (English) ...
2026-02-21 14:59:56 INFO: File exists: /home/div/stanza_resources/en/default.zip
2026-02-21 15:00:07 INFO: Finished downloading models and saved to /home/div/stanza_resources
2026-02-21 15:00:07 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 439kB [00:00, 15.7MB/s]                                                                 
2026-02-21 15:00:07 INFO: Downloaded file to /home/div

In [19]:
print("spaCy lemmas:", spacy_lemmas[:10])
print("Stanza lemmas:", stanza_lemmas[:10])

spaCy lemmas: [('The', 'the'), ('Fulton', 'Fulton'), ('County', 'County'), ('Grand', 'Grand'), ('Jury', 'Jury'), ('said', 'say'), ('Friday', 'Friday'), ('an', 'an'), ('investigation', 'investigation'), ('of', 'of')]
Stanza lemmas: [('The', 'the'), ('Fulton', 'Fulton'), ('County', 'County'), ('Grand', 'Grand'), ('Jury', 'Jury'), ('said', 'say'), ('Friday', 'Friday'), ('an', 'a'), ('investigation', 'investigation'), ('of', 'of')]
