# In this notebook, we cover:
- Stemming
- Lemmatization
- Extra: part-of-speech (POS) tagging to improve lemmatization

In [9]:
# %pip install nltk
import nltk
# One-time setup: uncomment the downloads below on a machine that does not yet
# have these NLTK resources (WordNet data for the lemmatizer, the perceptron
# tagger model for nltk.pos_tag).
# NOTE(review): word_tokenize presumably also needs the 'punkt' tokenizer data,
# and resource names vary between NLTK versions — confirm on a fresh environment.
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\debnathk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [2]:
# Read the text file: one document per line.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding (which differs between Windows and Unix).
# NOTE(review): assumes textfile.txt is UTF-8 encoded — confirm.
with open('textfile.txt', 'r', encoding='utf-8') as f:
    # Strip surrounding whitespace and skip blank lines so the corpus never
    # contains empty documents (e.g. from blank lines at the end of the file).
    corpus = [text for text in (line.strip() for line in f) if text]
corpus

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [12]:
# Tokenize the first document into word and punctuation tokens
# (stemming and lemmatization are applied to these tokens in the cells below)
words = word_tokenize(corpus[0])
words

['This', 'is', 'the', 'first', 'document', '.']

## Stemming

In [4]:
# Porter stemming: print each token next to its stem.
stemmer = PorterStemmer()
for word in words:
    stem = stemmer.stem(word)
    print(f'{word}: {stem}')

This: thi
is: is
the: the
first: first
document: document
.: .


## Lemmatizing

In [6]:
lemmatizer = WordNetLemmatizer()
for word in words:
    print(f'{word}: {lemmatizer.lemmatize(word)}')

This: This
is: is
the: the
first: first
document: document
.: .


## Extra: Part-of-Speech (POS) tagging for lemmatization

In [7]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    The tag is matched by its leading letter:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'R' -> wordnet.ADV.
    Any other tag falls back to wordnet.NOUN.

    Args:
        treebank_tag: POS tag string such as 'NNP', 'VBZ' or 'JJ'.

    Returns:
        The matching WordNet POS constant.
    """
    # Checked in order; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Everything else (nouns, determiners, punctuation, ...) is treated as a noun.
    return wordnet.NOUN
    

In [14]:
# Example sentence for the POS-aware lemmatization demo
sentence = "Donal Trump has a devoted following."
# Split the sentence into word and punctuation tokens
sentence_tok = word_tokenize(sentence)
sentence_tok

['Donal', 'Trump', 'has', 'a', 'devoted', 'following', '.']

In [16]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs
words_and_tags = nltk.pos_tag(sentence_tok)
words_and_tags

[('Donal', 'NNP'),
 ('Trump', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('devoted', 'VBN'),
 ('following', 'NN'),
 ('.', '.')]

In [17]:
# Lemmatize each token using the WordNet POS derived from its tag, printing
# the lemmas on a single space-separated line.
for word, tag in words_and_tags:
    wordnet_pos = get_wordnet_pos(tag)
    lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
    print(lemma, end=" ")

Donal Trump have a devote following . 