# Stemming and Lemmatization using NLTK
## Index
### 1. Download NLTK package
### 2. Stemming  - PorterStemmer 
### 3. Lemmatization -  WordNetLemmatizer
### 4. Stemming vs Lemmatization comparison for different words
       - Marriage, University, Better
       - verb with ing form 
       - adjective
### 5. Stemming vs Lemmatization on a sentence/corpus


In [1]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\JaiBrahma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 2. Stemming using NLTK Porter Stemmer

In [2]:
from nltk.stem.porter import PorterStemmer

In [3]:
# Run the Porter stemmer over a few sample words and show each stem next to
# the original word. As the output shows, a stem is lower-cased and need not
# be a dictionary word.
porter_stemmer = PorterStemmer()
for word in ('Marriage', 'University', 'better'):
    stem = porter_stemmer.stem(word)
    print("word= {}, stemming_form={}".format(word, stem))

word= Marriage, stemming_form=marriag
word= University, stemming_form=univers
word= better, stemming_form=better


In [4]:
porter_stemmer = PorterStemmer()
text = "studies beautiful awful tastier studying cries cry bitter studying walking enjoying"
# Break the text into word tokens, then stem every token on its own.
tokenization = nltk.word_tokenize(text)
for token in tokenization:
    stemmed = porter_stemmer.stem(token)
    print("Stemming for {} is {}".format(token, stemmed))

Stemming for studies is studi
Stemming for beautiful is beauti
Stemming for awful is aw
Stemming for tastier is tastier
Stemming for studying is studi
Stemming for cries is cri
Stemming for cry is cri
Stemming for bitter is bitter
Stemming for studying is studi
Stemming for walking is walk
Stemming for enjoying is enjoy


## 3. Lemmatization using NLTK WordNetLemmatizer

In [5]:
# import these modules
from nltk.stem import WordNetLemmatizer

In [6]:
lemmatizer = WordNetLemmatizer()

# Each entry: (label printed before the lemma, word, extra lemmatize() kwargs).
# Without `pos` the word is looked up as a noun; "v" marks a verb and
# "a" marks an adjective.
examples = [
    ("rocks => ", "rocks", {}),
    ("studying =>", "studying", {"pos": "v"}),
    ("walking =>", "walking", {"pos": "v"}),
    ("better =>", "better", {}),
    ("better =>", "better", {"pos": "a"}),
]
for label, token, kwargs in examples:
    print(label, lemmatizer.lemmatize(token, **kwargs))


rocks =>  rock
studying => study
walking => walk
better => better
better => good


In [7]:
# Display the signature and built-in documentation of the lemmatize() method.
help(lemmatizer.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word: str, pos: str = 'n') -> str method of nltk.stem.wordnet.WordNetLemmatizer instance
    Lemmatize `word` using WordNet's built-in morphy function.
    Returns the input word unchanged if it cannot be found in WordNet.
    
    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :param pos: str
    :return: The lemma of `word`, for the given `pos`.



#### lemmatizer.lemmatize takes two arguments
    - word and its POS (parts-of-speech)
    - if no POS is given, it will consider word as Noun

In [8]:
# No POS is supplied here, so the lemmatizer falls back to its default POS.
word = "going"
default_lemma = lemmatizer.lemmatize(word)
print("{} => {}".format(word, default_lemma))

going => going


In [9]:
# Lemmatize several inflected forms of the same verb, passing pos="v" so each
# one is looked up as a verb.
for word in ("going", "went", "gone"):
    verb_lemma = lemmatizer.lemmatize(word, pos="v")
    print("{} => {}".format(word, verb_lemma))

going => go
went => go
gone => go


### 4. Stemming vs Lemmatization comparison for different words

#### 4.1 verb list with -ing forms

In [10]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()

# Pair every sample word with the POS used to lemmatize it
# ("n" = noun, "a" = adjective), then print stem and lemma side by side.
samples = [('Marriage', "n"), ('University', "n"), ('better', "a")]
for word, wn_pos in samples:
    stem = porter_stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word, pos=wn_pos)
    print("word= {}, stemming_form={}, lemmatized_form={}".format(word, stem, lemma))

word= Marriage, stemming_form=marriag, lemmatized_form=Marriage
word= University, stemming_form=univers, lemmatized_form=University
word= better, stemming_form=better, lemmatized_form=good


In [11]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()

# Stem each word and lemmatize it as a verb (pos="v"), printing both results.
verb_list = "are studying walking crying cries reading enjoying smoking increasing kissing changing".split(" ")
for word in verb_list:
    stem = porter_stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word, pos="v")
    print("word= {}, stemming_form={}, lemmatized_form={}".format(word, stem, lemma))

word= are, stemming_form=are, lemmatized_form=be
word= studying, stemming_form=studi, lemmatized_form=study
word= walking, stemming_form=walk, lemmatized_form=walk
word= crying, stemming_form=cri, lemmatized_form=cry
word= cries, stemming_form=cri, lemmatized_form=cry
word= reading, stemming_form=read, lemmatized_form=read
word= enjoying, stemming_form=enjoy, lemmatized_form=enjoy
word= smoking, stemming_form=smoke, lemmatized_form=smoke
word= increasing, stemming_form=increas, lemmatized_form=increase
word= kissing, stemming_form=kiss, lemmatized_form=kiss
word= changing, stemming_form=chang, lemmatized_form=change


#### 4.2 adjective

In [12]:
# Stem each word and lemmatize it as an adjective (pos="a"), printing both results.
adjective_list = "marriage awful better awesome homeless lucky lonely vast annoyed".split(' ')
for word in adjective_list:
    stem = porter_stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word, pos="a")
    print("word= {}, stemming_form={}, lemmatized_form={}".format(word, stem, lemma))

word= marriage, stemming_form=marriag, lemmatized_form=marriage
word= awful, stemming_form=aw, lemmatized_form=awful
word= better, stemming_form=better, lemmatized_form=good
word= awesome, stemming_form=awesom, lemmatized_form=awesome
word= homeless, stemming_form=homeless, lemmatized_form=homeless
word= lucky, stemming_form=lucki, lemmatized_form=lucky
word= lonely, stemming_form=lone, lemmatized_form=lonely
word= vast, stemming_form=vast, lemmatized_form=vast
word= annoyed, stemming_form=annoy, lemmatized_form=annoyed


#### 4.3 adverb

In [13]:
# Stem each word and lemmatize it as an adverb (pos="r"), printing both results.
adverb_list = "abnormally boldly quickly regularly monthly more verbally yearly".split(' ')
for word in adverb_list:
    stem = porter_stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word, pos="r")
    print("word= {}, stemming_form={}, lemmatized_form={}".format(word, stem, lemma))

word= abnormally, stemming_form=abnorm, lemmatized_form=abnormally
word= boldly, stemming_form=boldli, lemmatized_form=boldly
word= quickly, stemming_form=quickli, lemmatized_form=quickly
word= regularly, stemming_form=regularli, lemmatized_form=regularly
word= monthly, stemming_form=monthli, lemmatized_form=monthly
word= more, stemming_form=more, lemmatized_form=more
word= verbally, stemming_form=verbal, lemmatized_form=verbally
word= yearly, stemming_form=yearli, lemmatized_form=yearly


In [14]:
### 5. Stemming vs Lemmatization on a sentence/corpus
 - the individual word has to be stemmed or lemmatized

IndentationError: unexpected indent (266632938.py, line 2)

In [None]:
# WORDNET LEMMATIZER (with appropriate pos tags)
from nltk.corpus import wordnet
 
lemmatizer = WordNetLemmatizer()
 
# Define function to lemmatize each word with its POS tag
 
# POS_TAGGER_FUNCTION : TYPE 1
def pos_tagger(nltk_tag):
    """Map a POS tag string to the matching ``wordnet`` POS constant.

    Only the leading letter of the tag is inspected:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    :param nltk_tag: tag string, e.g. the second item of a pair
        produced by ``nltk.pos_tag``.
    :return: the corresponding ``wordnet`` constant, or ``None`` when the
        tag starts with none of the letters above.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Look the constant up only on a match, so `wordnet` is not
            # touched for tags that fall through to None.
            return getattr(wordnet, attr)
    return None
 


#> the cat can be sit with the bat on the striped mat under many fly geese

In [None]:
sentence = 'the cat is sitting with the bats on the striped mat under many badly flying geese'

# Split the sentence into word tokens.
word_list = nltk.word_tokenize(sentence)

# Find the part-of-speech tag (e.g. 'NN', 'VBG') for each token.
pos_tagged = nltk.pos_tag(word_list)

print('pos_tagged::', pos_tagged, end='\n\n')
# Sample output. NOTE(review): this listing has no entry for the token
# 'badly' that the sentence above contains, so it appears to come from an
# earlier version of the sentence -- re-run the cell and refresh it.
#>[('the', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sitting', 'VBG'), ('with', 'IN'),
# ('the', 'DT'), ('bats', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('striped', 'JJ'),
# ('mat', 'NN'), ('under', 'IN'), ('many', 'JJ'), ('flying', 'VBG'), ('geese', 'JJ')]

# As you may have noticed, the above pos tags are a little confusing.

# Use our own pos_tagger function to reduce each tag to a WordNet POS
# ('a', 'v', 'n', 'r'); tokens whose tag has no WordNet equivalent get None.
wordnet_tagged = [(token, pos_tagger(tag)) for token, tag in pos_tagged]
print('wordnet_tagged :::', wordnet_tagged, end='\n\n')
# Sample output (same caveat as above: the 'badly' token is missing).
#>[('the', None), ('cat', 'n'), ('is', 'v'), ('sitting', 'v'), ('with', None),
# ('the', None), ('bats', 'n'), ('on', None), ('the', None), ('striped', 'a'),
# ('mat', 'n'), ('under', None), ('many', 'a'), ('flying', 'v'), ('geese', 'a')]
 

In [None]:

# Print the stem and the lemma of every token side by side.
# Tokens whose tag is None (no WordNet POS) are not lemmatized and are
# shown unchanged in the lemma column.
for word, tag in wordnet_tagged:
    lemma = word if tag is None else lemmatizer.lemmatize(word, pos=tag)
    # A single template is used so tagged and untagged tokens print in
    # exactly the same format.
    print("word={}, stemmed form => {}, lemmatized form=> {}".format(word, porter_stemmer.stem(word), lemma))

In [None]:
# Stem every token; lemmatize only the tokens that carry a WordNet POS tag
# and pass the others through unchanged.
stemmed_sentence = [porter_stemmer.stem(word) for word, _ in wordnet_tagged]
lemmatized_sentence = [
    word if tag is None else lemmatizer.lemmatize(word, pos=tag)
    for word, tag in wordnet_tagged
]

# Show the original, stemmed and lemmatized sentences, each followed by a
# blank line.
for label, tokens in (
    ('Original Sentence => ', word_list),
    ('stemmed_sentence => ', stemmed_sentence),
    ('lemmatized_sentence => ', lemmatized_sentence),
):
    print(label, ' '.join(tokens), end='\n\n')