<a href="https://colab.research.google.com/github/gaellinkeu/Sentiment-Analysis/blob/main/Stemming_VS_Lemmetization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stemming VS Lemmatization

## Import Libraries

In [12]:
# Natural Language Tool Kit (nltk)
import nltk

# Stemming method
from nltk.stem import SnowballStemmer

# tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Part Of Speech (POS) Tag
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

# lemmatize method
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
sentence = 'kids were sitting on dirty benches while their parents are busily working'
sentence_words = nltk.word_tokenize(sentence)
sentence_words

['kids',
 'were',
 'sitting',
 'on',
 'dirty',
 'benches',
 'while',
 'their',
 'parents',
 'are',
 'busily',
 'working']

## Stemming

In [11]:
# Initialize stemming method
stemmer = SnowballStemmer(language='english')

# Stemming on tokens (words)
for words in sentence_words:
    print(words + " ---> " + stemmer.stem(words))

kids ---> kid
were ---> were
sitting ---> sit
on ---> on
dirty ---> dirti
benches ---> bench
while ---> while
their ---> their
parents ---> parent
are ---> are
busily ---> busili
working ---> work


## Lemmatization

In [10]:
# Initialize lemmatization method
wordnet_lemmatizer = WordNetLemmatizer()

# Stemming on tokens (words)
for words in sentence_words:
    print(words + " ---> " + wordnet_lemmatizer.lemmatize(words))

kids ---> kid
were ---> were
sitting ---> sitting
on ---> on
dirty ---> dirty
benches ---> bench
while ---> while
their ---> their
parents ---> parent
are ---> are
busily ---> busily
working ---> working


## Lemmatization (with POS tag)

In [14]:
# convert pos_tag formats to wordnet tags formats
def get_wordnet_pos(treebank_tag):
    if treebank_tag[0] == 'J':
        return wordnet.ADJ
    elif treebank_tag[0] == 'V':
        return wordnet.VERB
    elif treebank_tag[0] == 'N':
        return wordnet.NOUN
    elif treebank_tag[0] == 'R':
        return wordnet.ADV
    else:
        return ''


# Pos tagging
pos_tagged_words = pos_tag(sentence_words)
print(pos_tagged_words)

# convert to wordnet format
wordnet_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), pos_tagged_words))

# lemmatization with pos tags
wordnet_lemmatizer = WordNetLemmatizer()
for words in sentence_words:
    print(words + " ---> " + wordnet_lemmatizer.lemmatize(words))

[('kids', 'NNS'), ('were', 'VBD'), ('sitting', 'VBG'), ('on', 'IN'), ('dirty', 'NN'), ('benches', 'NNS'), ('while', 'IN'), ('their', 'PRP$'), ('parents', 'NNS'), ('are', 'VBP'), ('busily', 'RB'), ('working', 'VBG')]
kids ---> kid
were ---> were
sitting ---> sitting
on ---> on
dirty ---> dirty
benches ---> bench
while ---> while
their ---> their
parents ---> parent
are ---> are
busily ---> busily
working ---> working
