In [1]:
import spacy
import nltk

In [2]:
from nltk.stem import PorterStemmer
stemmer= PorterStemmer()

In [3]:
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]
for word in words:
    print(word, " | ", stemmer.stem(word))

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rafting  |  raft
ability  |  abil
meeting  |  meet


In [5]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("eating eats eat ate adjustable rafting ability meeting better")

for token in doc:
    print(token, " | ", token.lemma_, " | ", token.lemma)

eating  |  eat  |  9837207709914848172
eats  |  eat  |  9837207709914848172
eat  |  eat  |  9837207709914848172
ate  |  eat  |  9837207709914848172
adjustable  |  adjustable  |  6033511944150694480
rafting  |  raft  |  7154368781129989833
ability  |  ability  |  11565809527369121409
meeting  |  meet  |  6880656908171229526
better  |  well  |  4525988469032889948


In [7]:
doc = nlp("Mando talked for 3 hours although talking isn't his thing he became talkative")

for token in doc:
    print(token, " | ", token.lemma_, " | ", token.lemma)

Mando  |  Mando  |  7837215228004622142
talked  |  talk  |  13939146775466599234
for  |  for  |  16037325823156266367
3  |  3  |  602994839685422785
hours  |  hour  |  9748623380567160636
although  |  although  |  343236316598008647
talking  |  talk  |  13939146775466599234
is  |  be  |  10382539506755952630
n't  |  not  |  447765159362469301
his  |  his  |  2661093235354845946
thing  |  thing  |  2473243759842082748
he  |  he  |  1655312771067108281
became  |  become  |  12558846041070486771
talkative  |  talkative  |  13364764166055324990


In [8]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [9]:
ar = nlp.get_pipe("attribute_ruler")

ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]], {"LEMMA":"Brother"})

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


# Stemming and Lemmatization: Exercises
- Run this cell to import all necessary packages

In [11]:
#let import necessary libraries and create the object

#for nltk
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

#downloading all neccessary packages related to nltk
nltk.download('all')


#for spacy
import spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/debojit/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/debojit/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/debojit/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/debojit/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/debojit/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nlt

## Exercise1:
- Convert these list of words into base form using Stemming and Lemmatization and observe the transformations
- Write a short note on the words that have different base words using stemming and Lemmatization

In [17]:
#using stemming in nltk
lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']
print("Below is Stemming:")
for word in lst_words:
    print(word, " | ", stemmer.stem(word))
print("===========================================")
print("===========================================")
print("Below is Lemmatization:")
doc = nlp("running painting walking dressing likely children whom good ate fishing")
for token in doc:
    print(token, " | ", token.lemma_)

Below is Stemming:
running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  like
children  |  children
whom  |  whom
good  |  good
ate  |  ate
fishing  |  fish
Below is Lemmatization:
running  |  run
painting  |  painting
walking  |  walking
dressing  |  dress
likely  |  likely
children  |  child
whom  |  whom
good  |  good
ate  |  eat
fishing  |  fish


### Observations
- Words that are different in stemming and lemmatization are:
  - painting
  - walking
  - children
  - ate
- As Stemming achieves the base word by removing the **suffixes** [ing, ly etc], so it successfully transform the words like 'painting', 'likely', 'fishing' and lemmatization fails for some words ending with suffixes here.
- As Lemmatization uses the **dictionary** meanings while converting to the base form, so words like 'children' and 'ate' are successfully transformed and stemming fails here.

## Exercise2:

- convert the given text into it's base form using both stemming and lemmatization

In [18]:
text = """Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a 
habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.
"""

In [22]:
from nltk.tokenize import word_tokenize
#using stemming in nltk
all_word_tokens = word_tokenize(text)

#step2: getting the base form for each token using stemmer
all_base_words=[]

for token in all_word_tokens:
    base_form = stemmer.stem(token)
    all_base_words.append(base_form)

#step3: joining all words in a list into string using 'join()'
base_text = " ".join(all_base_words)
print(base_text)

latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi . she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .


In [23]:
#using lemmatisation in spacy

#step1: Creating the object for the given text
doc = nlp(text)
all_base_words = []

#step2: getting the base form for each token using spacy 'lemma_'
for token in doc:
    base_word = token.lemma_
    all_base_words.append(base_word)

#step3: joining all words in a list into string using 'join()'
final_base_text = " ".join(all_base_words)
print(final_base_text)

Latha be very multi talented girl . she be good at many skill like dancing , running , singing , play . she also like eat Pav Bhagi . she have a 
 habit of fishing and swim too . besides all this , she be a wonderful at cook too . 

