# Stemming and Lemmatization

In [26]:
import nltk, spacy ## in this notebook NLTK is used for stemming and spaCy is used for lemmatization

In [27]:
from nltk.stem import PorterStemmer

# Single PorterStemmer instance shared by the stemming cells in this notebook.
stemmer = PorterStemmer()

In [4]:
# Print every sample word next to the stem NLTK's Porter stemmer produces for it.
words = [
    "eating", "eats", "eat", "ate",
    "adjustable", "rubbing", "ability", "fucking",
]

for word in words:
    print(f"{word}  |  {stemmer.stem(word)}")

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rubbing  |  rub
ability  |  abil
fucking  |  fuck


In [7]:
# Load spaCy's small English pipeline; later cells reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

sample_words = "eating eats eat ate adjustable rubbing ability fucking better"
doc = nlp(sample_words)
for token in doc:
    # columns: token | lemma as text (lemma_) | hash value of the lemma (lemma)
    print(f"{token}  |  {token.lemma_}  |  {token.lemma}")

eating  |  eat  |  9837207709914848172
eats  |  eat  |  9837207709914848172
eat  |  eat  |  9837207709914848172
ate  |  eat  |  9837207709914848172
adjustable  |  adjustable  |  6033511944150694480
rubbing  |  rub  |  3833470278404931097
ability  |  ability  |  11565809527369121409
fucking  |  fuck  |  12903434802346126505
better  |  well  |  4525988469032889948


In [34]:
# Lemmatize a full sentence, punctuation and contractions included.
# The `nlp` pipeline loaded earlier in the notebook is reused here; loading
# "en_core_web_sm" a second time only repeats the (slow) model load.
doc = nlp("I talk for 3 hrs. although taking isn't my thing, I became talkative")
for token in doc:
    print(token, " | ", token.lemma_) ## token | lemmatization

I  |  I
talk  |  talk
for  |  for
3  |  3
hrs  |  hrs
.  |  .
although  |  although
taking  |  take
is  |  be
n't  |  not
my  |  my
thing  |  thing
,  |  ,
I  |  I
became  |  become
talkative  |  talkative


## Custom lemma

In [12]:
# Default lemmas for a sentence containing the slang tokens "Bro" and "Bruh".
slang_sentence = "Bro, its late to go Home. Bruh chill up there still time."
doc = nlp(slang_sentence)
for token in doc:
    print(f"{token}  |  {token.lemma_}")  ## token | lemmatization

Bro  |  bro
,  |  ,
its  |  its
late  |  late
to  |  to
go  |  go
Home  |  home
.  |  .
Bruh  |  bruh
chill  |  chill
up  |  up
there  |  there
still  |  still
time  |  time
.  |  .


In [17]:
# Register a custom rule on the pipeline's attribute ruler: tokens matching
# the text "Bro" or "Bruh" are assigned the lemma "Brother".
ar = nlp.get_pipe('attribute_ruler')
slang_patterns = [[{"TEXT": "Bro"}], [{"TEXT": "Bruh"}]]
slang_attrs = {"LEMMA": "Brother"}
ar.add(slang_patterns, slang_attrs)

In [20]:
## With the custom attribute-ruler rule in place, "Bro" and "Bruh" are lemmatized to "Brother".
slang_sentence = "Bro, its late to go Home. Bruh chill up there still time."
doc = nlp(slang_sentence)
for token in doc:
    print(f"{token}  |  {token.lemma_}")  ## token | lemmatization

Bro  |  Brother
,  |  ,
its  |  its
late  |  late
to  |  to
go  |  go
Home  |  home
.  |  .
Bruh  |  Brother
chill  |  chill
up  |  up
there  |  there
still  |  still
time  |  time
.  |  .


## Exercise

In [28]:
## Q1. Convert this list of words into their base forms using stemming and lemmatization, and observe the transformations

# Word list for the exercise (stemmed with NLTK in the next cell).
lst_words = [
    'running',
    'painting',
    'walking',
    'dressing',
    'likely',
    'children',
    'whom',
    'good',
    'ate',
    'fishing',
]

In [30]:
# Stem each exercise word with the Porter stemmer and print "word  |  stem".
for word in lst_words:
    stemmed = stemmer.stem(word)
    print(f"{word}  |  {stemmed}")

running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  like
children  |  children
whom  |  whom
good  |  good
ate  |  ate
fishing  |  fish


In [40]:
## Q2. Write a short note on the words that get different base forms from stemming and lemmatization
#
# Note (from the outputs recorded in this notebook):
#   likely   -> stem "like",     lemma "likely"
#   children -> stem "children", lemma "child"
#   ate      -> stem "ate",      lemma "eat"
#   fishing  -> stem "fish",     lemma "fishing"
# running, painting, walking, dressing and good get the same base form from both.
#
# NOTE(review): the stemming list contains 'whom' but the sentence below uses
# "who", so that word is not directly comparable between the two runs.

# using lemmatization in spacy
exercise_sentence = "running painting walking dressing likely children who good ate fishing"
doc = nlp(exercise_sentence)
for token in doc:
    print(f"{token.text}  |  {token.lemma_}")

running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  likely
children  |  child
who  |  who
good  |  good
ate  |  eat
fishing  |  fishing


In [49]:
## Q3. Convert the given text into its base form using both stemming and lemmatization

### using nltk
text = """Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a 
habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.
"""

# Tokenize the text with NLTK, stem every token, then print the stems
# joined by single spaces.
word_tokenized = nltk.word_tokenize(text)
nltk_stem_word_lst = [stemmer.stem(word) for word in word_tokenized]

print(" ".join(nltk_stem_word_lst))

latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi . she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .


In [54]:
### using spacy

# Run the spaCy pipeline over the same text, collect each token's lemma,
# then print the lemmas joined by single spaces.
doc = nlp(text)
all_word_lst = [token.lemma_ for token in doc]

print(" ".join(all_word_lst))

Latha be very multi talented girl . she be good at many skill like dancing , running , singing , play . she also like eat Pav Bhagi . she have a 
 habit of fishing and swim too . besides all this , she be a wonderful at cook too . 

