<a href="https://colab.research.google.com/github/deepak1195/NLP_Using_Spacy/blob/main/003_StemmingAndLemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import spacy

### Stemming Using NLTK

In [2]:
# Create a single Porter stemmer instance; the stemming loop below calls it via `stemmer`.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [3]:
# Sample vocabulary shared by every stemming/lemmatization demo in this notebook.
words = [
    'rocks',
    'underlying',
    'overlaying',
    'overlapping',
    'coextensive',
    'coinciding',
    'intersecting',
    'coterminous',
    'better',
    'Bro',
    'Brah',
]

# Print each word next to the stem produced by the Porter stemmer.
for word in words:
    stem = stemmer.stem(word)
    print(f"{word} --> {stem}")

rocks --> rock
underlying --> underli
overlaying --> overlay
overlapping --> overlap
coextensive --> coextens
coinciding --> coincid
intersecting --> intersect
coterminous --> cotermin
better --> better
Bro --> bro
Brah --> brah


### Lemmatization Using NLTK

In [4]:
from nltk.stem import WordNetLemmatizer

# Make sure the WordNet corpus is available locally before the lemmatizer is used.
nltk.download('wordnet')

# One shared lemmatizer instance; the next cell calls it via `lemm`.
lemm = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# lemmatize() is called without a part-of-speech argument. NLTK documents the
# default as pos='n' (noun), which is consistent with the verb and adjective
# forms in the output coming back unchanged.
for word in words:
    lemma = lemm.lemmatize(word)
    print(f"{word} --> {lemma}")

rocks --> rock
underlying --> underlying
overlaying --> overlaying
overlapping --> overlapping
coextensive --> coextensive
coinciding --> coinciding
intersecting --> intersecting
coterminous --> coterminous
better --> better
Bro --> Bro
Brah --> Brah


### Lemmatization Using spaCy

In [6]:
# Load the small English pipeline and run it once over all the words,
# joined into a single space-separated text.
spNlp = spacy.load("en_core_web_sm")
joined_words = ' '.join(words)
doc = spNlp(joined_words)

# Print every token next to the lemma the pipeline assigned to it.
for token in doc:
    print(f"{token} --> {token.lemma_}")

rocks --> rock
underlying --> underlie
overlaying --> overlay
overlapping --> overlap
coextensive --> coextensive
coinciding --> coinciding
intersecting --> intersect
coterminous --> coterminous
better --> well
Bro --> Bro
Brah --> Brah


### Customize spaCy for Lemmatization

In [7]:
# Display the names of the components in the loaded pipeline (bare expression so the notebook renders it).
spNlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [8]:
# Fetch the pipeline's attribute ruler and register a rule that assigns the
# lemma "Brother" to tokens whose exact text is "Bro" or "Brah".
# NOTE(review): this mutates the shared `spNlp` pipeline, and re-running the
# cell presumably registers the same rule again — confirm.
ar = spNlp.get_pipe('attribute_ruler')
slang_patterns = [[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]]
slang_attrs = {"LEMMA": "Brother"}
ar.add(slang_patterns, slang_attrs)

# Re-process the same words so the customized lemmas appear in the output.
doc1 = spNlp(' '.join(words))
for token in doc1:
    print(f"{token.text} --> {token.lemma_}")

rocks --> rock
underlying --> underlie
overlaying --> overlay
overlapping --> overlap
coextensive --> coextensive
coinciding --> coinciding
intersecting --> intersect
coterminous --> coterminous
better --> well
Bro --> Brother
Brah --> Brother
