# Lemmatization
* How to identify Part-of-Speech (POS) tags
* How to lemmatize tokens using the NLTK library
* How to lemmatize tokens using the spaCy library

In [None]:
import nltk, pandas as pd, spacy 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [None]:
# Raw string: '\w' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning; r'\w+' hands the pattern to the regex
# engine unchanged (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
# Lemmatizer instance reused by the cells below.
lema = WordNetLemmatizer()

In [None]:
# Download the NLTK data packages by name.
# NOTE(review): presumably the tagger package backs nltk.pos_tag and
# 'wordnet' backs WordNetLemmatizer -- confirm against the NLTK docs.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Sample sentence used throughout the notebook.
doc = 'I visited my grandparents last week; We had a good time together'
# Lower-case the text, then split it with the regex tokenizer.
tokens = tokenizer.tokenize(doc.lower())
# Display the resulting token list.
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [None]:
# Lemmatize every token without passing a POS argument and show the
# token next to its lemma.
for token in tokens:
    default_lemma = lema.lemmatize(token)
    print(token, '--->', default_lemma)

i ---> i
visited ---> visited
my ---> my
grandparents ---> grandparent
last ---> last
week ---> week
we ---> we
had ---> had
a ---> a
good ---> good
time ---> time
together ---> together


### Part of Speech (POS)

In [None]:
# Lemmatize every token again, this time treating each one as a verb
# by passing the 'v' POS code.
for token in tokens:
    verb_lemma = lema.lemmatize(token, pos='v')
    print(token, '--->', verb_lemma)

i ---> i
visited ---> visit
my ---> my
grandparents ---> grandparents
last ---> last
week ---> week
we ---> we
had ---> have
a ---> a
good ---> good
time ---> time
together ---> together


In [None]:
# Part-of-Speech (POS) tags:
# tag the single word 'visited' (taken from the doc above); on its own it is
# tagged VBN, the past participle of a verb.
nltk.pos_tag(['visited'])

[('visited', 'VBN')]

In [None]:
# pos_tag returns a list of (word, tag) pairs; index 0 picks the only pair.
nltk.pos_tag(['visited'])[0]

('visited', 'VBN')

In [None]:
# Index 1 of the (word, tag) pair is the tag string itself.
nltk.pos_tag(['visited'])[0][1]

'VBN'

In [None]:
# Tag the full token list in one call, giving one (word, tag) pair per token.
nltk.pos_tag(tokens)

[('i', 'NN'),
 ('visited', 'VBD'),
 ('my', 'PRP$'),
 ('grandparents', 'NNS'),
 ('last', 'JJ'),
 ('week', 'NN'),
 ('we', 'PRP'),
 ('had', 'VBD'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('time', 'NN'),
 ('together', 'RB')]

# Using WordNet
* WordNet exposes the short, single-letter POS codes (e.g. 'a' for adjective, 'v' for verb) that the lemmatizer accepts

In [None]:
from nltk.corpus import wordnet
# WordNet's POS constant for adjectives (a one-letter string).
wordnet.ADJ

'a'

In [None]:
# WordNet's POS constant for verbs -- the same 'v' code passed to lemmatize earlier.
wordnet.VERB

'v'

In [None]:
def get_wordnet(st):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the start of the tag is inspected:
    'J...' -> wordnet.ADJ, 'V...' -> wordnet.VERB,
    'N...' -> wordnet.NOUN, 'R...' -> wordnet.ADV.
    Any other tag yields None.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if st.startswith(prefix):
            # Look the constant up only once a prefix has matched.
            return getattr(wordnet, attr_name)
    return None

In [None]:
# For each token show: the token, its POS tag, and the WordNet short form.
# Each token is tagged on its own here, so tags can differ from those of the
# full-sentence call (e.g. 'visited' is VBN alone but VBD in the sentence).
for token in tokens:
    tagged_pair = nltk.pos_tag([token])[0]  # (word, tag) for this one token
    penn_tag = tagged_pair[1]
    print(token, '|', penn_tag, '|', get_wordnet(penn_tag))

i | NN | n
visited | VBN | v
my | PRP$ | None
grandparents | NNS | n
last | JJ | a
week | NN | n
we | PRP | None
had | VBD | v
a | DT | None
good | JJ | a
time | NN | n
together | RB | r


In [None]:
# Lemmatize each token using the POS the tagger assigns to it.
# The whole token list is tagged in a single call so that every word is
# tagged in its sentence context (tagged alone, 'visited' comes out as VBN;
# within the sentence it is VBD) and the tagger runs once instead of once
# per token.
for x, tok_pos in nltk.pos_tag(tokens):
    word_pos = get_wordnet(tok_pos)
    if word_pos is not None:
        lemma = lema.lemmatize(x, pos=word_pos)
    else:
        # No WordNet equivalent for this tag: use the lemmatizer's default POS.
        lemma = lema.lemmatize(x)
    print(x, '|', lemma)

i | i
visited | visit
my | my
grandparents | grandparent
last | last
week | week
we | we
had | have
a | a
good | good
time | time
together | together


In [None]:
# Load the 'en_core_web_sm' spaCy pipeline.
nlp = spacy.load('en_core_web_sm')
# Run the pipeline over the lower-cased sample sentence; the cells below
# iterate over the tokens of the resulting document.
spadoc = nlp(doc.lower())
# Display the processed document.
spadoc

i visited my grandparents last week; we had a good time together

In [None]:
# Show each spaCy token with the POS label stored on it.
for token in spadoc:
    pos_label = token.pos_
    print(token, '|', pos_label)

i | PRON
visited | VERB
my | DET
grandparents | NOUN
last | ADJ
week | NOUN
; | PUNCT
we | PRON
had | AUX
a | DET
good | ADJ
time | NOUN
together | ADV


In [None]:
# Show each spaCy token with both its POS label and its lemma.
for token in spadoc:
    pos_label, lemma_text = token.pos_, token.lemma_
    print(token, '|', pos_label, '|', lemma_text)

i | PRON | i
visited | VERB | visit
my | DET | -PRON-
grandparents | NOUN | grandparent
last | ADJ | last
week | NOUN | week
; | PUNCT | ;
we | PRON | -PRON-
had | AUX | have
a | DET | a
good | ADJ | good
time | NOUN | time
together | ADV | together


#### spaCy is much easier to use than the NLTK library here: it assigns POS tags and lemmas automatically