# Import nltk and Porter Stemmer

In [29]:
import nltk
from nltk.stem import PorterStemmer

In [30]:
ps = PorterStemmer()

In [31]:
words = ['charging', 'charges', 'charged', 'charge', 'study', 'studying','easily','fairly', 'losses', 'was', 'necessary', 'implementation','permission', 'learn']

In [32]:
# Show every sample word next to the stem the Porter stemmer produces for it.
for word in words:
    porter_stem = ps.stem(word)
    print(word, ": ", porter_stem)

charging :  charg
charges :  charg
charged :  charg
charge :  charg
study :  studi
studying :  studi
easily :  easili
fairly :  fairli
losses :  loss
was :  wa
necessary :  necessari
implementation :  implement
permission :  permiss
learn :  learn


In [33]:
# ps.stem() processes a single word per call, so the sentence is split into
# whitespace-separated tokens before stemming.
sentence = "Police is chasing the suspect as it ran away".split()

In [34]:
sentence

['Police', 'is', 'chasing', 'the', 'suspect', 'as', 'it', 'ran', 'away']

In [35]:
# Run the Porter stemmer over the tokenized sentence, printing one stem per line.
for stemmed_token in map(ps.stem, sentence):
    print(stemmed_token)

polic
is
chase
the
suspect
as
it
ran
away


# Snowball Stemmer

In [36]:
from nltk.stem.snowball import SnowballStemmer

# Unlike PorterStemmer() earlier in the notebook, SnowballStemmer is constructed
# with an explicit language argument.
snow_stemmer = SnowballStemmer(language='english')

In [37]:
# Same word list as the Porter run, now stemmed with the Snowball stemmer.
# In the recorded outputs the two stemmers differ on e.g. 'fairly' and 'was'.
for word, snowball_stem in zip(words, map(snow_stemmer.stem, words)):
    print(word, ": ", snowball_stem)

charging :  charg
charges :  charg
charged :  charg
charge :  charg
study :  studi
studying :  studi
easily :  easili
fairly :  fair
losses :  loss
was :  was
necessary :  necessari
implementation :  implement
permission :  permiss
learn :  learn


In [38]:
# Snowball-stem the tokenized sentence, printing one stem per line.
for token in sentence:
    snowball_stem = snow_stemmer.stem(token)
    print(snowball_stem)

polic
is
chase
the
suspect
as
it
ran
away


# Lemmatization with nltk

In [39]:
from nltk.corpus import wordnet # import wordnet

In [40]:
from nltk.stem import WordNetLemmatizer

In [41]:
lemmatizer = WordNetLemmatizer()

In [42]:
# No part-of-speech hint is passed here; as the output below shows,
# "studying" comes back unchanged.
lemmatizer.lemmatize("studying")

'studying'

In [43]:
# Same word, now explicitly marked as a verb via pos=wordnet.VERB; the output
# below is the base form "study".
lemmatizer.lemmatize("studying", pos=wordnet.VERB)

'study'

# Function for mapping Penn Treebank tags to WordNet part-of-speech names

In [44]:
# Reference : https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into a WordNet part-of-speech constant.

    Only the leading letter of the tag is inspected:

    * ``J...`` -> ``wordnet.ADJ``
    * ``V...`` -> ``wordnet.VERB``
    * ``N...`` -> ``wordnet.NOUN``
    * ``R...`` -> ``wordnet.ADV``

    Any other tag (including an empty string) falls back to ``wordnet.NOUN``.

    Args:
        treebank_tag: Tag string such as ``'VBD'`` or ``'NNP'``.

    Returns:
        The matching ``wordnet`` part-of-speech constant.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Tags with no WordNet counterpart are treated as nouns.
    return wordnet.NOUN

In [45]:
sentence = "Police is chasing the suspect as it ran away".split()

In [46]:
sentence

['Police', 'is', 'chasing', 'the', 'suspect', 'as', 'it', 'ran', 'away']

In [47]:
# Tag every token of the sentence with its part of speech; the result is a
# list of (token, tag) pairs, displayed below.
words_and_tags = nltk.pos_tag(sentence)
words_and_tags

[('Police', 'NNP'),
 ('is', 'VBZ'),
 ('chasing', 'VBG'),
 ('the', 'DT'),
 ('suspect', 'NN'),
 ('as', 'IN'),
 ('it', 'PRP'),
 ('ran', 'VBD'),
 ('away', 'RB')]

In [48]:
# Lemmatize each token with the WordNet POS derived from its Treebank tag and
# print the lemmas on a single line, separated by spaces.
# NOTE: get_wordnet_pos falls back to NOUN for tags outside J/V/N/R, so
# function words are lemmatized as nouns - this is why 'as' (tagged IN)
# shows up as 'a' in the output below.
for word, tag in words_and_tags:
    print(lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)), end=" ")

Police be chase the suspect a it run away 

In [None]:
# For details on the Penn Treebank tag set, see https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [49]:
sentence2 = "He will be meeting me on Friday at the meeting".split()

In [50]:
# Tag the second sentence (this rebinds `words_and_tags`). In the output below
# the two occurrences of "meeting" receive different tags: VBG and NN.
words_and_tags = nltk.pos_tag(sentence2)
words_and_tags

[('He', 'PRP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('meeting', 'VBG'),
 ('me', 'PRP'),
 ('on', 'IN'),
 ('Friday', 'NNP'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('meeting', 'NN')]

In [51]:
# Lazily lemmatize each (token, tag) pair using the WordNet POS mapped from
# its Treebank tag, then print the lemmas on one line separated by spaces.
lemma_stream = (
    lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    for word, tag in words_and_tags
)
for lemma in lemma_stream:
    print(lemma, end=" ")

He will be meet me on Friday at the meeting 

# Comparing Stemming and Lemmatization

In [52]:
# Word list for the stemming-vs-lemmatization comparison: the earlier sample
# list plus 'learning', 'learned', 'learns' and 'mice'.
# Note: this rebinds the name `words` used by the stemming cells above.
words = ['charging', 'charges', 'charged', 'charge', 'study', 'studying','easily','fairly', 'losses', 'was', 'necessary', 'implementation','permission', 'learn', 'learning', 'learned', 'learns', 'mice']

In [53]:
# Tag the comparison word list (this rebinds `words_and_tags`).
# NOTE(review): the list is passed to the tagger as a single token sequence
# even though the entries are unrelated words; presumably the tags are less
# reliable without sentence context (e.g. 'learns' keeps its form in the
# comparison output) - confirm before relying on them.
words_and_tags = nltk.pos_tag(words)

In [54]:
# Side-by-side comparison: WordNet lemma on the left, Porter stem on the right.
column_separator = "       :       "
for word, tag in words_and_tags:
    wn_pos = get_wordnet_pos(tag)
    lemma = lemmatizer.lemmatize(word, pos=wn_pos)
    print(lemma, column_separator, ps.stem(word))

charge        :        charg
charge        :        charg
charge        :        charg
charge        :        charg
study        :        studi
study        :        studi
easily        :        easili
fairly        :        fairli
loss        :        loss
be        :        wa
necessary        :        necessari
implementation        :        implement
permission        :        permiss
learn        :        learn
learn        :        learn
learn        :        learn
learns        :        learn
mouse        :        mice
