In [1]:
# import the necessary packages

import nltk
from nltk.stem import *
import pandas as pd

In [2]:
# Sample vocabulary for comparing stemmers: each row groups inflected or
# derived forms of one root so the stemmers' outputs are easy to compare.

words = [
    "friend", "friendly", "friendship", "friends", "friendships",
    "stabile", "destabilize", "destabile",
    "misunderstanding", "misunderstand",
    "lying", "lie", "lied",
    "motivated", "motivate", "motivational", "motivating",
]

In [3]:
# Instantiate NLTK's Porter stemmer (PorterStemmer comes from the
# `from nltk.stem import *` import); `ps` is reused by later cells.

ps = PorterStemmer()

In [4]:
# Pair every test word with its Porter stem and display the (word, stem) tuples

list(zip(words, map(ps.stem, words)))

[('friend', 'friend'),
 ('friendly', 'friendli'),
 ('friendship', 'friendship'),
 ('friends', 'friend'),
 ('friendships', 'friendship'),
 ('stabile', 'stabil'),
 ('destabilize', 'destabil'),
 ('destabile', 'destabil'),
 ('misunderstanding', 'misunderstand'),
 ('misunderstand', 'misunderstand'),
 ('lying', 'lie'),
 ('lie', 'lie'),
 ('lied', 'lie'),
 ('motivated', 'motiv'),
 ('motivate', 'motiv'),
 ('motivational', 'motiv'),
 ('motivating', 'motiv')]

In [5]:
# Instantiate NLTK's Lancaster stemmer for comparison with Porter;
# `ls` is reused by later cells.

ls = LancasterStemmer()

In [6]:
# Pair every test word with its Lancaster stem and display the (word, stem) tuples

list(zip(words, map(ls.stem, words)))

[('friend', 'friend'),
 ('friendly', 'friend'),
 ('friendship', 'friend'),
 ('friends', 'friend'),
 ('friendships', 'friend'),
 ('stabile', 'stabl'),
 ('destabilize', 'dest'),
 ('destabile', 'dest'),
 ('misunderstanding', 'misunderstand'),
 ('misunderstand', 'misunderstand'),
 ('lying', 'lying'),
 ('lie', 'lie'),
 ('lied', 'lied'),
 ('motivated', 'mot'),
 ('motivate', 'mot'),
 ('motivational', 'mot'),
 ('motivating', 'mot')]

In [7]:
# The Snowball stemmer supports languages other than English;
# display the tuple of language names it accepts.

SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [8]:
# Let's try the English Snowball stemmer and see how it compares:
# build it once as `ss`, then display each word next to its stem.
ss = SnowballStemmer("english")
list(zip(words, map(ss.stem, words)))

[('friend', 'friend'),
 ('friendly', 'friend'),
 ('friendship', 'friendship'),
 ('friends', 'friend'),
 ('friendships', 'friendship'),
 ('stabile', 'stabil'),
 ('destabilize', 'destabil'),
 ('destabile', 'destabil'),
 ('misunderstanding', 'misunderstand'),
 ('misunderstand', 'misunderstand'),
 ('lying', 'lie'),
 ('lie', 'lie'),
 ('lied', 'lie'),
 ('motivated', 'motiv'),
 ('motivate', 'motiv'),
 ('motivational', 'motiv'),
 ('motivating', 'motiv')]

In [9]:
# Line the three stemmers up side by side: one row per input word,
# one column per stemmer (Porter, Lancaster, Snowball).
df = pd.DataFrame(
    {
        "Words": words,
        "Porter": list(map(ps.stem, words)),
        "Lancaster": list(map(ls.stem, words)),
        "Snowball": list(map(ss.stem, words)),
    }
)
df

Unnamed: 0,Words,Porter,Lancaster,Snowball
0,friend,friend,friend,friend
1,friendly,friendli,friend,friend
2,friendship,friendship,friend,friendship
3,friends,friend,friend,friend
4,friendships,friendship,friend,friendship
5,stabile,stabil,stabl,stabil
6,destabilize,destabil,dest,destabil
7,destabile,destabil,dest,destabil
8,misunderstanding,misunderstand,misunderstand,misunderstand
9,misunderstand,misunderstand,misunderstand,misunderstand
