# Task 2: Stemming and Lemmatizing

*by Lukas Dötlinger*


In [17]:
from pprint import pprint
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Compare the Porter and Snowball stemmers on a short sample paragraph, once on
# all tokens and once with English stopwords removed.
#
# NOTE: this cell relies on NLTK data (the 'stopwords' corpus and the tokenizer
# models behind word_tokenize) being installed locally, e.g. via nltk.download().
stop_words = set(stopwords.words('english'))

text = """A longer sentence was used for doing some stemming by using multiple stemmers which
    are implemented in nltk. This didn't yield an groundbreaking results. We can conclude
    that the results are not that different overall when using different stemmers. During the
    testing, only some minor differences were found between the approaches."""
words = word_tokenize(text)
# NLTK's English stopword list is lowercase, so the membership test has to be
# case-insensitive; otherwise sentence-initial stopwords such as 'A', 'This',
# 'We' or 'During' slip through the filter.
filtered_words = [w for w in words if w.lower() not in stop_words]

ps = PorterStemmer()
ss = SnowballStemmer("english")

ps_result = [ps.stem(w) for w in words]
ss_result = [ss.stem(w) for w in words]
print("Porter Stemmer:\n", ps_result)
print("\nSnowball Stemmer:\n", ss_result)

# Stems that Snowball produced but Porter did not. Sorted so the printed order
# is reproducible (plain set iteration order varies between runs).
diff = sorted(set(ss_result) - set(ps_result))
print('\nDifference when using all words:', diff)

# Same comparison, restricted to the tokens that survived stopword removal.
ps_result = [ps.stem(w) for w in filtered_words]
ss_result = [ss.stem(w) for w in filtered_words]

diff = sorted(set(ss_result) - set(ps_result))
print('\nDifference when excluding stopwords:', diff)

Porter Stemmer:
 ['A', 'longer', 'sentenc', 'wa', 'use', 'for', 'do', 'some', 'stem', 'by', 'use', 'multipl', 'stemmer', 'which', 'are', 'implement', 'in', 'nltk', '.', 'thi', 'did', "n't", 'yield', 'an', 'groundbreak', 'result', '.', 'We', 'can', 'conclud', 'that', 'the', 'result', 'are', 'not', 'that', 'differ', 'overal', 'when', 'use', 'differ', 'stemmer', '.', 'dure', 'the', 'test', ',', 'onli', 'some', 'minor', 'differ', 'were', 'found', 'between', 'the', 'approach', '.']

Snowball Stemmer:
 ['a', 'longer', 'sentenc', 'was', 'use', 'for', 'do', 'some', 'stem', 'by', 'use', 'multipl', 'stemmer', 'which', 'are', 'implement', 'in', 'nltk', '.', 'this', 'did', "n't", 'yield', 'an', 'groundbreak', 'result', '.', 'we', 'can', 'conclud', 'that', 'the', 'result', 'are', 'not', 'that', 'differ', 'overal', 'when', 'use', 'differ', 'stemmer', '.', 'dure', 'the', 'test', ',', 'onli', 'some', 'minor', 'differ', 'were', 'found', 'between', 'the', 'approach', '.']

Difference when using all word

Looking at the computed differences, we cannot observe any major difference between the two stemmers. However, the `SnowballStemmer` seems to be a bit more accurate and less aggressive, as the `PorterStemmer` stems `this` to `thi` and `was` to `wa`. We can therefore argue that the *Snowball Stemmer* is the better choice for the given sentences. The removal of stopwords does not change that in any way.

In [16]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Same sample paragraph as in the stemming cell, tokenized again here.
text = """A longer sentence was used for doing some stemming by using multiple stemmers which
    are implemented in nltk. This didn't yield an groundbreaking results. We can conclude
    that the results are not that different overall when using different stemmers. During the
    testing, only some minor differences were found between the approaches."""
words = word_tokenize(text)

wordnet_lemmatizer = WordNetLemmatizer()

# lemmatize() is called without a part-of-speech argument, so NLTK's default
# POS applies (noun, per the NLTK docs — TODO confirm for the installed version).
lemma_pairs = [(token, wordnet_lemmatizer.lemmatize(token)) for token in words]

print('Lemmatized words that changed:')
for token, lemma in lemma_pairs:
    # Only report tokens whose lemma differs from the surface form.
    if lemma == token:
        continue
    print('{} -> {}'.format(token, lemma))

Lemmatized words that changed:
was -> wa
stemmers -> stemmer
results -> result
results -> result
stemmers -> stemmer
differences -> difference
approaches -> approach


Comparing the `WordNetLemmatizer` to the previously used stemmers shows that its results differ only partially. Like the *Porter Stemmer*, the lemmatizer returns `wa` for `was`; unlike the *Porter Stemmer*, however, it leaves `this` unchanged.