# Task 2: Stemming and Lemmatizing

*by Lukas Dötlinger*


In [18]:
from pprint import pprint
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stopword list from NLTK, kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

text = """A longer sentence was used for doing some stemming by using multiple stemmers which
    are implemented in nltk. This didn't yield an groundbreaking results. We can conclude
    that the results are not that different overall when using different stemmers. During the
    testing, only some minor differences were found between the approaches."""
words = word_tokenize(text)
# NOTE(review): this membership test is case-sensitive, so capitalised tokens
# such as 'A' or 'This' survive the filter if the stopword list only contains
# lowercase entries -- confirm whether `w.lower()` was intended here.
filtered_words = [w for w in words if w not in stop_words]

ps = PorterStemmer()
ss = SnowballStemmer('english')


def stem_words(words, s):
    """Return 'word -> stem' strings for every word the stemmer changes.

    Args:
        words: iterable of token strings.
        s: stemmer object exposing a ``stem(word)`` method.

    Returns:
        A list with one formatted string per token whose stem differs from
        the token itself; input order and duplicates are preserved.
    """
    changed = []
    for w in words:
        stem = s.stem(w)  # stem once, reuse for both the comparison and the output
        if stem != w:
            changed.append('{} -> {}'.format(w, stem))
    return changed


ps_result = stem_words(words, ps)
ss_result = stem_words(words, ss)
print('Porter Stemmer:')
pprint(ps_result)
print('\nSnowball Stemmer:')
pprint(ss_result)

# `^` is the symmetric difference: transformations produced by exactly one of
# the two stemmers. Sorting makes the printed order reproducible across runs.
print('\nDifferences when using all words:')
pprint(sorted(set(ss_result) ^ set(ps_result)))

ps_result = stem_words(filtered_words, ps)
ss_result = stem_words(filtered_words, ss)

print('\nDifferences when excluding stopwords:')
pprint(sorted(set(ss_result) ^ set(ps_result)))

Porter Stemmer:
['sentence -> sentenc',
 'was -> wa',
 'used -> use',
 'doing -> do',
 'stemming -> stem',
 'using -> use',
 'multiple -> multipl',
 'stemmers -> stemmer',
 'implemented -> implement',
 'This -> thi',
 'groundbreaking -> groundbreak',
 'results -> result',
 'conclude -> conclud',
 'results -> result',
 'different -> differ',
 'overall -> overal',
 'using -> use',
 'different -> differ',
 'stemmers -> stemmer',
 'During -> dure',
 'testing -> test',
 'only -> onli',
 'differences -> differ',
 'approaches -> approach']

Snowball Stemmer:
['A -> a',
 'sentence -> sentenc',
 'used -> use',
 'doing -> do',
 'stemming -> stem',
 'using -> use',
 'multiple -> multipl',
 'stemmers -> stemmer',
 'implemented -> implement',
 'This -> this',
 'groundbreaking -> groundbreak',
 'results -> result',
 'We -> we',
 'conclude -> conclud',
 'results -> result',
 'different -> differ',
 'overall -> overal',
 'using -> use',
 'different -> differ',
 'stemmers -> stemmer',
 'During -> dure'

When looking at the computed differences, we cannot observe any major difference between the two stemmers. However, the `SnowballStemmer` seems to be a bit more accurate and less aggressive, as the `PorterStemmer` stems `this` to `thi` and `was` to `wa`. We can therefore argue that the *Snowball Stemmer* is the better choice for the given sentences. Removing stopwords does not change this conclusion in any way.

In [19]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

text = """A longer sentence was used for doing some stemming by using multiple stemmers which
    are implemented in nltk. This didn't yield an groundbreaking results. We can conclude
    that the results are not that different overall when using different stemmers. During the
    testing, only some minor differences were found between the approaches."""
words = word_tokenize(text)

wordnet_lemmatizer = WordNetLemmatizer()

# Report every token whose lemma differs from its surface form.
# NOTE(review): lemmatize() is called without a part-of-speech argument; the
# recorded output 'was -> wa' suggests tokens are being treated as nouns --
# confirm against the NLTK documentation.
print('Lemmatized words that changed:')
for w in words:
    lemma = wordnet_lemmatizer.lemmatize(w)
    if lemma == w:
        continue  # unchanged tokens are not reported
    print('{} -> {}'.format(w, lemma))

Lemmatized words that changed:
was -> wa
stemmers -> stemmer
results -> result
results -> result
stemmers -> stemmer
differences -> difference
approaches -> approach


Comparing the `WordNetLemmatizer` to the previously used stemmers shows that its output is only partially different when looking at the intersection of their results. Overall, we can observe that the lemmatizer transforms far fewer words in total. This is because the lemmatizer is a more intelligent approach that leverages a dictionary to find the proper form of a word.

In [22]:
from pprint import pprint
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Sample text taken from https://designers-inn.de/blindtexte/
text = """Hallo. Ich bin ein kleiner Blindtext. Und zwar schon so lange ich denken kann.
    Es war nicht leicht zu verstehen, was es bedeutet, ein blinder Text zu sein: Man
    ergibt keinen Sinn. Wirklich keinen Sinn. Man wird zusammenhangslos eingeschoben
    und rumgedreht – und oftmals gar nicht erst gelesen. Aber bin ich allein deshalb ein
    schlechterer Text als andere? Na gut, ich werde nie in den Bestsellerlisten stehen.
    Aber andere Texte schaffen das auch nicht. Und darum stört es mich nicht besonders
    blind zu sein. Und sollten Sie diese Zeilen noch immer lesen, so habe ich als
    kleiner Blindtext etwas geschafft, wovon all die richtigen und wichtigen Texte meist
    nur träumen."""
words = word_tokenize(text)
# NOTE(review): the sample text is German but the stemmer is configured for
# English -- presumably a deliberate experiment; confirm.
ss = SnowballStemmer('english')

# Collect the tokens that the stemmer changes, ignoring pure case differences.
changed = []
for token in words:
    stemmed = ss.stem(token)
    if token.lower() != stemmed:
        changed.append('{} -> {}'.format(token, stemmed))
pprint(changed)

['lange -> lang',
 'zusammenhangslos -> zusammenhangslo',
 'oftmals -> oftmal',
 'schlechterer -> schlechter',
 'als -> al',
 'andere -> ander',
 'werde -> werd',
 'andere -> ander',
 'Texte -> text',
 'besonders -> besond',
 'diese -> dies',
 'als -> al',
 'etwas -> etwa',
 'Texte -> text']
