<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Stemming and Lemmatization</H1></u></center>

## Stemming:

In [None]:
import nltk

In [None]:
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

In [None]:
def words_stemmer(words, type="PorterStemmer", lang="english", encoding="utf8"):
    """Stem every token in ``words`` and join the stems with single spaces.

    Args:
        words: Iterable of tokens (for example the output of
            ``nltk.word_tokenize``). ``bytes`` tokens are decoded with
            ``encoding`` before being stemmed.
        type: Name of the stemmer to apply: ``"PorterStemmer"``,
            ``"LancasterStemmer"`` or ``"SnowballStemmer"``.
        lang: Language handed to ``SnowballStemmer``; the other two stemmers
            ignore it.
        encoding: Codec used to decode ``bytes`` tokens.

    Returns:
        A ``str`` containing the space-separated stems. When ``type`` is not
        one of the supported names (this includes ``False``), ``words`` is
        returned unchanged.
    """
    # Constructors are wrapped in lambdas so that only the requested stemmer
    # is instantiated (SnowballStemmer is the only one that needs `lang`).
    factories = {
        "PorterStemmer": lambda: PorterStemmer(),
        "LancasterStemmer": lambda: LancasterStemmer(),
        "SnowballStemmer": lambda: SnowballStemmer(lang),
    }
    # The isinstance guard keeps unhashable/non-string selectors (False, None,
    # lists, ...) on the "return input untouched" path instead of raising.
    if not isinstance(type, str) or type not in factories:
        return words

    stemmer = factories[type]()
    # Stems stay as `str`: encoding them to bytes would make " ".join() raise
    # TypeError on Python 3.
    return " ".join(
        stemmer.stem(w.decode(encoding) if isinstance(w, bytes) else w)
        for w in words
    )

In [None]:
# Sample input: several inflected/derived forms built on "care"; later cells
# pass this string to the stemmers and to the lemmatizer.
words = "caring cares carefully cared"

In [None]:
# Show the raw string, then the result of each stemmer on the same tokens.
print("Original:", words)
tokens = nltk.word_tokenize(words)
for label, stemmer_name in (
    ("Porter: ", "PorterStemmer"),
    ("Lancaster: ", "LancasterStemmer"),
    ("Snowball: ", "SnowballStemmer"),
):
    print(label, words_stemmer(tokens, stemmer_name))

## Lemmatization:

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Standalone WordNetLemmatizer instance. No code visible in this notebook reads
# `wlem`; words_lemmatizer() constructs its own lemmatizer internally.
wlem = WordNetLemmatizer()

In [None]:
# Lemmatize every token of a text string
def words_lemmatizer(text, encoding="utf8"):
    """Tokenize ``text``, lemmatize each token and join the lemmas with spaces.

    Each token is lemmatized with the part-of-speech code that ``find_pos``
    returns for it.

    Args:
        text: The text to lemmatize. ``bytes`` input is decoded with
            ``encoding`` first.
        encoding: Codec used to decode ``bytes`` input.

    Returns:
        A ``str`` of space-separated lemmas (empty when ``text`` yields no
        tokens).
    """
    if isinstance(text, bytes):
        text = text.decode(encoding)
    wl = WordNetLemmatizer()
    # Lemmas stay as `str`: encoding them to bytes would make " ".join() raise
    # TypeError on Python 3.
    return " ".join(
        wl.lemmatize(w, find_pos(w)) for w in nltk.word_tokenize(text)
    )

In [None]:
# WordNet part-of-speech codes (the `pos` argument passed to the lemmatizer):
# n    NOUN
# v    VERB
# a    ADJECTIVE
# s    ADJECTIVE SATELLITE
# r    ADVERB

In [None]:
def find_pos(word):
    """Return the WordNet part-of-speech code for ``word``.

    The word is tokenized and tagged with ``nltk.pos_tag``; only the tag of
    the first token is used, and only its first letter decides the result.

    Args:
        word: The word to classify.

    Returns:
        ``'a'`` for tags starting with J (adjective tags "JJ", "JJR", "JJS"),
        ``'r'`` for tags starting with R (adverb tags "RB", "RBR", "RBS"),
        ``'v'`` for tags starting with V (verb tags "VB", "VBD", "VBG",
        "VBN", "VBP", "VBZ"), and ``'n'`` for everything else, including
        input that produces no tokens.
    """
    tokens = nltk.word_tokenize(word)
    if not tokens:
        # Empty or whitespace-only input has nothing to tag; use the same
        # noun default as any unrecognised tag instead of raising IndexError.
        return 'n'

    tag = nltk.pos_tag(tokens)[0][1]
    # First letter of the tag -> WordNet code; anything else is a noun.
    first_letter_to_wordnet = {'j': 'a', 'r': 'r', 'v': 'v'}
    return first_letter_to_wordnet.get(tag[:1].lower(), 'n')

In [None]:
# Lemmatize the sample string `words` and print the resulting string.
print("Lemmatized: ", words_lemmatizer(words))

### Getting synonyms and antonyms for a given word with WordNet

In [None]:
# WordNet is a large lexical database of English in which words are linked
# together by their semantic relationships.
# It groups words together based on their meanings.

In [None]:
from nltk.corpus import wordnet

In [None]:
# Look up the synsets for "suitable" and show the definition and usage
# examples attached to the first one.
s = wordnet.synsets("suitable")
first_sense = s[0]
print("Definition: ", first_sense.definition())
print("Example: ", first_sense.examples())

In [None]:
# Walk every lemma of every synset for "better": each lemma name counts as a
# synonym, and the first antonym of a lemma (when it has any) is recorded.
synonyms = []
antonyms = []
for synset in wordnet.synsets("better"):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

# Sets drop the duplicates collected across synsets.
print("synonyms: \n", set(synonyms))
print("antonyms: \n", set(antonyms))

## References:

http://www.nltk.org/api/nltk.stem.html

http://en.wikipedia.org/wiki/Stemming

https://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3

http://www.nltk.org/howto/wordnet.html