In [101]:
import nltk
from nltk.stem import WordNetLemmatizer

In [102]:
text = "James ate the food and washed the dishes. Students were eating noodles at a cafe. Don’t you want to eat before we leave? All of us have just eaten our breakfast. The animal also eats fruit and vegetables." 

In [111]:
print(text)

James ate the food and washed the dishes. Students were eating noodles at a cafe. Don’t you want to eat before we leave? All of us have just eaten our breakfast. The animal also eats fruit and vegetables.


<b>version 1</b>

In [115]:
wordnet_lemmatizer = WordNetLemmatizer()

Split the text into sentences and take the first sentence as an example

In [116]:
sentences= nltk.sent_tokenize(text)

Determine the lemma of each word

In [117]:
word_first_sentence = nltk.word_tokenize(sentences[0])

In [118]:
word_list = []
for w in word_first_sentence:
    word_list.append((w,wordnet_lemmatizer.lemmatize(w)))

The result is not obviously not great, because the lemmatize() function has no idea of the context of the word. The only change was that plural words got converted into their (singular) lemma. 

In [107]:
word_list

[('James', 'James'),
 ('ate', 'ate'),
 ('the', 'the'),
 ('food', 'food'),
 ('and', 'and'),
 ('washed', 'washed'),
 ('the', 'the'),
 ('dishes', 'dish'),
 ('.', '.')]

<b> version 2</b>
<br>
In this example the lemmatize function is given a `pos` argument

In [119]:
text_normalised = ''
for sent in sentences: 
    word_list = []
    for w in nltk.word_tokenize(sent):
        word_list.append(wordnet_lemmatizer.lemmatize(w,pos='v'))
    text_normalised += ' '.join(word_list)

In [120]:
text_normalised

'James eat the food and wash the dish .Students be eat noodles at a cafe .Don ’ t you want to eat before we leave ?All of us have just eat our breakfast .The animal also eat fruit and vegetables .'

<b>version 3 </b>
<br>

In [124]:
from nltk.corpus import wordnet

In [125]:
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [126]:
print(lemmatizer.lemmatize("I am loving it")) #I am loving it
print(lemmatizer.lemmatize("loving")) #loving
print(lemmatizer.lemmatize("loving", "v")) #love
print(lemmatize_sentence("I am loving it")) #I be love it

I am loving it
loving
love
I be love it


In [130]:
text_normalised = ''
for sent in sentences: 
    sent_normalised = lemmatize_sentence(sent)
    text_normalised +=sent_normalised

In [131]:
text_normalised

'James eat the food and wash the dish .Students be eat noodle at a cafe .Don ’ t you want to eat before we leave ?All of us have just eat our breakfast .The animal also eat fruit and vegetable .'

In [129]:
sent_normalised

'The animal also eat fruit and vegetable .'