In [1]:
import nltk

In [2]:
# tokenization

from nltk.tokenize import sent_tokenize, word_tokenize

# data
text="Mary had a little lamb. Her fleece was white as snow"

# sentence tokenization
sentence_list = sent_tokenize(text)
print(sentence_list)

# word tokenization
word_list = word_tokenize(text)
print('\n{}'.format(word_list))

['Mary had a little lamb.', 'Her fleece was white as snow']

['Mary', 'had', 'a', 'little', 'lamb', '.', 'Her', 'fleece', 'was', 'white', 'as', 'snow']


In [3]:
# stop words

from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize

english_stopwords = stopwords.words('english')
print(english_stopwords[:20])

punctuation_list = list(punctuation)
print('\n{}'.format(punctuation_list))

custom_stopwords_distinct_list = list(set(english_stopwords + punctuation_list))
print('\n{}'.format(custom_stopwords_distinct_list[:20]))

words_without_stopwords = [word for word in word_list if word not in custom_stopwords_distinct_list]
print('\n{}'.format(words_without_stopwords))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers']

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

['his', 'by', 'wasn', 'and', 'needn', 'here', 'too', '#', 'to', '{', '%', 'y', 'not', '=', 'out', '`', 'hasn', 'he', 'off', '<']

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


In [4]:
# n-grams

from nltk.collocations import BigramCollocationFinder

bigram_finder = BigramCollocationFinder.from_words(words_without_stopwords)
bigram_freq_dist = bigram_finder.ngram_fd.items()
print(sorted(bigram_freq_dist))

[(('Her', 'fleece'), 1), (('Mary', 'little'), 1), (('fleece', 'white'), 1), (('lamb', 'Her'), 1), (('little', 'lamb'), 1), (('white', 'snow'), 1)]


In [5]:
# stemming

from nltk.stem.lancaster import LancasterStemmer

text2 = "Mary closed on closing night when she was in the mood to close."

stemmer = LancasterStemmer()
stemmed_words = [stemmer.stem(word) for word in word_tokenize(text2)]
print(stemmed_words)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [6]:
# POS - part of speech

from nltk import pos_tag

pos_tag_list = pos_tag(word_tokenize(text2))
print(pos_tag_list)

[('Mary', 'NNP'), ('closed', 'VBD'), ('on', 'IN'), ('closing', 'NN'), ('night', 'NN'), ('when', 'WRB'), ('she', 'PRP'), ('was', 'VBD'), ('in', 'IN'), ('the', 'DT'), ('mood', 'NN'), ('to', 'TO'), ('close', 'VB'), ('.', '.')]


In [7]:
# word sense disambiguation

from nltk.corpus import wordnet
from nltk.wsd import lesk

for synset in wordnet.synsets('bass'):
    print(synset, synset.definition())

sense1 = lesk(word_tokenize('Sing in a lower tone, along with the bass'),'bass')
print('\n -- {}, {}'.format(sense1, sense1.definition()))

sense2 = lesk(word_tokenize('This sea bass was really hard to catch'),'bass')
print('\n -- {}, {}'.format(sense2, sense2.definition()))

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range

 -- Synset('bass.n.07'), the member with the lowest range of a family of musical instruments

 -- Synset('sea_bass.n.01'), the lean flesh of a saltwater fish of the family Serranidae
