In [1]:
import nltk

In [2]:
# Sentence Tokenization (paragraph into sentences)
from nltk.tokenize import sent_tokenize
text="""Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""
tokenized_text=sent_tokenize(text)
print(tokenized_text)

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and city is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard"]


In [3]:
# Word Tokenization (sentences into words)
from nltk.tokenize import word_tokenize
tokenized_word=word_tokenize(text)
print(tokenized_word)

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


In [4]:
# Frequency Distribution (can be ommited)
from nltk.probability import FreqDist
fdist = FreqDist(tokenized_word)
print(fdist)

print("Most common words are ", fdist.most_common(2))

import matplotlib.pyplot as plt
fdist.plot(30,cumulative=False)
plt.show()

<FreqDist with 25 samples and 30 outcomes>
Most common words are  [('is', 3), (',', 2)]


<Figure size 640x480 with 1 Axes>

In [7]:
# Removing stopwords

from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
print(stop_words)

filtered_sent=[]
for w in tokenized_word:
    if w not in stop_words:
        filtered_sent.append(w)
print("Tokenized Sentence:",tokenized_word)
print("Filterd Sentence:",filtered_sent)

{"it's", 'd', "needn't", 's', 'after', 'm', "wouldn't", 'where', "won't", 'too', 'if', 'had', 'yourselves', 'between', 'all', 'wasn', 'these', 'doesn', 'its', 'with', 'each', 'himself', 'doing', "didn't", 're', 'that', 'at', 'such', 'o', 'the', 'isn', 'should', 'shan', 'as', 'during', 'there', "that'll", 'below', 'to', 'here', 'has', 'hadn', 'how', 'myself', 'mightn', 'once', 'haven', 'above', 'ours', 'having', "should've", 'what', 'very', 'off', "shouldn't", 'me', 'before', 'being', "isn't", 'can', 'did', 'now', 'but', "you've", 'because', 'only', 'she', 'hers', 'is', "weren't", 'into', "you'd", 'those', 've', 'do', 'your', 'was', 'won', 'when', 'just', 'itself', "you're", 'were', 'theirs', 'any', "don't", 'which', 'their', 'then', "aren't", "doesn't", 'yourself', 'up', 'both', 'we', 'my', 'down', 'who', 'his', 'll', 'needn', 'does', 'have', 'herself', 'out', "couldn't", 'be', 'been', 'in', 'against', 'hasn', "hasn't", 'a', 'ma', 'it', 'will', 'don', 'own', 'until', 'shouldn', 'her', 

In [8]:
# Stemming (reducing words to their root regardless of tense)
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

stemmed_words=[]
for w in filtered_sent:
    stemmed_words.append(ps.stem(w))

print("Filtered Sentence:",filtered_sent)
print("Stemmed Sentence:",stemmed_words)

Filtered Sentence: ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']
Stemmed Sentence: ['hello', 'mr.', 'smith', ',', 'today', '?', 'the', 'weather', 'great', ',', 'citi', 'awesom', '.', 'the', 'sky', 'pinkish-blu', '.', 'you', "n't", 'eat', 'cardboard']
