In [6]:
'''
Within Python interpreter:  (in the Terminal Window)

Enter: import nltk
       nltk.download()
       
Go to tab labeled all packages
Scroll down to one with name starting with vader – click on download
'''

# word tokens

from nltk.tokenize import sent_tokenize, word_tokenize

data = "All work and no play, makes jack a dull boy, all work and no play"
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', ',', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [7]:
# sentence tokens

data = "All work and no play, makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

['All work and no play, makes jack dull boy.', 'All work and no play makes jack a dull boy.']


In [8]:
# remove stop words

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))   #  English stop words 
words = word_tokenize(data)
wordsFiltered = []

for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)

print(wordsFiltered)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


In [9]:
print(len(stopWords))
print(stopWords)

179
{'but', 'hers', 'him', 'our', 'they', 'were', 'been', 're', 'other', 'over', 'their', 'who', 'each', 'for', 'do', 'should', 'doing', 'at', 'is', 'where', 'not', "shan't", "you'd", 'only', 'whom', 'nor', 'once', 'ourselves', "she's", "you're", 'its', 'above', 'here', 'an', 'wouldn', 'because', 'shan', "haven't", "didn't", 'you', 's', 'does', 'be', 'what', 'while', 'on', "don't", 'isn', "wouldn't", 'yours', "mightn't", 'o', 'shouldn', 'no', 'those', "should've", 'about', 'during', 'when', 'then', 'theirs', 'up', 'm', 'am', 'why', 'himself', 'me', 'aren', "isn't", "you've", "that'll", 'will', 'myself', 'won', 'mustn', 'couldn', 'has', 'there', "mustn't", "you'll", 'how', 'own', "needn't", 'of', 'than', 'just', 'ain', 'after', "couldn't", 'same', 'with', 'didn', 'hadn', 'a', 'and', 'themselves', 'so', 'y', "shouldn't", 'any', 'that', 'to', 'hasn', 'between', 'out', 'we', 'under', 'more', 'this', 'she', 'ma', 'don', 'now', 'if', 'his', 'in', 'did', "hadn't", 'which', 'very', 'the', "are

In [10]:
print(len(wordsFiltered))
print(wordsFiltered)

16
['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


In [11]:
# word Stems

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

words = ["game","gaming","gamed","games","tweets","tweeted","tweeting"]
ps = PorterStemmer()

for word in words:
    print(word + ":" + ps.stem(word))

game:game
gaming:game
gamed:game
games:game
tweets:tweet
tweeted:tweet
tweeting:tweet


In [12]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

sentence = "gaming, the gamers play games"
words = word_tokenize(sentence)

for word in words:
    print(word + ":" + ps.stem(word))

gaming:game
,:,
the:the
gamers:gamer
play:play
games:game


In [13]:
#  Split all punctuation into separate tokens

from nltk.tokenize import WordPunctTokenizer
text = "Reset your password if you just can't remember your old one."
print("\nOriginal string:")
print(text)
result = WordPunctTokenizer().tokenize(text)
print("\nSplit all punctuation into separate tokens:")
print(result)


Original string:
Reset your password if you just can't remember your old one.

Split all punctuation into separate tokens:
['Reset', 'your', 'password', 'if', 'you', 'just', 'can', "'", 't', 'remember', 'your', 'old', 'one', '.']


In [14]:
#  Tokenize a twitter text

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
tweet_text = "NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev"
print("\nOriginal Tweet:")
print(tweet_text)
result = tknzr.tokenize(tweet_text)
print("\nTokenize a twitter text:")
print(result) 


Original Tweet:
NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev

Tokenize a twitter text:
['NoSQL', 'introduction', '-', 'w3resource', 'http://bit.ly/1ngHC5F', '#nosql', '#database', '#webdev']


In [15]:
#  Remove username handles from a twitter text

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)
tweet_text = "@abcd @pqrs NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev"
print("\nOriginal Tweet:")
print(tweet_text)
result = tknzr.tokenize(tweet_text)
print("\nTokenize a twitter text:")
print(result)


Original Tweet:
@abcd @pqrs NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev

Tokenize a twitter text:
['NoSQL', 'introduction', '-', 'w3resource', 'http://bit.ly/1ngHC5F', '#nosql', '#database', '#webdev']


In [16]:
#  Read a given text through each line and look for sentences

import nltk.data
text = '''
Mr. Smith waited for the train. The train was late.
Mary and Samantha took the bus. I looked for Mary and
Samantha at the bus station.
'''
print("\nOriginal Tweet:")
print(text)
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print('\n==============\n'.join(sent_detector.tokenize(text.strip())))



Original Tweet:

Mr. Smith waited for the train. The train was late.
Mary and Samantha took the bus. I looked for Mary and
Samantha at the bus station.

Mr. Smith waited for the train.
The train was late.
Mary and Samantha took the bus.
I looked for Mary and
Samantha at the bus station.
