In [1]:
# do the needed imports

import nltk
import string
import re

In [2]:
# define some helper functions

def lower_tokens(tokens):
    return [token.lower() for token in tokens]

def remove_punctuation_tokens(tokens):
    punct_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
    return [a for a,b in zip(tokens, [punct_regex.sub('', token) for token in tokens]) if b != '']

def get_cleaned_tokens(tokens):
    return remove_punctuation_tokens(lower_tokens(tokens))

In [3]:
# create text to tokenize

text = "The early bird gets the worm. " + \
       "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb. " + \
       "The Earth is 92,960,000mi from the Sun. " + \
       "In Mr. Smith's words, 'This book is great!' " + \
       "The cost is $19.99\non sale until the end of the year. " + \
       "Michio Kaku, Ph.D. "
text

"The early bird gets the worm. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb. The Earth is 92,960,000mi from the Sun. In Mr. Smith's words, 'This book is great!' The cost is $19.99\non sale until the end of the year. Michio Kaku, Ph.D. "

In [4]:
# load the necessary punkt items so we can tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maxen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# tokenize and sample the 4th sentence

sent_tokens = nltk.sent_tokenize(text)
sample_sentence = sent_tokens[3]
sample_sentence

"In Mr. Smith's words, 'This book is great!'"

In [6]:
# let's word tokenize that sentence

tokens = nltk.word_tokenize(sample_sentence)
tokens

['In',
 'Mr.',
 'Smith',
 "'s",
 'words',
 ',',
 "'This",
 'book',
 'is',
 'great',
 '!',
 "'"]

In [7]:
# and now install the stopwords items into nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# let's check out all the stopwords in english

stopwords = nltk.corpus.stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [9]:
# these are the languages supported for stopwords

!ls /home/azureuser/nltk_data/corpora/stopwords

'ls' n'est pas reconnu en tant que commande interne
ou externe, un programme ex‚cutable ou un fichier de commandes.


In [10]:
# remove the stopwords from the list of tokens

[token for token in tokens if token not in stopwords]

['In', 'Mr.', 'Smith', "'s", 'words', ',', "'This", 'book', 'great', '!', "'"]

In [11]:
# that didn't work too well because of case, so let's lower case our tokens

tokens = get_cleaned_tokens(tokens)
tokens

['in', 'mr.', 'smith', "'s", 'words', "'this", 'book', 'is', 'great']

In [12]:
# and now we'll see the stopwords removed

[token for token in tokens if token not in stopwords]

['mr.', 'smith', "'s", 'words', "'this", 'book', 'great']

In [13]:
# so let's make a function to remove stopwords

def remove_stopword_tokens(tokens):
    stopwords = nltk.corpus.stopwords.words('english')
    return [token for token in tokens if token not in stopwords]

In [14]:
# and give it a try

remove_stopword_tokens(tokens)

['mr.', 'smith', "'s", 'words', "'this", 'book', 'great']

In [15]:
# note we have a 's token, so let's remove punctuation in any tokens

def remove_punctuation_in_tokens(tokens):
    punct_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
    return [punct_regex.sub('', token) for token in tokens]

In [16]:
tokens_punct_removed = remove_punctuation_in_tokens(tokens)
tokens_punct_removed

['in', 'mr', 'smith', 's', 'words', 'this', 'book', 'is', 'great']

In [17]:
# and then remove stopwords

remove_stopword_tokens(tokens_punct_removed)

['mr', 'smith', 'words', 'book', 'great']

In [18]:
# so let's pull this all together

def preprocess_sentence(sentence):
    return remove_stopword_tokens(remove_punctuation_in_tokens(get_cleaned_tokens(nltk.word_tokenize(sentence))))

In [19]:
preprocess_sentence(sample_sentence)

['mr', 'smith', 'words', 'book', 'great']