In [1]:
# import nltk and check on the version

import nltk
print(nltk.__version__)

3.4


In [2]:
# create a block of text to represent a corpus

text = (
    "The early bird gets the worm. "
    "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb. "
    "The Earth is 92,960,000mi from the Sun. "
    "In Mr. Smith's words, 'This book is great!' "
    "The cost is $19.99\non sale until the end of the year. "
    "Michio Kaku, Ph.D. "
)
text

"The early bird gets the worm. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb. The Earth is 92,960,000mi from the Sun. In Mr. Smith's words, 'This book is great!' The cost is $19.99\non sale until the end of the year. Michio Kaku, Ph.D. "

In [3]:
# download and install nltk components needed for sentence tokenization (punkt)

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maxen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# convert the text to sentence tokens

sent_tokens = nltk.sent_tokenize(text)
sent_tokens

['The early bird gets the worm.',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb.',
 'The Earth is 92,960,000mi from the Sun.',
 "In Mr. Smith's words, 'This book is great!'",
 'The cost is $19.99\non sale until the end of the year.',
 'Michio Kaku, Ph.D.']

In [5]:
# can access each sentence as an element of the list

sent_tokens[0]

'The early bird gets the worm.'

In [6]:
# word tokenize the first sentence

nltk.word_tokenize(sent_tokens[0])

['The', 'early', 'bird', 'gets', 'the', 'worm', '.']

In [7]:
# tokenize all sentences.  Notice the list of lists

[nltk.word_tokenize(sent_token) for sent_token in sent_tokens]

[['The', 'early', 'bird', 'gets', 'the', 'worm', '.'],
 ['Dr.',
  'Strangelove',
  'or',
  ':',
  'How',
  'I',
  'Learned',
  'to',
  'Stop',
  'Worrying',
  'and',
  'Love',
  'the',
  'Bomb',
  '.'],
 ['The', 'Earth', 'is', '92,960,000mi', 'from', 'the', 'Sun', '.'],
 ['In',
  'Mr.',
  'Smith',
  "'s",
  'words',
  ',',
  "'This",
  'book',
  'is',
  'great',
  '!',
  "'"],
 ['The',
  'cost',
  'is',
  '$',
  '19.99',
  'on',
  'sale',
  'until',
  'the',
  'end',
  'of',
  'the',
  'year',
  '.'],
 ['Michio', 'Kaku', ',', 'Ph.D', '.']]

In [8]:
# we can also word tokenize the entire document
# notice this results in a single list

nltk.word_tokenize(text)

['The',
 'early',
 'bird',
 'gets',
 'the',
 'worm',
 '.',
 'Dr.',
 'Strangelove',
 'or',
 ':',
 'How',
 'I',
 'Learned',
 'to',
 'Stop',
 'Worrying',
 'and',
 'Love',
 'the',
 'Bomb',
 '.',
 'The',
 'Earth',
 'is',
 '92,960,000mi',
 'from',
 'the',
 'Sun',
 '.',
 'In',
 'Mr.',
 'Smith',
 "'s",
 'words',
 ',',
 "'This",
 'book',
 'is',
 'great',
 '!',
 "'",
 'The',
 'cost',
 'is',
 '$',
 '19.99',
 'on',
 'sale',
 'until',
 'the',
 'end',
 'of',
 'the',
 'year',
 '.',
 'Michio',
 'Kaku',
 ',',
 'Ph.D',
 '.']

In [9]:
# explicitly create a Punkt sentence tokenizer
ps_tokenizer = nltk.PunktSentenceTokenizer()

In [10]:
# tokenize sentences

ps_tokenizer.tokenize(text)

['The early bird gets the worm.',
 'Dr.',
 'Strangelove or: How I Learned to Stop Worrying and Love the Bomb.',
 'The Earth is 92,960,000mi from the Sun.',
 'In Mr.',
 "Smith's words, 'This book is great!'",
 'The cost is $19.99\non sale until the end of the year.',
 'Michio Kaku, Ph.D.']

In [11]:
# Note that this Punkt tokenizer treats "Dr." as a separate sentence

ps_tokenizer.tokenize(sent_tokens[1])

['Dr.', 'Strangelove or: How I Learned to Stop Worrying and Love the Bomb.']

In [12]:
# versus the default which does not

nltk.sent_tokenize(sent_tokens[1])

['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb.']

In [13]:
# word tokenization can be done explicitly with a Treebank Word Tokenizer

tw_tokenizer = nltk.TreebankWordTokenizer()

In [14]:
# word tokenize the second sentence

tw_tokenizer.tokenize(sent_tokens[1])

['Dr.',
 'Strangelove',
 'or',
 ':',
 'How',
 'I',
 'Learned',
 'to',
 'Stop',
 'Worrying',
 'and',
 'Love',
 'the',
 'Bomb',
 '.']

In [15]:
# note that this is the same result as the default

nltk.word_tokenize(sent_tokens[1])

['Dr.',
 'Strangelove',
 'or',
 ':',
 'How',
 'I',
 'Learned',
 'to',
 'Stop',
 'Worrying',
 'and',
 'Love',
 'the',
 'Bomb',
 '.']

In [16]:
# another way to tokenize words is with the Regexp tokenizer

regex_pattern = r'\w+' # word characters
regex_wt = nltk.RegexpTokenizer(pattern=regex_pattern, gaps=False)
regex_wt.tokenize(sent_tokens[2])

['The', 'Earth', 'is', '92', '960', '000mi', 'from', 'the', 'Sun']

In [17]:
# or split it on whitespace characters

regex_pattern = r'\s+'
regex_wt = nltk.RegexpTokenizer(pattern=regex_pattern, gaps=True)
regex_wt.tokenize(sent_tokens[2])

['The', 'Earth', 'is', '92,960,000mi', 'from', 'the', 'Sun.']

In [18]:
# and there is a tokenizer that explicitly splits on whitespace

ws_wt = nltk.WhitespaceTokenizer()
ws_wt.tokenize(sent_tokens[3])

['In', 'Mr.', "Smith's", 'words,', "'This", 'book', 'is', "great!'"]

In [2]:
# Punkt has support for several languages.  Each one is stored in its own pickle file

!dir /B C:\Users\maxen\AppData\Roaming\nltk_data\tokenizers\punkt

czech.pickle
danish.pickle
dutch.pickle
english.pickle
estonian.pickle
finnish.pickle
french.pickle
german.pickle
greek.pickle
italian.pickle
norwegian.pickle
polish.pickle
portuguese.pickle
PY3
README
russian.pickle
slovene.pickle
spanish.pickle
swedish.pickle
turkish.pickle


In [20]:
# let's tokenize some Italian (from Dante's Inferno)

dante = "cosi` l'animo mio, ch'ancor fuggiva,  si volse a retro a rimirar lo passo che non lascio` " + \
        "gia` mai persona viva. Poi ch'ei posato un poco il corpo lasso, ripresi via per la piaggia diserta, " + \
        "si` che 'l pie` fermo sempre era 'l piu` basso."
dante

"cosi` l'animo mio, ch'ancor fuggiva,  si volse a retro a rimirar lo passo che non lascio` gia` mai persona viva. Poi ch'ei posato un poco il corpo lasso, ripresi via per la piaggia diserta, si` che 'l pie` fermo sempre era 'l piu` basso."

In [21]:
# the default always tokenizes with English

nltk.sent_tokenize(dante)

["cosi` l'animo mio, ch'ancor fuggiva,  si volse a retro a rimirar lo passo che non lascio` gia` mai persona viva.",
 "Poi ch'ei posato un poco il corpo lasso, ripresi via per la piaggia diserta, si` che 'l pie` fermo sempre era 'l piu` basso."]

In [22]:
# but we can use other languages by passing the language name via the language parameter

nltk.sent_tokenize(dante, language="italian")

["cosi` l'animo mio, ch'ancor fuggiva,  si volse a retro a rimirar lo passo che non lascio` gia` mai persona viva.",
 "Poi ch'ei posato un poco il corpo lasso, ripresi via per la piaggia diserta, si` che 'l pie` fermo sempre era 'l piu` basso."]

In [23]:
# and we can also create a tokenizer directly from the pickle
italian_tokenizer = nltk.data.load("tokenizers/punkt/italian.pickle")
italian_tokenizer.tokenize(dante)

["cosi` l'animo mio, ch'ancor fuggiva,  si volse a retro a rimirar lo passo che non lascio` gia` mai persona viva.",
 "Poi ch'ei posato un poco il corpo lasso, ripresi via per la piaggia diserta, si` che 'l pie` fermo sempre era 'l piu` basso."]

In [24]:
# this is also supported in the word tokenizer

nltk.word_tokenize(dante, language="italian")

['cosi',
 '`',
 "l'animo",
 'mio',
 ',',
 "ch'ancor",
 'fuggiva',
 ',',
 'si',
 'volse',
 'a',
 'retro',
 'a',
 'rimirar',
 'lo',
 'passo',
 'che',
 'non',
 'lascio',
 '`',
 'gia',
 '`',
 'mai',
 'persona',
 'viva',
 '.',
 'Poi',
 "ch'ei",
 'posato',
 'un',
 'poco',
 'il',
 'corpo',
 'lasso',
 ',',
 'ripresi',
 'via',
 'per',
 'la',
 'piaggia',
 'diserta',
 ',',
 'si',
 '`',
 'che',
 "'",
 'l',
 'pie',
 '`',
 'fermo',
 'sempre',
 'era',
 "'",
 'l',
 'piu',
 '`',
 'basso',
 '.']

In [25]:
import string
import re

In [26]:
# we'll work with sentence 4, so let's check it out

nltk.word_tokenize(sent_tokens[3])

['In',
 'Mr.',
 'Smith',
 "'s",
 'words',
 ',',
 "'This",
 'book',
 'is',
 'great',
 '!',
 "'"]

In [27]:
# a function to lower all tokens in a list

def lower_tokens(tokens):
    """Return a new list holding the lowercase form of every token in ``tokens``."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

# lower all the tokens in our list
lower_tokens(nltk.word_tokenize(sent_tokens[3]))

['in',
 'mr.',
 'smith',
 "'s",
 'words',
 ',',
 "'this",
 'book',
 'is',
 'great',
 '!',
 "'"]

In [28]:
# this contains a list of all punctuation

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [29]:
# let's create a function that drops the tokens in a list that consist only of punctuation

def remove_punctuation_tokens(tokens):
    """Drop the tokens that consist solely of punctuation characters.

    Tokens that still contain at least one non-punctuation character are
    returned unchanged (e.g. ``"mr."`` and ``"'s"`` are kept as-is); tokens
    made up only of ``string.punctuation`` characters, and empty tokens, are
    removed.

    Args:
        tokens: iterable of string tokens; may be a one-shot iterator.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    punct_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
    # Single pass over ``tokens``: iterating twice (once to strip, once to zip)
    # would silently return [] for generators and other one-shot iterators.
    return [token for token in tokens if punct_regex.sub('', token) != '']

In [30]:
# and give it a try

remove_punctuation_tokens(lower_tokens(nltk.word_tokenize(sent_tokens[3])))

['in', 'mr.', 'smith', "'s", 'words', "'this", 'book', 'is', 'great']

In [31]:
# and let's create a function that lowers and removes punctuation

def get_cleaned_tokens(tokens):
    """Lowercase ``tokens`` and then filter them with ``remove_punctuation_tokens``."""
    lowered = lower_tokens(tokens)
    return remove_punctuation_tokens(lowered)

In [32]:
get_cleaned_tokens(nltk.word_tokenize(sent_tokens[3]))

['in', 'mr.', 'smith', "'s", 'words', "'this", 'book', 'is', 'great']