In [None]:
# get necessary libraries
import nltk # natural language toolkit
import nltk.corpus

In [None]:
# get all corpus datasets
nltk.download('all-corpora')

[nltk_data] Downloading collection 'all-corpora'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    | 

True

In [None]:
# list datasets under gutenberg
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [None]:
# gets words from shakespeare hamlet
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
hamlet
for word in hamlet[0:100]:
  print(word, end=' ')

[ The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had 

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# 1. Tokenization
from nltk.tokenize import word_tokenize

story = "Mr Tan is a cute little boy. He is learning AI and machine learning to try to save the world."
story_tokens = word_tokenize(story)
story_tokens

['Mr',
 'Tan',
 'is',
 'a',
 'cute',
 'little',
 'boy',
 '.',
 'He',
 'is',
 'learning',
 'AI',
 'and',
 'machine',
 'learning',
 'to',
 'try',
 'to',
 'save',
 'the',
 'world',
 '.']

In [None]:
len(story_tokens)

22

In [None]:
# get frequency
from nltk.probability import FreqDist

fdist = FreqDist()
for word in story_tokens:
  fdist[word.lower()] += 1
fdist

FreqDist({'.': 2,
          'a': 1,
          'ai': 1,
          'and': 1,
          'boy': 1,
          'cute': 1,
          'he': 1,
          'is': 2,
          'learning': 2,
          'little': 1,
          'machine': 1,
          'mr': 1,
          'save': 1,
          'tan': 1,
          'the': 1,
          'to': 2,
          'try': 1,
          'world': 1})

In [None]:
from nltk import bigrams, trigrams, ngrams

quote = "The best and most beautiful things in the world cannot be seen or even touched, they must be felt with the heart."
quote_tokens = nltk.word_tokenize(quote)
quote_tokens

quote_bigrams = list(nltk.bigrams(quote_tokens))
quote_bigrams

quote_trigrams = list(nltk.trigrams(quote_tokens))
quote_trigrams

quote_ngrams = list(nltk.ngrams(quote_tokens, 5))
quote_ngrams

[('The', 'best', 'and', 'most', 'beautiful'),
 ('best', 'and', 'most', 'beautiful', 'things'),
 ('and', 'most', 'beautiful', 'things', 'in'),
 ('most', 'beautiful', 'things', 'in', 'the'),
 ('beautiful', 'things', 'in', 'the', 'world'),
 ('things', 'in', 'the', 'world', 'can'),
 ('in', 'the', 'world', 'can', 'not'),
 ('the', 'world', 'can', 'not', 'be'),
 ('world', 'can', 'not', 'be', 'seen'),
 ('can', 'not', 'be', 'seen', 'or'),
 ('not', 'be', 'seen', 'or', 'even'),
 ('be', 'seen', 'or', 'even', 'touched'),
 ('seen', 'or', 'even', 'touched', ','),
 ('or', 'even', 'touched', ',', 'they'),
 ('even', 'touched', ',', 'they', 'must'),
 ('touched', ',', 'they', 'must', 'be'),
 (',', 'they', 'must', 'be', 'felt'),
 ('they', 'must', 'be', 'felt', 'with'),
 ('must', 'be', 'felt', 'with', 'the'),
 ('be', 'felt', 'with', 'the', 'heart'),
 ('felt', 'with', 'the', 'heart', '.')]

In [None]:
# 2. Stemming

words_to_stem = ['give', 'giving', 'given', 'gave']

# 2a Porter Stemming

from nltk import PorterStemmer

pst = PorterStemmer()
for word in words_to_stem:
  print(word, pst.stem(word))

give give
giving give
given given
gave gave


In [None]:
# 2b Lancaster Stemming

from nltk import LancasterStemmer

lst = LancasterStemmer()
for word in words_to_stem:
  print(word, lst.stem(word))

give giv
giving giv
given giv
gave gav


In [None]:
# 3. Lemmatization
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()
for word in words_to_stem:
  print(word, lem.lemmatize(word))

give give
giving giving
given given
gave gave


In [None]:
# eliminate stop words
from nltk.corpus import stopwords

stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
len(stopwords.words('english'))

179

In [None]:
# remove punctuation
import re # regular expression

punctuation = re.compile(r'[-.,?:;!()]')

print(story_tokens)

post_punctuation = []
for words in story_tokens:
  word = punctuation.sub("", words)
  if len(word) > 0:
    post_punctuation.append(word)
post_punctuation

['Mr', 'Tan', 'is', 'a', 'cute', 'little', 'boy', '.', 'He', 'is', 'learning', 'AI', 'and', 'machine', 'learning', 'to', 'try', 'to', 'save', 'the', 'world', '.']


['Mr',
 'Tan',
 'is',
 'a',
 'cute',
 'little',
 'boy',
 'He',
 'is',
 'learning',
 'AI',
 'and',
 'machine',
 'learning',
 'to',
 'try',
 'to',
 'save',
 'the',
 'world']

In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# POS - Parts of Speech
for token in story_tokens:
  print(nltk.pos_tag([token]))

[('Mr', 'NN')]
[('Tan', 'NN')]
[('is', 'VBZ')]
[('a', 'DT')]
[('cute', 'NN')]
[('little', 'JJ')]
[('boy', 'NN')]
[('.', '.')]
[('He', 'PRP')]
[('is', 'VBZ')]
[('learning', 'VBG')]
[('AI', 'NN')]
[('and', 'CC')]
[('machine', 'NN')]
[('learning', 'VBG')]
[('to', 'TO')]
[('try', 'NN')]
[('to', 'TO')]
[('save', 'VB')]
[('the', 'DT')]
[('world', 'NN')]
[('.', '.')]
