In [1]:
##### Load libraries #####

import nltk
import numpy as np
import matplotlib as plot
from nltk.corpus import brown

In [2]:
# List the file ids of the texts in NLTK's Gutenberg corpus.
# As the last expression in the cell, the returned list is shown as its output.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# Remember the file id of the first Gutenberg text (index 0) and echo it
gutenberg_fileids = nltk.corpus.gutenberg.fileids()
file0 = gutenberg_fileids[0]
print(file0)

In [6]:
# Load the complete raw text of the selected book as a single string
emmatext = nltk.corpus.gutenberg.raw(file0)

# Report its length in characters and its Python type, one value per line
print(len(emmatext), type(emmatext), sep="\n")

887071
<class 'str'>


In [7]:
# Preview the first 120 characters of the raw text; as the cell's last
# expression, the slice is shown as the cell output (line breaks appear as \n)
emmatext[:120]

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nan'

In [8]:
# Split the raw text into tokens with NLTK's word_tokenize, then report how
# many tokens were produced
emmatokens = nltk.word_tokenize(emmatext)
print(len(emmatokens))

191785

In [10]:
# Print the first 50 tokens to inspect what the tokenizer produced
print(emmatokens[:50])

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty-one', 'years', 'in', 'the', 'world', 'with']


In [11]:
# Let's standardize the tokens by lowercasing each one, so that differently
# cased spellings of a word are counted together
emmawords = list(map(str.lower, emmatokens))

In [13]:
# Build the vocabulary: de-duplicate the lowercased tokens with a set,
# then put the distinct tokens in sorted order
emmavocab = list(set(emmawords))
emmavocab.sort()

# Show the first 50 vocabulary entries
print(emmavocab[:50])

['!', '&', "'", "''", "'d", "'s", "'t", "'ye", '(', ')', ',', '--', '.', '10,000', '1816', '23rd', '24th', '26th', '28th', '7th', '8th', ':', ';', '?', '[', ']', '_______', '_a_', '_accepted_', '_adair_', '_addition_', '_all_', '_almost_', '_alone_', '_amor_', '_and_', '_answer_', '_any_', '_appropriation_', '_as_', '_assistance_', '_at_', '_bath_', '_be_', '_been_', '_blunder_', '_boiled_', '_both_', '_bride_', '_broke_']


In [14]:
# Count occurrences of the word "emma" in the lowercased token list
emmawords.count("emma")

855

In [16]:
# Build a frequency distribution over the lowercased tokens
# (each distinct token becomes a key whose value is its count)
fdist = nltk.FreqDist(emmawords)

In [19]:
# Materialize the distribution's keys (the distinct tokens) as a list.
# .keys() is used explicitly so the order is that of the underlying mapping.
fdistkeys = [*fdist.keys()]

# Show the first 50 keys
print(fdistkeys[:50])

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter', 'woodhouse', ',', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'happy', 'disposition', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'existence', ';', 'had', 'lived', 'nearly', 'twenty-one', 'years', 'in', 'world', 'very', 'little', 'distress', 'or', 'vex', 'her', '.', 'she', 'was', 'youngest', 'two']


In [20]:
# Look up the count stored for a single key
fdist["emma"]

855

In [22]:
# Get the 40 most frequent tokens as (token, count) pairs
topkeys = fdist.most_common(40)

In [23]:
# Print every (token, count) pair on its own line
for word, count in topkeys:
    print((word, count))

(',', 12016)
('.', 6355)
('the', 5201)
('to', 5181)
('and', 4877)
('of', 4284)
('i', 3177)
('a', 3124)
('--', 3100)
('it', 2503)
("''", 2452)
('her', 2448)
('was', 2396)
(';', 2353)
('she', 2336)
('not', 2281)
('in', 2173)
('be', 1970)
('you', 1967)
('he', 1806)
('that', 1805)
('``', 1735)
('had', 1623)
('but', 1441)
('as', 1436)
('for', 1346)
('have', 1320)
('is', 1241)
('with', 1215)
('very', 1202)
('his', 1141)
('mr.', 1091)
('!', 1063)
('at', 1030)
('so', 968)
("'s", 866)
('emma', 855)
('all', 841)
('could', 836)
('would', 818)


In [24]:
# Convert the top counts to relative frequencies: each count divided by the
# total number of tokens. The results are fractions of the total (e.g. 0.0627),
# not percentages; multiply by 100 if a percentage is wanted.
total_tokens = len(emmawords)  # computed once rather than once per pair
topkeysnorm = [(word, freq / total_tokens) for (word, freq) in topkeys]
for pair in topkeysnorm:
    print(pair)

(',', 0.06265349219177725)
('.', 0.03313606382146675)
('the', 0.027118909195192532)
('to', 0.0270146257527961)
('and', 0.02542951742837031)
('of', 0.022337513361316057)
('i', 0.016565424824673464)
('a', 0.016289073702322913)
('--', 0.016163933571447194)
('it', 0.013051072815913653)
("''", 0.012785150037802747)
('her', 0.012764293349323462)
('was', 0.012493156399092735)
(';', 0.012268946997940402)
('she', 0.012180306071903433)
('not', 0.011893526605313242)
('in', 0.0113303960163725)
('be', 0.010271919076048701)
('you', 0.010256276559689236)
('he', 0.009416794848397945)
('that', 0.009411580676278125)
('``', 0.009046588627890607)
('had', 0.00846260135047058)
('but', 0.007513622024663034)
('as', 0.007487551164063926)
('for', 0.007018275673279975)
('have', 0.0068827071981646115)
('is', 0.006470787600698699)
('with', 0.006335219125583336)
('very', 0.0062674348880256536)
('his', 0.005949370388716532)
('mr.', 0.005688661782725448)
('!', 0.005542664963370441)
('at', 0.0053705972834163255)
('so'

In [27]:
##### Repeat the steps with a different book #####

# Pick the second file id (index 1) from the Gutenberg listing
file1 = nltk.corpus.gutenberg.fileids()[1]

# Load the book's raw text as one string
austenptext = nltk.corpus.gutenberg.raw(file1)

# Tokenize it with NLTK's word_tokenize
austenptokens = nltk.word_tokenize(austenptext)

# Let's lowercase every token so differently cased spellings count together
austenpwords = list(map(str.lower, austenptokens))

# Count the tokens and keep the 30 most frequent (token, count) pairs.
# NOTE(review): fdist and topkeys are rebound here, so the values computed for
# the first book are no longer available under these names after this cell runs.
fdist = nltk.FreqDist(austenpwords)
topkeys = fdist.most_common(30)

# Print each (token, count) pair on its own line
for pair in topkeys:
    print(pair)

(',', 7024)
('the', 3328)
('.', 3119)
('and', 2786)
('to', 2782)
('of', 2568)
('a', 1592)
('in', 1383)
('was', 1337)
(';', 1319)
('her', 1203)
('had', 1186)
('she', 1146)
('i', 1123)
('it', 1038)
('not', 976)
('he', 961)
('be', 950)
("''", 912)
('that', 882)
('as', 809)
('for', 707)
('but', 664)
('his', 659)
('with', 654)
('``', 652)
('you', 628)
('have', 589)
('at', 533)
('all', 530)
