In [36]:
import glob
import nltk
from collections import Counter

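Be sure to run the command `python dump_data.py gutenberg test -n 10` first; the cell below expects the resulting `.txt` files under `test/`. We then collect the filenames of the documents we're interested in and read the contents of each file.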

In [6]:
# Collect the corpus files in sorted order. glob.glob() makes no promise about
# ordering, so sorting keeps every per-file report below in the same order on
# every run and every machine.
filenames = sorted(glob.glob('test/*.txt'))
texts = []
for fname in filenames:
    # NOTE(review): no encoding is passed, so the platform default is used --
    # confirm it matches what dump_data.py writes.
    with open(fname, 'r') as f:
        # Each document is read fully into memory; texts[i] pairs with filenames[i].
        texts.append(f.read())
print('read', len(texts), 'texts.')

read 10 texts.


## Word and Sentence Tokenizers
We now have the filenames and texts. Next, we extract a single text file to demonstrate word and sentence tokenization.

In [9]:
# Use the first loaded document as the running example for the tokenizer
# demos, and preview its first 200 characters.
single_text = texts[0]
print(single_text[0:200])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; an


In [13]:
# Split the example document into word-level tokens (punctuation marks come
# back as separate tokens) and display the first ten.
# NOTE(review): the tokenizer needs NLTK's sentence-tokenizer data; no download
# for it is shown in the visible cells -- confirm it is installed.
words = nltk.word_tokenize(single_text)
words[0:10]

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']

In [15]:
# Split the example document into sentences and display the first two.
# The sentence strings keep the original line breaks from the source text.
sentences = nltk.sent_tokenize(single_text)
sentences[0:2]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."]

## Tokenize All Texts
Now we tokenize all texts in a loop, counting the number of words and the number of sentences in each document.

In [26]:
# Word-tokenize every document and report its total token count.
# Note: `words` is rebound on each iteration, so after the loop it holds the
# tokens of the last file rather than those of `single_text`.
for fname, text in zip(filenames, texts):
    words = nltk.word_tokenize(text)
    print(f'found {len(words)} words in {fname}')

['[', 'Emma', 'by', 'Jane', 'Austen']
found 191673 words in test/austen-emma.txt
['[', 'Persuasion', 'by', 'Jane', 'Austen']
found 97888 words in test/austen-persuasion.txt
['[', 'Sense', 'and', 'Sensibility', 'by']
found 141367 words in test/austen-sense.txt
['[', 'The', 'King', 'James', 'Bible']
found 946812 words in test/bible-kjv.txt
['[', 'Poems', 'by', 'William', 'Blake']
found 8239 words in test/blake-poems.txt
['[', 'Stories', 'to', 'Tell', 'to']
found 55621 words in test/bryant-stories.txt
['[', 'The', 'Adventures', 'of', 'Buster']
found 18542 words in test/burgess-busterbrown.txt
['[', 'Alice', "'s", 'Adventures', 'in']
found 33310 words in test/carroll-alice.txt
['[', 'The', 'Ball', 'and', 'The']
found 97203 words in test/chesterton-ball.txt
['[', 'The', 'Wisdom', 'of', 'Father']
found 85412 words in test/chesterton-brown.txt


In [29]:
# Sentence-tokenize every document and report how many sentences were found.
for fname, text in zip(filenames, texts):
    sents = nltk.sent_tokenize(text)
    print(f'found {len(sents)} sentences in {fname}')

found 7493 sentences in test/austen-emma.txt
found 3654 sentences in test/austen-persuasion.txt
found 4833 sentences in test/austen-sense.txt
found 29812 sentences in test/bible-kjv.txt
found 355 sentences in test/blake-poems.txt
found 2715 sentences in test/bryant-stories.txt
found 1001 sentences in test/burgess-busterbrown.txt
found 1625 sentences in test/carroll-alice.txt
found 4624 sentences in test/chesterton-ball.txt
found 3712 sentences in test/chesterton-brown.txt


## Sentence Length
Next we compute sentence-length statistics for each document: the longest, shortest, and average sentence length in words. To do this we first tokenize by sentence (sent_tokenize), then by words (word_tokenize on each sentence).

In [32]:
# Per-document sentence statistics: sentence count plus the longest, shortest
# and mean sentence length, measured in word_tokenize tokens (punctuation
# tokens are counted as well).
for fname, text in zip(filenames, texts):
    sents = nltk.tokenize.sent_tokenize(text)
    sent_words = [nltk.tokenize.word_tokenize(s) for s in sents]

    sent_lengths = [len(s) for s in sent_words]
    if not sent_lengths:
        # A document with no sentences (e.g. an empty file) would make
        # max()/min() raise ValueError and the mean divide by zero, so report
        # it and move on instead of aborting the whole loop.
        print('{}\n number: 0; no sentences found\n'.format(fname))
        continue

    max_len = max(sent_lengths)
    min_len = min(sent_lengths)
    av_len = sum(sent_lengths) / len(sent_lengths)

    # The adjacent string literals are concatenated before .format() runs, so
    # this is a single template; the trailing '\n' leaves a blank line between
    # documents.
    print('{}\n number: {}; max words: {}; '
          'min words: {}; av words: {}'
          '\n'.format(fname, len(sents), max_len, min_len, av_len)
         )

test/austen-emma.txt
 number: 7493; max words: 275; min words: 1; av words: 25.58054183904978

test/austen-persuasion.txt
 number: 3654; max words: 215; min words: 1; av words: 26.78927203065134

test/austen-sense.txt
 number: 4833; max words: 346; min words: 2; av words: 29.250362093937515

test/bible-kjv.txt
 number: 29812; max words: 564; min words: 2; av words: 31.759425734603514

test/blake-poems.txt
 number: 355; max words: 105; min words: 2; av words: 23.208450704225353

test/bryant-stories.txt
 number: 2715; max words: 135; min words: 2; av words: 20.486556169429097

test/burgess-busterbrown.txt
 number: 1001; max words: 93; min words: 2; av words: 18.523476523476525

test/carroll-alice.txt
 number: 1625; max words: 202; min words: 2; av words: 20.498461538461537

test/chesterton-ball.txt
 number: 4624; max words: 202; min words: 2; av words: 21.021410034602077

test/chesterton-brown.txt
 number: 3712; max words: 115; min words: 2; av words: 23.009698275862068



## Calculate Word Frequencies
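In this step, we count how often each token appears in each document and show the three least frequent and the three most frequent. Note that the tokenizer output includes punctuation, so marks such as `,` and `.` appear among the most frequent tokens.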

In [40]:
# For each document, tally token frequencies and show the three rarest and the
# three most frequent tokens (counts are case-sensitive and include punctuation).
for fname, text in zip(filenames, texts):
    words = nltk.word_tokenize(text)
    word_counts = Counter(words)

    # Ascending by count; sorted() is stable, so tokens with equal counts keep
    # their first-seen order.
    sort_word_counts = sorted(word_counts.items(), key=lambda pair: pair[1])

    # One call prints: filename, rarest three, most frequent three, blank line.
    print(fname, sort_word_counts[:3], sort_word_counts[-3:], '', sep='\n')
    

test/austen-emma.txt
[('Austen', 1), ('1816', 1), ('twenty-one', 1)]
[('to', 5124), ('.', 6355), (',', 12016)]

test/austen-persuasion.txt
[('[', 1), ('Persuasion', 1), ('Jane', 1)]
[('the', 3118), ('.', 3119), (',', 7024)]

test/austen-sense.txt
[('Sense', 1), ('Sensibility', 1), ('Jane', 1)]
[('.', 4023), ('to', 4050), (',', 9901)]

test/bible-kjv.txt
[('[', 1), (']', 1), ('Old', 1)]
[('and', 38847), ('the', 62103), (',', 70573)]

test/blake-poems.txt
[('[', 1), ('Poems', 1), ('1789', 1)]
[('.', 221), ('the', 351), (',', 685)]

test/bryant-stories.txt
[('Stories', 1), ('Sara', 1), ('Cone', 1)]
[('.', 2049), ('the', 3086), (',', 3855)]

test/burgess-busterbrown.txt
[('Adventures', 1), ('Thornton', 1), ('W.', 1)]
[('the', 639), ('.', 843), (',', 886)]

test/carroll-alice.txt
[('Lewis', 1), ('Carroll', 1), ('1865', 1)]
[("'", 1127), ('the', 1516), (',', 2418)]

test/chesterton-ball.txt
[('1909', 1), ('SOMEWHAT', 1), ('AIR', 1)]
[('.', 3997), ('the', 4521), (',', 5223)]

test/chesterton-

## Part-of-Speech (POS) Tagging

POS tagging assigns to each token a part of speech indicating its grammatical role in the sentence structure.

This page includes more information about the tags themselves: [POS tag reference](https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/).


In [43]:
# Fetch the tagger model data used by the nltk.pos_tag call in the next cell.
# This is a no-op download if the package is already present in nltk_data.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /cs/student/dcornell/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [45]:
# POS-tag every document and preview the first five (token, tag) pairs.
# The whole document is tagged as one token sequence.
for fname, text in zip(filenames, texts):
    words = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(words)

    # One call prints: filename, first five tagged tokens, blank line.
    print(fname, pos_tags[:5], '', sep='\n')

test/austen-emma.txt
[('[', 'NNS'), ('Emma', 'NNP'), ('by', 'IN')]

test/austen-persuasion.txt
[('[', 'JJ'), ('Persuasion', 'NNP'), ('by', 'IN')]

test/austen-sense.txt
[('[', 'JJ'), ('Sense', 'NNP'), ('and', 'CC')]

test/bible-kjv.txt
[('[', 'VB'), ('The', 'DT'), ('King', 'NNP')]

test/blake-poems.txt
[('[', 'JJ'), ('Poems', 'NNP'), ('by', 'IN')]

test/bryant-stories.txt
[('[', 'NN'), ('Stories', 'NNS'), ('to', 'TO')]

test/burgess-busterbrown.txt
[('[', 'IN'), ('The', 'DT'), ('Adventures', 'NNP')]

test/carroll-alice.txt
[('[', 'JJ'), ('Alice', 'NNP'), ("'s", 'POS')]

test/chesterton-ball.txt
[('[', 'IN'), ('The', 'DT'), ('Ball', 'NNP')]

test/chesterton-brown.txt
[('[', 'IN'), ('The', 'DT'), ('Wisdom', 'NNP')]

