In [1]:
import glob
import nltk
from pprint import pprint
from collections import Counter

In [2]:
# Collect every .txt file under test/ and read each one fully into memory.
# glob makes no ordering guarantee (it depends on the file system), so the
# paths are sorted: later cells index into `texts` (e.g. texts[0]) and print
# per-file results in list order, which must be the same on every run.
filenames = sorted(glob.glob('test/*.txt'))
texts = []
for fname in filenames:
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used - confirm the corpus encoding and pass it to open() if it differs.
    with open(fname, 'r') as f:
        texts.append(f.read())
print('read', len(texts), 'texts.')

read 10 texts.


In [3]:
# Take the first loaded text as the working sample and preview its
# opening 200 characters.
single_text = texts[0]
print(single_text[0:200])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; an


In [4]:
# Fetch the 'punkt' package through NLTK's downloader (presumably the data
# the tokenizers in the following cells rely on - confirm for the installed
# NLTK version). The call's return value is displayed as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /cs/student/dcornell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Break the sample text into word-level tokens and show the first five.
words = nltk.word_tokenize(single_text)
words[:5]

['[', 'Emma', 'by', 'Jane', 'Austen']

In [6]:
# Break the sample text into sentences and show the first two.
sentences = nltk.sent_tokenize(single_text)
sentences[:2]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."]

In [7]:
# Report how many word tokens each text contains, next to its file name.
for fname, text in zip(filenames, texts):
    words = nltk.word_tokenize(text)
    n_words = len(words)
    print('found', n_words, 'words in', fname)

found 191673 words in test/austen-emma.txt
found 97888 words in test/austen-persuasion.txt
found 141367 words in test/austen-sense.txt
found 946812 words in test/bible-kjv.txt
found 8239 words in test/blake-poems.txt
found 55621 words in test/bryant-stories.txt
found 18542 words in test/burgess-busterbrown.txt
found 33310 words in test/carroll-alice.txt
found 97203 words in test/chesterton-ball.txt
found 85412 words in test/chesterton-brown.txt


In [8]:
# Report how many sentences each text contains, next to its file name.
for fname, text in zip(filenames, texts):
    sents = nltk.sent_tokenize(text)
    n_sents = len(sents)
    print('found', n_sents, 'sentences in', fname)

found 7493 sentences in test/austen-emma.txt
found 3654 sentences in test/austen-persuasion.txt
found 4833 sentences in test/austen-sense.txt
found 29812 sentences in test/bible-kjv.txt
found 355 sentences in test/blake-poems.txt
found 2715 sentences in test/bryant-stories.txt
found 1001 sentences in test/burgess-busterbrown.txt
found 1625 sentences in test/carroll-alice.txt
found 4624 sentences in test/chesterton-ball.txt
found 3712 sentences in test/chesterton-brown.txt


In [9]:
# For each text: split into sentences, tokenize every sentence into words,
# then summarise sentence lengths (count, longest, shortest, mean).
for fname, text in zip(filenames, texts):
    sents = nltk.sent_tokenize(text)
    sent_words = list(map(nltk.word_tokenize, sents))

    # One length (in word tokens) per sentence.
    sent_lengths = list(map(len, sent_words))
    n_lengths = len(sent_lengths)
    max_len, min_len = max(sent_lengths), min(sent_lengths)
    av_len = sum(sent_lengths) / n_lengths

    summary = ('{}\n number: {}; max words: {}; '
               'min words: {}; av words: {}'
               '\n'.format(fname, len(sents), max_len, min_len, av_len))
    print(summary)

test/austen-emma.txt
 number: 7493; max words: 275; min words: 1; av words: 25.58054183904978

test/austen-persuasion.txt
 number: 3654; max words: 215; min words: 1; av words: 26.78927203065134

test/austen-sense.txt
 number: 4833; max words: 346; min words: 2; av words: 29.250362093937515

test/bible-kjv.txt
 number: 29812; max words: 564; min words: 2; av words: 31.759425734603514

test/blake-poems.txt
 number: 355; max words: 105; min words: 2; av words: 23.208450704225353

test/bryant-stories.txt
 number: 2715; max words: 135; min words: 2; av words: 20.486556169429097

test/burgess-busterbrown.txt
 number: 1001; max words: 93; min words: 2; av words: 18.523476523476525

test/carroll-alice.txt
 number: 1625; max words: 202; min words: 2; av words: 20.498461538461537

test/chesterton-ball.txt
 number: 4624; max words: 202; min words: 2; av words: 21.021410034602077

test/chesterton-brown.txt
 number: 3712; max words: 115; min words: 2; av words: 23.009698275862068



In [10]:
from collections import Counter


# Relative frequency of the tokens 'he' and 'she' in each text:
# occurrences of the token divided by the total number of word tokens.
for fname, text in zip(filenames, texts):
    words = nltk.tokenize.word_tokenize(text)
    total = len(words)

    # Counter returns 0 for tokens that never occur, so a text without
    # 'he' or 'she' prints 0.0 instead of raising KeyError.
    word_counts = Counter(words)

    print(fname)
    # Lookups are exact-match on the token, so capitalised forms such as
    # 'He' / 'She' are separate keys and are not included in these ratios.
    print('he', word_counts['he']/total)
    print('she', word_counts['she']/total)
    print()

test/austen-emma.txt
he 0.007121503811178413
she 0.009255346345077292

test/austen-persuasion.txt
he 0.007508581235697941
she 0.008366704805491992

test/austen-sense.txt
he 0.00632396528185503
she 0.009429357629432611

test/bible-kjv.txt
he 0.010207939907816968
she 0.0009811873951745436

test/blake-poems.txt
he 0.004248088360237893
she 0.0008496176720475786

test/bryant-stories.txt
he 0.015677531867460133
she 0.006472375541611981

test/burgess-busterbrown.txt
he 0.030309567468450007
she 0.00010786322942508899

test/carroll-alice.txt
he 0.0029420594416091263
she 0.015190633443410387

test/chesterton-ball.txt
he 0.010791847988230815
she 0.0007510056273983313

test/chesterton-brown.txt
he 0.012246522736851964
she 0.0010420081487378823



In [11]:
# Fetch the 'averaged_perceptron_tagger' package through NLTK's downloader
# (presumably the model behind nltk.pos_tag used in the next cell - confirm
# for the installed NLTK version). The return value is shown as cell output.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /cs/student/dcornell/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
# For each text: POS-tag all word tokens, keep those whose tag begins with
# 'VB', and print the five most frequent ones in ascending order of count.
for fname, text in zip(filenames, texts):
    words = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(words)

    # pos_tags is a sequence of (token, tag) pairs; keep the verb tokens.
    verb_tok = [tok for tok, tag in pos_tags if tag[:2] == 'VB']
    verb_cts = Counter(verb_tok)

    # Ascending stable sort by count; the tail holds the most frequent verbs.
    sort_verbs = sorted(verb_cts.items(), key=lambda pair: pair[1])

    print(fname)
    print(sort_verbs[-5:])
    print()

test/austen-emma.txt
[('is', 1221), ('have', 1301), ('had', 1605), ('be', 1965), ('was', 2383)]

test/austen-persuasion.txt
[('been', 495), ('have', 583), ('be', 949), ('had', 1177), ('was', 1330)]

test/austen-sense.txt
[('is', 732), ('have', 806), ('had', 969), ('be', 1304), ('was', 1846)]

test/bible-kjv.txt
[('have', 3842), ('said', 3995), ('was', 4515), ('is', 6832), ('be', 6877)]

test/blake-poems.txt
[('have', 17), ('are', 21), ('be', 25), ('was', 31), ('is', 45)]

test/bryant-stories.txt
[('were', 194), ('is', 243), ('had', 293), ('said', 452), ('was', 713)]

test/burgess-busterbrown.txt
[('see', 72), ('did', 78), ('is', 167), ('had', 220), ('was', 287)]

test/carroll-alice.txt
[("'s", 121), ('be', 145), ('had', 183), ('was', 361), ('said', 456)]

test/chesterton-ball.txt
[('be', 401), ('had', 556), ('said', 652), ('is', 717), ('was', 926)]

test/chesterton-brown.txt
[('be', 305), ('said', 415), ('is', 463), ('had', 526), ('was', 1149)]

