In [1]:
import pandas as pd

import nltk 
from nltk.tokenize import word_tokenize

# 1. Counting all words

First, we create a list with the names of all our text files.

In [2]:
# Names of the text files to load, one per book.
books = [
    "A Connecticut Yankee_word_counts.txt",
    "A Horse’s Tale_word_counts.txt",
    "Personal Recollections of Joan of Arc_word_counts.txt",
    "The Adventures of Tom Sawyer_word_counts.txt",
    "The American Claimant_word_counts.txt",
    "The Gilded Age_word_counts.txt",
    "The Mysterious Stranger_word_counts.txt",
    "The Prince and the Pauper_word_counts.txt",
    "The Tragedy of Pudd’nhead Wilson_word_counts.txt",
    "Tom Sawyer Abroad_word_counts.txt",
    "Tom Sawyer, Detective_word_counts.txt",
]

In [3]:
# Read every file listed in `books` as UTF-8 and keep its raw text.
# `all_data` ends up with one string per book, in the same order as `books`.
all_data = []

for filename in books:
    with open(filename, "r", encoding="utf-8") as handle:
        all_data.append(handle.read())

In [4]:
# NOTE(review): this displays the full text of every book at once; a slice
# such as all_data[0][:500] would keep the cell output readable.
all_data

 'soldier boy—privately buffalo bill horse spent life saddle—with good two hundred pounds without clothes telling much weigh war-path batteries belted six feet young ounce waste flesh straight graceful springy motions quick cat handsome face black hair dangling shoulders beautiful look nobody braver nobody stronger except yes person doubts fine see see beaded buck-skins back rifle peeping shoulder chasing hostile trail going like wind hair streaming behind shelter broad slouch yes sight look then—and part favorite horse dozens big carried eighty-one miles nightfall sunrise scout good fifty day day time large built business basis carried thousands thousands miles scout duty army gorge pass valley fort trading post buffalo-range whole sweep rocky mountains great plains know well know bugle-calls chief scouts army frontier makes us important position hold military service one needs good family possess education much common worthy place best-educated horse outside hippodrome everybody says

In [5]:
len(all_data)  # each element of the list is the full text read from one book file

11

Then we tokenize each book, keeping the resulting token lists together in a single list (one entry per book).

In [6]:
# Tokenize each book's text with NLTK. `tokens` holds one list of tokens
# per book, in the same order as `all_data`.
tokens = [word_tokenize(text) for text in all_data]

In [7]:
# NOTE(review): this displays every token of every book; a slice such as
# tokens[0][:20] would keep the cell output readable.
tokens

[['preface',
  'ungentle',
  'laws',
  'customs',
  'touched',
  'upon',
  'tale',
  'historical',
  'episodes',
  'used',
  'illustrate',
  'also',
  'historical',
  'pretended',
  'laws',
  'customs',
  'existed',
  'england',
  'sixth',
  'century',
  'pretended',
  'inasmuch',
  'existed',
  'english',
  'civilizations',
  'far',
  'later',
  'times',
  'safe',
  'consider',
  'libel',
  'upon',
  'sixth',
  'century',
  'suppose',
  'practice',
  'day',
  'also',
  'one',
  'quite',
  'justified',
  'inferring',
  'whatever',
  'one',
  'laws',
  'customs',
  'lacking',
  'remote',
  'time',
  'place',
  'competently',
  'filled',
  'worse',
  'one',
  'question',
  'whether',
  'thing',
  'divine',
  'right',
  'kings',
  'settled',
  'book',
  'found',
  'difficult',
  'executive',
  'head',
  'nation',
  'person',
  'lofty',
  'character',
  'extraordinary',
  'ability',
  'manifest',
  'indisputable',
  'none',
  'deity',
  'could',
  'select',
  'head',
  'unerringly',
  'als

In [8]:
len(tokens)  # still one element per book; each element is now that book's list of tokens

11

And now we count the total number of tokens per book

In [9]:
# Total number of tokens (repeats included) in each book, in book order.
total_tokens = [len(book_tokens) for book_tokens in tokens]

In [10]:
total_tokens

[55871, 7501, 36078, 34189, 29744, 78661, 18799, 35130, 29355, 15314, 10465]

And now let's create a dataframe with the total number of tokens per book

In [11]:
# Derive the display titles from the file names in `books` instead of typing
# them a second time, so the titles can never drift out of sync (or out of
# order) with the files that were actually loaded.
WORD_COUNTS_SUFFIX = "_word_counts.txt"

book_names = [
    # Strip the suffix when present; leave any unexpected name untouched.
    filename[: -len(WORD_COUNTS_SUFFIX)] if filename.endswith(WORD_COUNTS_SUFFIX) else filename
    for filename in books
]

In [12]:
# Summary table with one row per book: its title and its total token count.
number_words = pd.DataFrame(
    {
        "Titles": book_names,
        "Total Words": total_tokens,
    }
)

In [13]:
number_words

Unnamed: 0,Titles,Total Words
0,A Connecticut Yankee,55871
1,A Horse’s Tale,7501
2,Personal Recollections of Joan of Arc,36078
3,The Adventures of Tom Sawyer,34189
4,The American Claimant,29744
5,The Gilded Age,78661
6,The Mysterious Stranger,18799
7,The Prince and the Pauper,35130
8,The Tragedy of Pudd’nhead Wilson,29355
9,Tom Sawyer Abroad,15314


# 2. Counting Unique Words

And now let's count the number of unique tokens in each book! Let's go back to our `tokens` variable and use a set to do that, since a set keeps only one copy of each token.

In [14]:
# Collapse each book's token list into a set so every distinct token
# appears exactly once; one set per book, in book order.
unique_tokens = [set(book_tokens) for book_tokens in tokens]

In [15]:
# NOTE(review): this displays every unique token of every book; a small sample
# such as sorted(unique_tokens[0])[:20] would keep the cell output readable.
unique_tokens

[{'leisure',
  'places',
  'anomaly',
  'swept',
  'wherever',
  'examine',
  'reigned',
  'gazed',
  'horseback',
  'sense',
  'thirty-five',
  'embankment',
  'upper',
  'differ',
  'alters',
  'thankful',
  'constantinopolitanischerdudelsackspfeifenmachersgesellschafft',
  'dropping',
  'going',
  'lowdownest',
  'go-as-you-please',
  'belted',
  'commissioned',
  'part',
  'agent',
  '_master_',
  'eagle',
  'sapient',
  'star',
  'plucky',
  'alike',
  'lives',
  'tongs',
  'counts',
  'six-sevenths',
  'troubled',
  'yield',
  'grandchild',
  'steam-spout',
  'carries',
  'journey',
  'superior',
  'meadow',
  'vii',
  'laboratory',
  'quickly',
  'sank',
  'fist',
  '_any_body',
  'reached',
  'unslaked',
  'commmon',
  'kent',
  'worshipping',
  'postures',
  'dost',
  'dissolved',
  '_any_',
  'declined',
  'abyss',
  'busied',
  'rules',
  'leaves',
  'sentimentally',
  'guenever',
  'shorter',
  'disposed',
  'millions',
  'interruptions',
  'conf',
  'backed',
  'stephenson

In [16]:
len(unique_tokens)  # one set per element of `tokens`, so this equals len(tokens)

11

In [17]:
# Size of each book's token set, i.e. its number of distinct tokens.
number_unique_tokens = list(map(len, unique_tokens))

In [18]:
number_unique_tokens

[10541, 2940, 7826, 8041, 8304, 12113, 4779, 7887, 6479, 3495, 2590]

And now let's add that to our dataframe

In [19]:
# Adds the column to the existing frame in place. A plain list is matched to
# rows by position, so number_unique_tokens must be in the same book order
# as the rows of number_words.
number_words["Unique Words"] = number_unique_tokens

In [20]:
number_words

Unnamed: 0,Titles,Total Words,Unique Words
0,A Connecticut Yankee,55871,10541
1,A Horse’s Tale,7501,2940
2,Personal Recollections of Joan of Arc,36078,7826
3,The Adventures of Tom Sawyer,34189,8041
4,The American Claimant,29744,8304
5,The Gilded Age,78661,12113
6,The Mysterious Stranger,18799,4779
7,The Prince and the Pauper,35130,7887
8,The Tragedy of Pudd’nhead Wilson,29355,6479
9,Tom Sawyer Abroad,15314,3495


And now let's save that dataframe to a CSV file!

In [22]:
# Writes the frame to the working directory. With to_csv's default
# index=True the row index is saved as an unnamed first column, which shows
# up as "Unnamed: 0" if the file is read back without index_col=0.
number_words.to_csv("twain_number_words.csv")