In [17]:
import pandas as pd

import nltk 
from nltk.tokenize import word_tokenize

# 1. Counting all words

First we create a list with all the names of our text files

In [18]:
books = ["A Dark Night's Work_word_counts.txt", "Cranford_word_counts.txt", "Mary Barton_word_counts.txt",
         "My Lady Ludlow_word_counts.txt", "North and South_word_counts.txt", "Ruth_word_counts.txt",
         "Sylvia's Lovers_word_counts.txt", "Wives and Daughters_word_counts.txt"]

In [19]:
all_data = []

for i in books:
    with open(i, "r", encoding = "utf-8") as f:
        data = f.read()
        all_data.append(data)

In [20]:
all_data

["chapter county town certain shire lived forty years ago one mr. wilkins conveyancing attorney considerable standing certain shire small county principal town contained four thousand inhabitants saying mr. wilkins principal lawyer hamley say little unless add transacted legal business gentry twenty miles round grandfather established connection father consolidated strengthened indeed wise upright conduct well professional skill obtained position confidential friend many surrounding families distinction visited among way mere lawyer ever done dined tables alone accompanied wife observed rode meet occasionally accident although well mounted squire among often persuaded little coquetting `` professional engagements '' `` wanted office '' run clients nay twice forgot usual caution first death rode home brush general knew place place held aristocratic county days let supposed way toadeater respected much would give unpalatable advice need would counsel unsparing reduction expenditure extra

In [21]:
len(all_data) #each element in the list is a book

8

Then we tokenize things but keeping them in the same list

In [22]:
tokens = []

for i in all_data:
    tokens.append(word_tokenize(i))

In [23]:
tokens

[['chapter',
  'county',
  'town',
  'certain',
  'shire',
  'lived',
  'forty',
  'years',
  'ago',
  'one',
  'mr.',
  'wilkins',
  'conveyancing',
  'attorney',
  'considerable',
  'standing',
  'certain',
  'shire',
  'small',
  'county',
  'principal',
  'town',
  'contained',
  'four',
  'thousand',
  'inhabitants',
  'saying',
  'mr.',
  'wilkins',
  'principal',
  'lawyer',
  'hamley',
  'say',
  'little',
  'unless',
  'add',
  'transacted',
  'legal',
  'business',
  'gentry',
  'twenty',
  'miles',
  'round',
  'grandfather',
  'established',
  'connection',
  'father',
  'consolidated',
  'strengthened',
  'indeed',
  'wise',
  'upright',
  'conduct',
  'well',
  'professional',
  'skill',
  'obtained',
  'position',
  'confidential',
  'friend',
  'many',
  'surrounding',
  'families',
  'distinction',
  'visited',
  'among',
  'way',
  'mere',
  'lawyer',
  'ever',
  'done',
  'dined',
  'tables',
  'alone',
  'accompanied',
  'wife',
  'observed',
  'rode',
  'meet',
  '

In [24]:
len(tokens) #so now we have each element in the list (a book) tokenized

8

And now we count the unique number of tokens per book

In [25]:
total_tokens = [] 

for i in tokens:
    total_tokens.append(len(i))

In [26]:
total_tokens

[33494, 33281, 84552, 35421, 84889, 81049, 96870, 139209]

And now let's create a dataframe with the total number of tokens per book

In [27]:
book_names = ["A Dark Night's Work", "Cranford", "Mary Barton", "My Lady Ludlow", "North and South", "Ruth",
              "Sylvia's Lovers", "Wives and Daughters"]

In [28]:
number_words = pd.DataFrame({"Titles" : book_names, "Total Words" : total_tokens})

In [29]:
number_words

Unnamed: 0,Titles,Total Words
0,A Dark Night's Work,33494
1,Cranford,33281
2,Mary Barton,84552
3,My Lady Ludlow,35421
4,North and South,84889
5,Ruth,81049
6,Sylvia's Lovers,96870
7,Wives and Daughters,139209


# 2. Counting Unique Words

And now let's count the unique number of tokens for each book! Let's go back to our tokens variable, and let's use a set to do that.

In [30]:
unique_tokens = []

for i in tokens:
    unique_tokens.append(set(i))

In [31]:
unique_tokens

[{'children',
  'hitches',
  'counsellor',
  'frankness',
  'travellers',
  'bring',
  'tempers',
  'waving',
  'clumps',
  'obliged',
  'lent',
  'ellinor',
  'seem',
  'register-office',
  'adam',
  'stouter',
  'exceeding',
  'evenings',
  'undoubtedly',
  'peril',
  'staring',
  'arrived',
  'inside',
  'twitch',
  'stayed',
  'practical',
  'smiling',
  'living',
  'observant',
  'referred',
  'goodnatured',
  'unreflecting',
  'turning',
  'sedately',
  'dreaming',
  'treasure',
  "'no",
  'stream',
  'approving',
  'snowdrops',
  'imparting',
  'morose',
  'foot',
  'events',
  'score',
  'obstacles',
  'rejoicing',
  'chagrined',
  'neighbourhood',
  'always',
  'deciding',
  'roses',
  'appear',
  'breakfast',
  'iron',
  'baser',
  'pacing',
  'folded',
  'materially',
  'agitation',
  'direct',
  'cautious',
  'destiny',
  'ostentatiously',
  'influential',
  'windows',
  'within',
  'honourable',
  'households',
  'haughty-looking',
  'understand',
  'visitor',
  'conquerin

In [32]:
len(unique_tokens)

8

In [33]:
number_unique_tokens = [len(i) for i in unique_tokens]

In [34]:
number_unique_tokens

[6266, 7021, 10425, 6978, 11676, 9835, 11900, 12037]

And now let's add that to our dataframe

In [35]:
number_words["Unique Words"] = number_unique_tokens

In [36]:
number_words

Unnamed: 0,Titles,Total Words,Unique Words
0,A Dark Night's Work,33494,6266
1,Cranford,33281,7021
2,Mary Barton,84552,10425
3,My Lady Ludlow,35421,6978
4,North and South,84889,11676
5,Ruth,81049,9835
6,Sylvia's Lovers,96870,11900
7,Wives and Daughters,139209,12037


And now let's save that to a csv dataframe!

In [37]:
number_words.to_csv("Gaskell_number_words.csv")