In [24]:
# Prints number of words and word length for each of six Jane Austen novels.
# Also prints mean word length of a Jane Austen novel, as based on the sample of six novels.

import os
import string

# The titles of the six novels; each one is later turned into a file name.
# NOTE: the "glob" library could collect the file names for us, so that we would not have to write them out.
# Writing them out by hand has some pedagogical value though: it shows more uses of loops and of
# string concatenation.
texts = ['Emma', 'Mansfield', 'Northanger', 'Persuasion', 'Pride', 'Sense']

# The folder that holds the data. We move into it so the files can be opened by name alone.
mypath = '/Users/ethan/Documents/Scripts/Teaching/Development-of-Language/DoL_2017/Data/Jane-Austen'
os.chdir(mypath)

# Build the list of characters to strip from the text before counting words.
# "string.punctuation" only covers ASCII punctuation marks, and we already know that at least
# Pride and Prejudice has a closing curly quotation mark that it does not catch, so that character is
# listed explicitly. The digits are listed too, so that numbers are not counted. This list could contain
# any character that we do not want to count.
# NOTE(review): the opening curly quotation mark and curly apostrophes are not in this list -- confirm
# whether they occur in the texts and should be removed as well.
removelist = ['”', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# the punctuation marks as a list of single characters, in the fixed order that "string.punctuation" defines
# (going through a set first would shuffle the order differently on every run, for no benefit)
punct = list(string.punctuation)

# add every punctuation mark in "punct" to our list of things to remove
removelist.extend(punct)

# uncomment to see what is being removed
# print(removelist)

# set up an empty list to hold the mean word length of each novel, and one for the number of words in each novel
# NOTE: it is important that these empty lists be set up outside of the loop that goes through the texts. Why?
mean_length_counter = []
num_words_counter = []

# Turn the list of unwanted characters into a set once, before the loop. Asking "is x in this set?" takes the
# same time however many characters we want to remove, whereas asking the same of a list scans the list from
# the start, and we ask that question for every single character of every novel.
remove_chars = set(removelist)

# loop through all the texts
for title in texts:
    # take the title from the list of titles above, add "Austen-" at the beginning and ".txt" at the end
    # to create the full file name
    filename = 'Austen-' + title + '.txt'

    # Read the whole file into one string. The encoding is stated explicitly because the texts contain
    # non-ASCII characters (such as the curly quotation mark in "removelist"), and the default encoding
    # that Python picks is not UTF-8 on every computer.
    # NOTE(review): assumes the files are saved as UTF-8 -- confirm.
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()

    # we have everything we need from the file, so it is closed before the processing starts

    # lower-case the text, drop every unwanted character, then split on whitespace to get a list of words
    text = text.lower()
    text = ''.join(x for x in text if x not in remove_chars)
    text = text.split()

    # uncomment to check the first 100 words in each novel
    # print(text[0:100])

    # find the number of words in the file
    numwords = len(text)

    # add the number of words for the present novel to the list for all the novels
    num_words_counter.append(numwords)

    # the length of every word in the novel, in the order the words appear
    word_length_counter = [len(word) for word in text]

    # find the mean of all the word lengths we have collected in word_length_counter
    # (a file with no words at all would raise ZeroDivisionError here)
    av = sum(word_length_counter) / len(word_length_counter)

    # add the mean word length for the current novel to the list of mean word lengths for all the novels
    mean_length_counter.append(av)

    # print out the information we want
    print('The number of words in ' + title + ' is: ' + str(numwords))
    print(' ')
    print('The mean length of a word in ' + title + ' is ' + str(av))
    print(' ')


# Average the per-novel figures. Every novel counts equally here, however long it is: the first figure is
# the mean of the six per-novel means, not the mean over all the words pooled together.
grand_average_wordlength = sum(mean_length_counter) / len(mean_length_counter)
grand_average_numwords = sum(num_words_counter) / len(num_words_counter)

print('The mean length of a word in a Jane Austen novel is: ' + str(grand_average_wordlength))
print('The mean number of words in a Jane Austen novel is: ' + str(grand_average_numwords))

The number of words in Emma is: 157438
 
The mean length of a word in Emma is 4.367725707897712
 
The number of words in Mansfield is: 159536
 
The mean length of a word in Mansfield is 4.351807742453114
 
The number of words in Northanger is: 77036
 
The mean length of a word in Northanger is 4.428098551326652
 
The number of words in Persuasion is: 83237
 
The mean length of a word in Persuasion is 4.3838917788964045
 
The number of words in Pride is: 121470
 
The mean length of a word in Pride is 4.436815674652178
 
The number of words in Sense is: 118510
 
The mean length of a word in Sense is 4.430976288920766
 
The mean length of a word in a Jane Austen novel is: 4.399885957357804
The mean number of words in a Jane Austen novel is: 119537.83333333333
