In [17]:
import os
import csv
from scipy.stats import fisher_exact

In [18]:
metadata_fn = "./metadata.csv"

corpus_directory = "./books"

In [19]:
# Functions

# Take a filename, return some text
def file2text(filename):
    f=open(filename)
    text=f.read()
    f.close()
    return text

# Take a filename for some standard spreadsheet form, return a list of lists
def file2rows(filename,sep = ',',no_ufeff=True):
    rows = []
    with open(filename, newline='\n') as metadata:
        reader = csv.reader(metadata, delimiter=",", quotechar="\"")
        for row in reader:
            rows.append(row)
    if no_ufeff:
        rows[0][0] = rows[0][0].replace("\ufeff",'')
    return rows

# Take a directory, give back a list of the files in it
def dir2filelist(somedir,path=True):
    files = os.listdir(somedir)
    if path:
        files = [os.path.join(somedir,filename) for filename in files]
    return files

# Take a directory, list the files in it of a given type
def dir2files(directory,path=True,file_type = ".txt"):
    files = dir2filelist(directory,path)
    for i in files[:]:
        if not i.endswith(file_type):
            files.remove(i)
    return files

# Take a path, get back the filename alone
def get_shortname(somepath):
    sn = os.path.basename(os.path.normpath(somepath))
    return sn

# Take a messy word and give back a tidier version
def cleanword(w,lower=True,no_apostrophe=False):
    # Usually want to make lowercase, but might not with names
    if lower: 
        w=w.casefold()
    while w and not w[0].isalpha():
        w=w[1:]
    while w and not w[-1].isalpha():
        w=w[:-1]
    # all of this is if you want to get rid of apostrophe s's
    if no_apostrophe:
        if w.endswith("'s"):
            w=w[:-2]
        if w.endswith("’s"):
            w=w[:-2]
    return w

# Take some text and give back a list of words
def text2words(sometext,no_apostrophe=False,no_blank=True):
    words = sometext.split()
    words = [cleanword(w,no_apostrophe=no_apostrophe) for w in words]
    if no_blank:
        words = [i for i in words if i != '']
    return words

# Take a filename and give back a list of words
def file2words(filename,no_apostrophe=False,no_blank=True):
    ftw_text = file2text(filename)
    ftw_words = text2words(ftw_text,no_apostrophe=no_apostrophe,no_blank=no_blank)
    return ftw_words

# Take some raw MDW data for a given word and get back some refined MDW data
def get_fishers(someword,somecountdict,someratedict,obs_exp=True,alternative='greater'):
    r = someratedict[someword]
    wc = sum(somecountdict.values())
    a = somecountdict[someword]
    b = wc - a
    c = round(r*wc)
    d = wc-c
    p = fisher_exact([[a,b],[c,d]],alternative=alternative)[1]
    if obs_exp == True:
        if c != 0:
            oe = a/c
        else:
            oe = "Inf"
        p = (p,oe)
    return p

# Takes a list of lists and writes out a spreadsheet
def lol2file(somelol,filename,sep=","):
    with open(filename,'w') as output_file:
        writer = csv.writer(output_file, delimiter=",", quotechar="\"")
        for row in somelol:
            writer.writerow(row)
    print("Wrote the file " + filename)
    
# Take a file and make a list based on linebreaks
def file2list(filename):
    f=open(filename)
    text=f.read()
    olist = text.splitlines()
    return olist

In [20]:
# Read in your metadata file
metadata_rows = file2rows(metadata_fn)

In [21]:
# See the headers and the column numbers if you want
for n,i in enumerate(metadata_rows[0]):
    print(n,i)

0 Author
1 Author Country of Origin
2 Modern Country of Origin
3 Author Lifespan
4 Title
5 Subtitle
6 Release Date
7 Publication Date
8 Subject
9 Language
10 Copyright Status
11 Download Link
12 Gutenberg Link


In [22]:
# Also specify which column has your filenames
title_column = 4

corpora = set()

file_corpora = {}

for row in metadata_rows[1:]:
    c = row[title_column]
    corpora.add(c)
    filename = f"{c}.txt"
    file_corpora[filename] = c
    
corpora = list(corpora)

In [23]:
# Get the counts for every word, by corpus and overall

files = dir2files(corpus_directory)

corpus_counts = {i:{} for i in corpora}
all_counts = {}
cull_counts = {}

for f in files:
    sn = get_shortname(f)
    print(sn)
    corpus = file_corpora[sn]
    words = file2words(f)
    for w in words:
        if w not in corpus_counts[corpus]:
            corpus_counts[corpus][w] = 0
        corpus_counts[corpus][w] += 1
        if w not in all_counts:
            all_counts[w] = 0
        all_counts[w] += 1
    for w in set(words):
        if w not in cull_counts:
            cull_counts[w] = 0
        cull_counts[w] += 1
        
        
rates = {}
total_wc = sum(all_counts.values())

for w,c in all_counts.items():
    rates[w] = c/total_wc

A Sentimental Journey Through France and Italy.txt
Studies in the Psychology of Sex, Volume 4.txt
The Backwash of War.txt
The Decameron, Volume I.txt
Studies in the Psychology of Sex, Volume 5.txt
Corinne; Or, Italy. Volume 1 (of 2).txt
Ulysses.txt
Poems and Ballads (Third Series).txt
The Satyricon — Complete.txt
Lysistrata.txt
Memoirs of Fanny Hill.txt
The History of the Decline and Fall of the Roman Empire.txt
Studies in the Psychology of Sex, Volume 6.txt
Candide.txt
Faust [part 1]. Translated Into English in the Original Metres.txt
The Adventures of Sherlock Holmes.txt
The Sorrows of Young Werther.txt
Sinister Street, vol. 2.txt
Studies in the Psychology of Sex, Volume 2.txt
The Awakening, and Selected Short Stories.txt
Adam Bede.txt
The Golden Asse.txt
The Fortunate Mistress (Parts 1 and 2).txt
Studies in the Psychology of Sex, Volume 3.txt
The Kama Sutra of Vatsyayana.txt
In Praise of Folly.txt
Sinister Street, vol. 1.txt
Richard II.txt
Principles of Political Economy.txt
Women i

### Make some decisions
* Pick a cutoff (words must appear x times or we ignore them)
 * A higher cutoff makes the process much faster
 * You may also find infrequent words less interesting to analyze (although maybe not!)
* Choose an alpha (.05 is standard)
* Decide whether you want to exclude stopwords
* Decide if you want to cull the words by setting a minimum number of documents in which they must appear.
 * That is, set how many different documents must contin a word in order for it to appear in your results. The idea is to balance between removing things like names that appear in one text but not things like "whale" which could characterize a Melville corpus against Dickens.
 * The higher you go, the more apt you are to exclude things like character names
 * One consideration: If the number is greater than the number of documents in the smallest subcorpus, you'll lose words that characterize that corpus. E.g., there are only 4 Fitzgerald novels in our author corpus. If you set your min_docs > 4, you'll lose any words used only by Fitzgerald.
* Adjust the headers on your output spreadsheet if you like

In [43]:
cutoff = 10
alpha = .05
exclude_stops = True
min_docs = 15

output_table = [['corpus','token_','count','p_value','obs/exp']]

In [44]:
if exclude_stops:
    stops = file2list('stopwords.txt')

true_cull = min(min_docs,len(files))
counts = {}
    
print('Calculating significance...')
for corpus,data in corpus_counts.items():
    print(f'Working on the {corpus} corpus...')
    for w,c in data.items():
        if c < cutoff:
            continue
        if cull_counts[w] < true_cull:
            continue
        if exclude_stops:
            if w in stops or len(w) == 1:
                continue
        p,oe = get_fishers(w,data,rates)
        if p < alpha:
            output_table.append([corpus,w,c,p,oe])
            if corpus in counts:
                counts[corpus] += 1
            else:
                counts[corpus] = 1
print('Complete!')

Calculating significance...
Working on the King Lear corpus...
Working on the The Writings of Thomas Paine — Volume 2 (1779-1792): The Rights of Man corpus...
Working on the The Adventures of Sherlock Holmes corpus...
Working on the The Satyricon — Complete corpus...
Working on the Studies in the Psychology of Sex, Volume 2 corpus...
Working on the The Complete Poetical Works of Percy Bysshe Shelley — Volume 3 corpus...
Working on the Mademoiselle Fifi corpus...
Working on the Adventures of Huckleberry Finn corpus...
Working on the Memoirs of Fanny Hill corpus...
Working on the The Complete Plays of Gilbert and Sullivan corpus...
Working on the Studies in the Psychology of Sex, Volume 3 corpus...
Working on the Emile corpus...
Working on the The Prince corpus...
Working on the Corinne; Or, Italy. Volume 1 (of 2) corpus...
Working on the The Sex Side of Life: An Explanation for Young People corpus...
Working on the The Decameron, Volume I corpus...
Working on the Lysistrata corpus...
Wo

In [47]:
print(len(output_table))
print(counts)
with open("counts.csv", "w") as counts_file:
    writer = csv.writer(counts_file)
    for k,v in counts.items():
        writer.writerow([k,v])

55465
{'King Lear': 106, 'The Writings of Thomas Paine — Volume 2 (1779-1792): The Rights of Man': 471, 'The Adventures of Sherlock Holmes': 378, 'The Satyricon — Complete': 249, 'Studies in the Psychology of Sex, Volume 2': 740, 'The Complete Poetical Works of Percy Bysshe Shelley — Volume 3': 585, 'Mademoiselle Fifi': 66, 'Adventures of Huckleberry Finn': 435, 'Memoirs of Fanny Hill': 357, 'The Complete Plays of Gilbert and Sullivan': 594, 'Studies in the Psychology of Sex, Volume 3': 764, 'Emile': 901, 'The Prince': 260, 'Corinne; Or, Italy. Volume 1 (of 2)': 488, 'The Sex Side of Life: An Explanation for Young People': 1, 'The Decameron, Volume I': 521, 'Lysistrata': 64, 'Sister Carrie': 569, 'The Poetical Works of Elizabeth Barrett Browning, Volume 4': 243, 'Studies in the Psychology of Sex, Volume 6': 1319, 'The Prose Writings of Heinrich Heine': 388, 'Une Vie, a Piece of String and Other Stories': 371, 'The Fortunate Mistress (Parts 1 and 2)': 485, 'Essays of Michel de Montaigne

In [30]:
# Write out the results

# Specify a filename for the results
output_filename = "small_corpus_mdw.csv"

lol2file(output_table, output_filename)

Wrote the file small_corpus_mdw.csv


In [13]:
target = 'maria'

for f in files:
    words = file2words(f)
    if words.count(target) > 0:
        print(get_shortname(f), words.count(target))

A Sentimental Journey Through France and Italy.txt 17
The Decameron, Volume I.txt 3
Corinne; Or, Italy. Volume 1 (of 2).txt 2
Ulysses.txt 9
Studies in the Psychology of Sex, Volume 6.txt 6
Studies in the Psychology of Sex, Volume 3.txt 2
The Prince.txt 1
Casanova's Homecoming.txt 3
The History of Don Quixote, Volume 2, Complete.txt 1
Les Misérables.txt 3
The Prose Writings of Heinrich Heine.txt 27
Une Vie, a Piece of String and Other Stories.txt 1
The Decameron, Volume II.txt 10
Uncle Tom's Cabin.txt 1
The History of Don Quixote, Volume 1, Complete.txt 8
Voltaire's Philosophical Dictionary.txt 1
Essays of Michel de Montaigne — Complete.txt 1
The Rainbow.txt 6
Dubliners.txt 40
The Memoirs of Jacques Casanova de Seingalt, 1725-1798. Complete.txt 21
The Child of Pleasure.txt 129
Notre-Dame de Paris.txt 3
The Complete Poetical Works of Percy Bysshe Shelley — Volume 3.txt 1
The Complete Poetical Works of Percy Bysshe Shelley — Volume 1.txt 2
Dante. An essay. To which is added a translation 