# Corpus comparisons
Evaluating the difference Hawthorne's work makes between the corpora.

## Mean sentence length
This exemplifies how similar Hawthorne is to the rest of the corpus, as well as his small impact on corpus-wide measures.

In [548]:
import nltk
import random
import statistics
import re
import os
import scipy

### Hawthorne's sentences
For this test, I use the Library of America Hawthorne, as it is better transcribed than Gale.

In [417]:
nh_loa = '/Users/e/code/hawthorne/local/loa_nh/loa_hawthorne_all.txt'

In [164]:
def get_sent_lens(file):
    with open(file) as f:
        text = f.read()
        sents = nltk.tokenize.sent_tokenize(text)
        
    sent_lens = []

    for sent in sents:
        n_toks = len([x for x in re.split('\s+', sent) if x])
        sent_lens.append(n_toks)
    
    return sent_lens

In [165]:
nh_sents = get_sent_lens(nh_loa)

In [166]:
statistics.mean(nh_sents)

23.08811897217661

In [167]:
statistics.quantiles(nh_sents)

[10.0, 20.0, 34.0]

# Hawthorne's peers' sentences
Comparing Hawthorne's sentence lengths against those of his contemporaries:

In [65]:
comparison = '/Users/e/code/hawthorne/local/corpus_no_nh'

In [69]:
files = [os.path.join(comparison, x) for x in os.listdir(comparison) if x.endswith('.txt')]

In [79]:
random.shuffle(files) # randomize sample
sample_size = round(len(files) * 0.1) # get n for 10% sample
sample = files[:sample_size]

In [None]:
sent_lens = []

for i, file in enumerate(sample):
    sent_lens.extend(get_sent_lens(file))
    
    if i % 25 == 0:
        print('\r{} of {}'.format(i, len(sample)), end = '')

In [82]:
statistics.mean(sent_lens)

26.374018741025175

In [83]:
statistics.quantiles(sent_lens)

[11.0, 21.0, 35.0]

# Word Vectors
Here, I use the [hyperhyper](https://github.com/jfilter/hyperhyper) implementation of the SVD PPMI method of creating word vectors.

In [10]:
import hyperhyper as hy

## Hawthorne

In [11]:
nh_corpus = '/Users/e/code/hawthorne/local/gale_nh'

In [131]:
# generate
# corpus = hy.Corpus.from_text_files(nh_corpus, preproc_func = hy.tokenize_texts, keep_n = 5000)

build up vocab: 100%|██████████| 23/23 [00:03<00:00,  7.54it/s]
texts to ids: 100%|██████████| 23/23 [00:02<00:00, 10.55it/s]


In [132]:
# generate
# bunch = hy.Bunch("results/hy_hawthorne", corpus, force_overwrite = True)

In [422]:
# load
bunch = hy.Bunch('results/hy_hawthorne/')

In [423]:
vectors_nh, results_nh = bunch.svd(keyed_vectors = True, subsample = None)

In [424]:
vectors_nh.most_similar('hester')

[('prynne', 0.6416890621185303),
 ('pearl', 0.24967367947101593),
 ('miriam', 0.23768633604049683),
 ('she', 0.22650045156478882),
 ('hibbins', 0.2198188453912735),
 ('thou', 0.19863906502723694),
 ('mistress', 0.19824433326721191),
 ('thyself', 0.19154658913612366),
 ('pit', 0.1914704144001007),
 ('arthur', 0.18614283204078674)]

In [425]:
vectors_nh.most_similar('pearl')

[('child', 0.25472408533096313),
 ('hester', 0.24967367947101593),
 ('mother', 0.2277851104736328),
 ('naughty', 0.19791392982006073),
 ('hannah', 0.1876162588596344),
 ('babe', 0.17599552869796753),
 ('she', 0.17360076308250427),
 ('rigby', 0.1705891191959381),
 ('listened', 0.1696358472108841),
 ('hush', 0.16588468849658966)]

## Gale 1828-1864 including NH

In [151]:
path = '/Users/e/code/hawthorne/local/corpus'

In [644]:
# generate
# corpus = hy.Corpus.from_text_files(path, preproc_func = hy.tokenize_texts, keep_n = 10000)

# load
corpus = hy.Corpus.load('results/corpus_subsample_none/corpus.pkl')

In [646]:
# generate
# bunch = hy.Bunch("results/corpus_subsample_none", corpus, force_overwrite = True)

# load
bunch = hy.Bunch('results/corpus_subsample_none//')

In [647]:
vectors, results = bunch.svd(keyed_vectors = True, subsample = None)

In [648]:
vectors.most_similar('puritan')

[('methodist', 0.3934868574142456),
 ('primitive', 0.3582947552204132),
 ('dutch', 0.35332465171813965),
 ('quaker', 0.33089378476142883),
 ('protestant', 0.3207319974899292),
 ('baptist', 0.3153371214866638),
 ('catholic', 0.3104977011680603),
 ('jewish', 0.24452072381973267),
 ('christian', 0.23519401252269745),
 ('parson', 0.22869554162025452)]

In [727]:
vectors.most_similar('hester')

[('miriam', 0.3072657883167267),
 ('isabel', 0.303394615650177),
 ('christine', 0.29878801107406616),
 ('maud', 0.27142563462257385),
 ('constance', 0.2633606493473053),
 ('marian', 0.26311105489730835),
 ('theresa', 0.25412988662719727),
 ('annie', 0.2525578439235687),
 ('pauline', 0.2524963617324829),
 ('linda', 0.25059691071510315)]

## Gale 1828-1864 excluding NH

In [156]:
path = '/Users/e/code/hawthorne/local/corpus_no_nh'

In [649]:
# generate
# corpus_no_nh = hy.Corpus.from_text_files(path, preproc_func = hy.tokenize_texts, keep_n = 10000)

# load
corpus_no_nh = hy.Corpus.load('results/corpus_no_nh_subsample_none/corpus.pkl')

In [650]:
# generate
# bunch = hy.Bunch("results/corpus_no_nh_subsample_none", corpus, force_overwrite=True)

# load
bunch = hy.Bunch('results/corpus_no_nh_subsample_none//')

In [651]:
vectors_no_nh, results_no_nh = bunch.svd(keyed_vectors = True)

In [652]:
vectors_no_nh.most_similar('puritan')

[('methodist', 0.34290313720703125),
 ('quaker', 0.3168374300003052),
 ('primitive', 0.29920345544815063),
 ('dutch', 0.29672470688819885),
 ('baptist', 0.2812960147857666),
 ('catholic', 0.27922335267066956),
 ('english', 0.2629094123840332),
 ('protestant', 0.2529698610305786),
 ('christian', 0.24499836564064026),
 ('yankee', 0.23915767669677734)]

In [724]:
vectors_no_nh.most_similar('hester')

[('julie', 0.2691107392311096),
 ('christine', 0.2456360012292862),
 ('anna', 0.2245090901851654),
 ('estelle', 0.21444502472877502),
 ('viola', 0.20676776766777039),
 ('julia', 0.20160716772079468),
 ('emma', 0.20148344337940216),
 ('adeline', 0.19821283221244812),
 ('theresa', 0.1944657564163208),
 ('apd', 0.1920011192560196)]

# Comparing vectors across models
In order to compare across models, we take aggregate the differences of the cosine similarities between every vector in the model.

Because SVDs of PPMI are deterministic, differences between the two models are attributable to the presence or absence of Hawthorne's work.

In [653]:
import pandas as pd
from multiprocessing import Pool
import pickle
import time
import os

In [654]:
def get_vocab(c):
    '''
    Gets vocabulary from hyperhyper corpus object, c.
    '''
    return set([x for x in c.vocab.values()])

In [655]:
def get_shared_vocab(c1, c2):
    '''
    Returns shared vocab between hyperhyper corpus objects.
    '''
    return list(get_vocab(c1) & get_vocab(c2))

In [656]:
def diff_vector(word, v1, v2, vocab = None, topn = 10000, pkl = False):
    '''
    Takes hyperhyper vector objects and returns the difference between those vectors as a dict.
    '''
    # get cosine similarities
    sim1 = v1.most_similar(word, topn = topn)
    sim2 = v2.most_similar(word, topn = topn)

    # calculate differences
    sim1 = pd.Series(data = [x[1] for x in sim1], index = [x[0] for x in sim1])
    sim2 = pd.Series(data = [x[1] for x in sim2], index = [x[0] for x in sim2])
    diff = sim1 - sim2
    total = diff.abs().sum()
    
    # optionally pickle output
    if pkl:
        d = {'word'          : word,
             'abs_diff'      : total,
             'sim1'          : sim1,
             'sim2'          : sim2}
        
        fn = str(round(time.time())) + '_' + word + '.pkl'
        path = os.path.join('results/vec_diffs', fn)

        with open(path, 'wb') as f:
            pickle.dump(d, f)
            f.close()
        print('\r{} pickled at {}'.format(word, path))
    
    # return observation
    d = {'word'          : word,
         'abs_diff'      : total}

    return d

In [657]:
# linear (takes 5-10 minutes)

def diff_vectors(v1, v2, vocab):
    '''
    Takes hyperhyper vector objects (v1, v2), and calculates the absolute difference between them.
    
    Corpus objects (c1, c2) used to extract vocabularies and calculate intersection.
    '''
    
    l = []
    
    for i, word in enumerate(vocab):
        l.append(diff_vector(word, v1, v2))
        if i % 100 == 0:
            pct = round((i / len(vocab)) * 100)
            print('\r{}%'.format(pct), end = '')
    
    # pickle
    fn = str(round(time.time())) + '_diff_vectors.pkl'
    path = os.path.join('results/vec_diffs', fn)
    
    with open(path, 'wb') as f:
        pickle.dump(l, f)
        f.close()
    print('\rpickle: {}'.format(path))
            
    return l

In [658]:
vocab = get_shared_vocab(corpus, corpus_no_nh)

In [660]:
# generate
diffs = diff_vectors(vectors, vectors_no_nh, vocab)

pickle: results/vec_diffs/1605914440_diff_vectors.pkl


# Munge results

In [664]:
# load
diffs = pickle.load(open('/Users/e/code/hawthorne/results/vec_diffs/1605914440_diff_vectors.pkl', 'rb'))

In [665]:
df = pd.DataFrame(diffs)

In [668]:
df['abs_diff'].describe()

count    9985.000000
mean      213.882300
std        27.214233
min       119.660099
25%       194.109672
50%       211.449624
75%       232.322910
max       350.309245
Name: abs_diff, dtype: float64

In [669]:
# Add corpus raw frequency to output
d = {v : corpus.counts[k] for k,v in corpus.vocab.items()}
e = {v : corpus_no_nh.counts[k] for k,v in corpus_no_nh.vocab.items()}

In [671]:
# Make pandas objects
ds = pd.Series(d)
ds.name = '# Gale'
es = pd.Series(e)
es.name = '# Gale - NH'

In [672]:
counts = pd.merge(ds, es, left_on = ds.index, right_on = es.index)

In [674]:
counts.rename(columns = {'key_0':'word'}, inplace=True)

In [676]:
df = pd.merge(df, counts, on = 'word').set_index('word')

In [677]:
df.head()

Unnamed: 0_level_0,abs_diff,# Gale,# Gale - NH
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
twice,234.92429,11751,11694
muscle,206.839319,2026,2021
idiot,226.529023,1520,1509
trio,223.319823,1318,1317
again,230.995042,194486,193522


Hawthorne accounts for about 1 million words (`1062724`) in the model:

In [300]:
nh_n = df['# Gale'].sum() - df['# Gale - NH'].sum()

In [310]:
nh_n

1062724

In percentage terms:

In [678]:
'{}%'.format(round((nh_n / df['# Gale'].sum()) * 100, 3))

'0.512%'

In [681]:
# Per-word proportion
observed = 1 - (df['# Gale - NH'] / df['# Gale'])
expected = nh_n / df['# Gale'].sum()
df['obs / exp'] = observed / expected

# Create status for obs/exp for plotting
df['o/e labels'] = ['more' if x > 1 else 'less' for x in df['obs / exp']]

In [696]:
# adapted from David Bamman's anlp19
import operator

def chi_square(one_counts, two_counts):

    one_sum=0.
    two_sum=0.
    vocab={}
    for word in one_counts.index:
        one_sum+=one_counts[word]
        vocab[word]=1
    for word in two_counts.index:
        vocab[word]=1
        two_sum+=two_counts[word]

    N=one_sum+two_sum
    vals={}
    
    for word in vocab:
        O11=one_counts[word]
        O12=two_counts[word]
        O21=one_sum-one_counts[word]
        O22=two_sum-two_counts[word]
        
        # We'll use the simpler form given in Manning and Schuetze (1999) 
        # for 2x2 contingency tables: 
        # https://nlp.stanford.edu/fsnlp/promo/colloc.pdf, equation 5.7
        
        vals[word]=(N*(O11*O22 - O12*O21)**2)/((O11 + O12)*(O11+O21)*(O12+O22)*(O21+O22))
        
    sorted_chi = sorted(vals.items(), key=operator.itemgetter(1), reverse=True)
    one=[]
    two=[]
    
    for k,v in sorted_chi:
        if one_counts[k]/one_sum > two_counts[k]/two_sum:
            one.append(k)
        else:
            two.append(k)
        
    return one, vals

In [None]:
dws, vals = chi_square(df['# Gale'], df['# Gale - NH'])

In [702]:
# per chi2
df['nh_distinctive'] = df.index.isin(dws)

In [703]:
df['chi2'] = pd.Series(vals)

In [707]:
df['n_diff'] = df['# Gale'] - df['# Gale - NH']

In [712]:
# add stopword metadata
stopwords = nltk.corpus.stopwords.words('english')
df['stopword'] = df.index.isin(stopwords)

In [719]:
# dictionary validation
with open('/Users/e/Documents/Literary Lab/word lists/oed_wordlist.txt', 'r') as f:
    oed_words = [x for x in f.read().split('\n') if x]

In [720]:
df['dict_word'] = df.index.isin(oed_words)

This is the dataset used to produce graphs in Tableau:

In [722]:
df.to_csv('/Users/e/code/hawthorne/hawthorne_diffs.csv')

# Evaluating dissimilarity by rank change
How does Hawthorne re-rank vector similarities?

In [786]:
# selected from Tableau
keywords = ['likewise', 'puritan', 'minister', 'artist', 'substance', 'methinks', 'whether', 'lifetime',
           'world', 'mankind', 'pilgrims', 'fling', 'province', 'painter', 'brotherhood', 'clergyman',
           'artist', 'fireside', 'might', 'hither', 'reverend', 'whatever', 'merely', 'fireside', 'antique',
           'wrought', 'airy', 'discern', 'shadow', 'image', 'actual', 'flung', 'dusky']

In [795]:
def rank_similarities(word, vectors):
    c = 1
    d = {}
    
    for x in vectors.most_similar(word, topn=10000):
        d[x[0]] = c
        c += 1
    
    return d

In [810]:
def get_rank_change(word, v1, v2, topn = 20, spearman = True):
    d = rank_similarities(word, v1)
    e = rank_similarities(word, v2)
    data = pd.concat([pd.Series(d), pd.Series(e)], axis = 1)
    data.columns = ['v1_rank', 'v2_rank']
    data['rank_change'] = data['v1_rank'] - data['v2_rank']
    # filter
    data.dropna(axis=0, inplace=True)
    subset = data[data['v1_rank'] <= topn]
    subset.sort_values('rank_change', ascending = False, inplace = True)
    # write
    if spearman:
        sp = stats.spearmanr(subset['v1_rank'], subset['v2_rank'])[0]
        fp = '/Users/e/code/hawthorne/results/rank_changes'
        fp = os.path.join(fp, '{}_{}_rank_change.csv'.format(str(sp)[:4], word))
        subset.to_csv(fp)
    else:
        fp = '/Users/e/code/hawthorne/results/rank_changes'
        fp = os.path.join(fp, '{}_rank_change.csv'.format(word))
    # notify
    print(fp)
    return

In [811]:
for word in keywords:
    get_rank_change(word, vectors_no_nh, vectors, topn = 100)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



/Users/e/code/hawthorne/results/rank_changes/0.54_likewise_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.76_puritan_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.80_minister_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.68_artist_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.53_substance_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.61_methinks_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.58_whether_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.64_lifetime_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.66_world_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.56_mankind_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.70_pilgrims_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.74_fling_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.86_province_rank_change.csv
/Users/e/code/hawthorne/results/ran

In [812]:
# testing low volatility words
for word in ['five', 'six', 'twenty']:
    get_rank_change(word, vectors_no_nh, vectors, topn = 100)

/Users/e/code/hawthorne/results/rank_changes/0.95_five_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.91_six_rank_change.csv
/Users/e/code/hawthorne/results/rank_changes/0.88_twenty_rank_change.csv




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

