## Burrows's Delta

In [None]:
import re
from collections import Counter

def tokenize(filename):
    '''Read a text file and return its lowercased word tokens.

    filename -- path of the text file to read

    Returns a list of the non-empty strings obtained by splitting the
    lowercased text on every non-word character, in file order.
    '''
    # The context manager closes the file handle as soon as the text is read.
    with open(filename, 'r') as handle:
        text = handle.read().lower()
    # Raw string: in a plain literal, backslash-W is an invalid escape
    # sequence that Python warns about.
    # Consecutive separators yield empty strings, which are filtered out.
    return [word for word in re.split(r'\W', text) if word != '']

In [None]:
samples = ['Gratian0', 'Gratian1', 'Gratian2']
filenames = [sample + '.txt' for sample in samples]
# Total number of word tokens in each sample, keyed by sample name.
lengths = {
    sample: len(tokenize(filename))
    for sample, filename in zip(samples, filenames)
}
lengths

In [None]:
def occurrences(tokens):
    '''Count the tokens and return the counts as an ordered dictionary.

    tokens -- iterable of hashable tokens (words)

    Returns a dict mapping each distinct token to its number of
    occurrences. Entries are ordered by descending count; tokens with
    equal counts are ordered ascending by the token itself.
    '''
    tally = Counter(tokens)
    # Counter.most_common() does not break ties by token, so sort
    # explicitly to keep the ordering deterministic.
    ranked = sorted(tally.items(), key=lambda item: (-item[1], item[0]))
    return dict(ranked)

def features(texts, n):
    '''Return the first n words of the pooled occurrence ranking.

    texts -- sample names; each sample is read from the file name + '.txt'
    n -- number of words to keep

    The tokens of all samples are pooled into a single list before
    counting, so the ranking reflects the combined corpus.
    '''
    pooled = []
    for name in texts:
        pooled.extend(tokenize(name + '.txt'))
    ranked = occurrences(pooled)
    # Iterating the dictionary yields its keys in ranked order.
    return list(ranked)[:n]

# The four top-ranked words across all samples serve as the feature set.
mfws = features(texts=samples, n=4)
mfws

In [None]:
def counts(features, subcorpora):
    '''Count the occurrences of each feature word in each subcorpus.

    features -- words to count; the counts are returned in this order
    subcorpora -- sample names; each is read from the file name + '.txt'

    Returns a dict mapping each subcorpus name to a list of counts
    parallel to features. A word absent from a subcorpus counts as 0.
    '''
    columns = {}
    for subcorpus in subcorpora:
        # Named 'tallies' rather than 'all' so that the builtin all()
        # is not shadowed.
        tallies = occurrences(tokenize(subcorpus + '.txt'))
        columns[subcorpus] = [tallies.get(feature, 0) for feature in features]
    return columns

# Preview the per-sample counts of each feature word.
counts(features=mfws, subcorpora=samples)

**Once we've gotten to this point, we've gathered all the preliminary information we need, and are ready to move the analysis into Pandas dataframes.**

In [None]:
import pandas as pd

# One column of raw counts per sample, one row per feature word.
df_counts = pd.DataFrame(data=counts(mfws, samples), index=mfws)
df_counts

In [None]:
# Single-row frame holding each sample's total word count.
df_lengths = pd.DataFrame(data=lengths, index=['words'])
df_lengths

**Explain use of occurrences per 1,000 words instead of percent here. Using occurrences per 1,000 words is more convenient than using percentages, because at that scale the word frequency values we are concerned with (at least most of them) are greater than 1.0.**

In [None]:
# Convert raw counts to occurrences per 1,000 words. The lengths row is
# passed as a bare array, so it is matched to the columns by position.
frequencies = df_counts.div(df_lengths.values).mul(1000)
frequencies


This is the point where we need to temporarily drop the Gratian0 column. We're only interested at this point in calculating the mean and sample standard deviation of the values in the two columns we're comparing the candidate to: Gratian1 and Gratian2.

In [None]:
# Keep only the two comparison columns; Gratian0 is left out so that it
# does not enter the mean and standard deviation computed from this frame.
selected = frequencies.loc[:, ['Gratian1', 'Gratian2']]
selected

In [None]:
# Row-wise mean: each word's average frequency across the selected columns.
means = selected.mean(axis='columns').to_frame(name='mean')
means

$s=\sqrt{\frac{1}{N - 1}\sum_{i=1}^N(x_i-\bar{x})^2}$

In [None]:
# Row-wise sample standard deviation; ddof=1 gives the N - 1 denominator.
stds = selected.std(axis='columns', ddof=1).to_frame(name='std')
stds

$z=\frac{x - \bar{x}}{s}$

In [None]:
# z-score every column of frequencies (Gratian0 included) against the
# per-word mean and standard deviation. Both are one-column arrays, so
# they broadcast across all columns.
zs = frequencies.sub(means.values).div(stds.values)
zs

**Again, remember that the means and standard deviations have been computed from the values in the Gratian1 and Gratian2 columns *only*!**

Now, break the consolidated z-scores dataframe into two dataframes: one for the hypothetical case statements (*themata*), the other for the first- and second-recension *dicta* (including the *dicta* from *de Pen*.) with which we want to compare the case statements.

In [None]:
# a: z-scores of the Gratian0 column, kept as a one-column frame.
# b: z-scores of the two columns it is compared against.
a = zs.loc[:, ['Gratian0']]
b = zs.loc[:, ['Gratian1', 'Gratian2']]
a

In [None]:
# Absolute z-score difference, word by word, between Gratian0 (a) and each
# comparison column (b). Burrows's Delta is built from absolute
# differences: with signed values, differences of opposite sign would
# cancel when aggregated. a.values is a one-column array, so it
# broadcasts across both columns of b.
c = (b - a.values).abs()
c

In [None]:
# Burrows's Delta for each comparison column is the mean, over the feature
# words, of the absolute z-score differences, so average down each column
# rather than summing. c must hold absolute differences for the result to
# be a distance.
c.mean(axis=0)