## Burrows's Delta (continued)

In [1]:
import re

def get_tokens(filename):
    '''Open a text file and return its list of lower-cased word tokens.

    Parameters:
        filename: path of the text file to read (opened in text mode with
            the platform's default encoding).

    Returns:
        list of str: every maximal run of word characters (regex \\w) in
        the lower-cased text, in order of appearance. Punctuation and
        whitespace are discarded; an empty file yields an empty list.
    '''
    # the context manager guarantees the file handle is closed, even on error
    with open(filename, 'r') as f:
        text = f.read().lower()
    # raw string avoids the invalid-escape warning that '\W' triggers;
    # collecting runs of \w is equivalent to splitting on \W and dropping ''
    return re.findall(r'\w+', text)

In [2]:
def get_features(samples):
    '''Return a word -> count dictionary pooled over all given samples.

    Each sample name is resolved to the file path + sample + '.txt'
    (path is a notebook-level global) and tokenized with get_tokens.
    The returned dictionary is ordered by descending count; words with
    the same count are ordered by ascending string comparison.
    '''
    pooled = []
    for name in samples:
        pooled.extend(get_tokens(path + name + '.txt'))
    # tally occurrences of every distinct word
    tally = {}
    for word in pooled:
        tally[word] = tally.get(word, 0) + 1
    # rank by (-count, word): most frequent first, ties broken by word
    ranked = list(tally.items())
    ranked.sort(key = lambda pair: (-pair[1], pair[0]))
    return dict(ranked)

In [3]:
import pandas as pd

def get_counts(features, samples):
    '''Return a DataFrame of raw word counts per sample.

    Rows are indexed by the words in features, with one column per sample
    name. A cell holds the number of times the word occurs in that sample
    (as counted by get_features), or 0 if the word does not occur.
    '''
    table = {}
    for name in samples:
        sample_counts = get_features([name])
        table[name] = [sample_counts.get(word, 0) for word in features]
    return pd.DataFrame(table, index = features)

In [4]:
def get_lengths(samples):
    '''Return a one-row DataFrame with the token count of each sample.

    Each sample name is resolved to the file path + sample + '.txt'
    (path is a notebook-level global). The single row is labelled 'words'
    and has one column per sample name.
    '''
    word_totals = {
        name: len(get_tokens(path + name + '.txt'))
        for name in samples
    }
    return pd.DataFrame(word_totals, index = ['words'])

In [5]:
import sys

# Compare one unknown sample against a set of known samples and display
# Burrows's Delta between the unknown sample and each known sample.
path = './corpus/a/' # directory prefix that get_features/get_lengths prepend to sample names
limit = 30 # 30 most frequent words (MFWs)
samples = ['Gratian1', 'dePen', 'Gratian2']
unknown = 'Gratian0'
# unknown = 'psAug' # alternative unknown sample, currently disabled
# word -> count dictionaries, ordered by descending frequency
samples_features = get_features(samples)
unknown_features = get_features([unknown])
# words among the samples' top `limit` that never occur in the unknown sample
missing_features = [word for word in list(samples_features.keys())[:limit] if word not in unknown_features]
print(missing_features, file = sys.stderr)
# keep only sample words that also occur in the unknown sample (frequency order
# is preserved), then take the first `limit` of them as the MFWs
features = [word for word in samples_features if word in unknown_features]
mfws = features[:limit]
print(mfws, file = sys.stderr)
# rows = MFWs, columns = unknown sample followed by the known samples
counts = get_counts(mfws, [unknown] + samples)
# single row ('words') holding the total token count of each column's sample
lengths = get_lengths([unknown] + samples)
# occurrences per 1,000 words; the 1 x n lengths array broadcasts across rows
frequencies = (counts / lengths.values) * 1000
# per-word mean and standard deviation computed over the known samples only
# (the unknown sample's column is excluded); std uses pandas' default ddof
means = frequencies[samples].mean(axis = 1).to_frame('mean')
standard_deviations = frequencies[samples].std(axis = 1).to_frame('std')
# z-score every column (unknown included) against the known-sample statistics
z_scores = (frequencies - means.values) / standard_deviations.values
test = z_scores[[unknown]]
corpus = z_scores[samples]
# absolute z-score difference between the unknown and each known sample, per word
differences = (test.values - corpus).abs()
# averaging the differences over all MFWs gives one Delta per known sample;
# the result is a one-row DataFrame labelled with the unknown sample's name
row = (differences.mean(axis = 0)).to_frame(unknown).transpose()
row

['sed', 'unde', 'enim', 'ait', 'ergo']
['in', 'non', 'et', 'est', 'de', 'quod', 'qui', 'ad', 'uel', 'ut', 'si', 'autem', 'cum', 'a', 'ex', 'sunt', 'que', 'etiam', 'uero', 'ab', 'quia', 'esse', 'item', 'per', 'nec', 'se', 'hoc', 'nisi', 'ita', 'illud']


Unnamed: 0,Gratian1,dePen,Gratian2
Gratian0,3.463527,3.477218,3.380142


In [6]:
# Leave-one-out comparison: each candidate in turn is treated as the unknown
# sample and Burrows's Delta is computed between it and every other candidate.
# The result is a square table (rows = held-out candidate, columns = compared
# candidate) whose diagonal is NaN because a candidate is never compared
# with itself.
path = './corpus/b/'

candidates = ['cases', 'laws', 'orders1', 'orders2', 'simony', 'procedure', 'other1', 'other2', 'monastic', 'other3', 'heresy', 'marriage', 'penance', 'second']
limit = 30 # 30 most frequent words (MFWs)
rows = [] # one single-row DataFrame per held-out candidate
for candidate in candidates:
    unknown = candidate
    samples = candidates[:] # copy, so removing the unknown leaves candidates intact
    samples.remove(unknown)
    features = get_features(samples)
    mfws = list(features.keys())[:limit]
    counts = get_counts(mfws, [unknown] + samples)
    lengths = get_lengths([unknown] + samples)
    # occurrences per 1,000 words
    frequencies = (counts / lengths.values) * 1000
    # mean and standard deviation are taken over the remaining samples only
    means = frequencies[samples].mean(axis = 1).to_frame('mean')
    standard_deviations = frequencies[samples].std(axis = 1).to_frame('std')
    z_scores = (frequencies - means.values) / standard_deviations.values
    test = z_scores[[unknown]]
    corpus = z_scores[samples]
    differences = (test.values - corpus).abs()
    row = (differences.mean(axis = 0)).to_frame(unknown).transpose()
    rows.append(row)
# DataFrame.append was removed in pandas 2.0, so the rows are concatenated
# once here (which also avoids re-copying the frame on every iteration).
# reindex restores the candidates column order and inserts the NaN diagonal.
deltas = pd.concat(rows).reindex(columns = candidates)
deltas


Unnamed: 0,cases,laws,orders1,orders2,simony,procedure,other1,other2,monastic,other3,heresy,marriage,penance,second
cases,,2.276541,1.924653,2.025227,1.963736,1.954481,1.57138,2.278196,1.762169,2.362842,1.871684,1.892278,1.858944,1.633416
laws,2.140962,,1.248994,1.502025,1.463297,1.314724,1.422318,1.4369,1.193099,1.434469,1.18754,1.192357,1.621796,1.2323
orders1,1.618391,1.094915,,1.122318,0.968477,0.884251,1.049854,1.110873,0.869308,1.23966,0.826694,1.012354,0.750538,0.777653
orders2,1.898201,1.52441,1.26862,,1.382037,1.683955,1.414852,1.687281,1.449174,1.620788,1.419841,1.452577,1.552287,1.31953
simony,1.66667,1.349079,0.977183,1.219516,,0.887776,1.130389,1.128661,1.041277,1.171096,0.590034,0.916648,0.905909,1.086345
procedure,1.618734,1.199074,0.892039,1.509545,0.878923,,1.07902,1.122308,0.820975,1.072558,0.656892,0.999328,0.881784,0.985186
other1,1.335318,1.300002,1.061945,1.272178,1.13831,1.075322,,1.279177,0.964928,1.305428,0.996004,1.085321,1.327212,0.815153
other2,1.9416,1.323324,1.091294,1.629055,1.138646,1.109048,1.296338,,0.797917,1.03458,1.059174,0.653951,0.863307,1.096104
monastic,1.455489,1.045089,0.855446,1.267636,1.011384,0.798639,0.93001,0.742869,,1.057799,0.760244,0.661069,0.799881,0.779872
other3,2.070457,1.338796,1.288958,1.514642,1.199653,1.105702,1.349715,0.950503,1.122944,,1.120868,0.712106,1.152123,1.306677
