## Euclidean distance (experimental)

In [1]:
# Directory prefix for the corpus: get_features() and get_lengths() build each
# sample's filename as path + sample + '.txt'.
path = './corpus/a/'

In [2]:
import re

def get_tokens(filename):
    '''Open a text file and return its list of lower-cased word tokens.

    Parameters:
        filename: path of the text file to read.

    Returns:
        list of str: the non-empty pieces obtained by splitting the
        lower-cased text on every non-word character, in document order.
        Punctuation and whitespace are discarded; underscores and digits
        count as word characters and are kept.
    '''
    # The context manager guarantees the handle is closed even if read() fails.
    with open(filename, 'r') as f:
        text = f.read().lower()
    # Raw string: avoids the invalid-escape warning of a plain '\W' literal.
    # Consecutive separators yield empty strings, which are filtered out.
    return [word for word in re.split(r'\W', text) if word != '']

In [3]:
def get_features(samples):
    '''Return a word -> count dictionary for the combined samples.

    Every sample name is resolved to path + sample + '.txt' and tokenized with
    get_tokens(). The returned dictionary is ordered by descending count;
    words with the same count are ordered alphabetically.
    '''
    tally = {}
    for sample in samples:
        for token in get_tokens(path + sample + '.txt'):
            tally[token] = tally.get(token, 0) + 1
    # Sort key: negated count first (highest frequency first), then the word
    # itself to break ties alphabetically.
    ranked = sorted(tally.items(), key = lambda pair: (-pair[1], pair[0]))
    return dict(ranked)

In [4]:
import pandas as pd

def get_counts(features, samples):
    '''Build a features x samples table of raw word counts.

    For each sample, the per-sample counts come from get_features([sample]);
    a feature that does not occur in a sample is counted as 0. Rows are
    indexed by `features`, columns are named after the samples.
    '''
    columns = {}
    for sample in samples:
        sample_counts = get_features([sample])
        columns[sample] = [sample_counts.get(feature, 0) for feature in features]
    return pd.DataFrame(columns, index = features)

In [5]:
def get_lengths(samples):
    '''Return a one-row DataFrame (index 'words') of token counts per sample.

    Each sample is read from path + sample + '.txt' and its length is the
    number of tokens returned by get_tokens().
    '''
    lengths = {
        sample: len(get_tokens(path + sample + '.txt'))
        for sample in samples
    }
    return pd.DataFrame(lengths, index = ['words'])

In [6]:
# Compute z-scores of the most frequent words for one unknown text against
# a set of comparison samples.
limit = 4 # 4 most frequent words (MFWs)
samples = ['Gratian1', 'dePen', 'Gratian2']
unknown = 'Gratian0'
# Features are ranked on the comparison samples only; the unknown text does
# not influence which words are selected.
features = get_features(samples)
mfws = list(features.keys())[:limit]
counts = get_counts(mfws, [unknown] + samples)
lengths = get_lengths([unknown] + samples)
# Normalize raw counts to occurrences per 1,000 words.
frequencies = (counts / lengths.values) * 1000
# Context manager: the file is closed even if to_csv() or write() raises.
with open('./CSVs/frequencies_t.csv', 'w') as csv:
    csv.write(frequencies.transpose().to_csv())
# Mean and standard deviation are taken over the comparison samples only
# (the unknown column is excluded), then applied to every column.
means = frequencies[samples].mean(axis = 1).to_frame('mean')
standard_deviations = frequencies[samples].std(axis = 1).to_frame('std')
z_scores = (frequencies - means.values) / standard_deviations.values
pd.options.display.float_format = '{:,.4f}'.format
z_scores

Unnamed: 0,Gratian0,Gratian1,dePen,Gratian2
in,-2.8702,-0.4342,-0.7095,1.1437
non,-6.5491,-0.0361,1.0176,-0.9814
et,-3.2375,-0.9786,1.0201,-0.0414
est,-3.5264,0.4179,0.7233,-1.1412


In [7]:
# z-scores of the unknown text, kept as a one-column DataFrame.
test = z_scores.loc[:, [unknown]]
test

Unnamed: 0,Gratian0
in,-2.8702
non,-6.5491
et,-3.2375
est,-3.5264


In [8]:
# z-scores of the comparison samples (one column per sample).
corpus = z_scores.loc[:, samples]
corpus

Unnamed: 0,Gratian1,dePen,Gratian2
in,-0.4342,-0.7095,1.1437
non,-0.0361,1.0176,-0.9814
et,-0.9786,1.0201,-0.0414
est,0.4179,0.7233,-1.1412


In [9]:
# Absolute z-score difference between the unknown text and each sample,
# per feature; the single unknown column is broadcast across all samples.
differences = (corpus - test.values).abs()
differences

Unnamed: 0,Gratian1,dePen,Gratian2
in,2.436,2.1606,4.0139
non,6.513,7.5667,5.5677
et,2.2589,4.2576,3.1961
est,3.9443,4.2497,2.3852


---

### Manhattan distance

In [10]:
# Average the absolute differences over the features for each sample.
# Note: this is the mean, not the sum, i.e. the Manhattan distance divided
# by the number of features.
manhattan_row = differences.mean(axis = 0).to_frame(unknown).T
manhattan_row

Unnamed: 0,Gratian1,dePen,Gratian2
Gratian0,3.788,4.5586,3.7907


### Euclidean distance

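Argamon's formula for the Euclidean distance between documents $D$ and $D'$, where $f_i(\cdot)$ is the relative frequency of the $i$-th most frequent word and $\sigma_i$ is its standard deviation across the comparison samples: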

$\sqrt{\sum_{i = 1}^n\frac{1}{\sigma_i^2}(f_i(D) - f_i(D'))^2}$

In [11]:
import numpy as np

# Square root of the summed squared z-score differences, per sample.
euclidean_row = np.sqrt((differences ** 2).sum(axis = 0)).to_frame(unknown).T
euclidean_row

Unnamed: 0,Gratian1,dePen,Gratian2
Gratian0,8.3074,9.9051,7.9382


In [12]:
# Equivalent computation starting from the raw frequencies: the squared
# frequency differences are divided by the squared standard deviations,
# which matches squaring the z-score differences.
euclidean_distance = (
    (frequencies[samples] - frequencies[[unknown]].values) ** 2
    / standard_deviations.values ** 2
)
euclidean_row = np.sqrt(euclidean_distance.sum(axis = 0)).to_frame(unknown).T
euclidean_row

Unnamed: 0,Gratian1,dePen,Gratian2
Gratian0,8.3074,9.9051,7.9382


### Manhattan distance

In [13]:
# Same z-score / distance pipeline as above, with 'psAug' as the unknown text.
limit = 4 # 4 most frequent words (MFWs)
samples = ['Gratian1', 'dePen', 'Gratian2']
unknown = 'psAug'

# Rank words on the comparison samples and keep the top `limit`.
features = get_features(samples)
mfws = list(features)[:limit]

# Raw counts and text lengths, unknown text first.
counts = get_counts(mfws, [unknown] + samples)
lengths = get_lengths([unknown] + samples)
frequencies = counts / lengths.values * 1000 # occurrences per 1,000 words

# Mean and standard deviation come from the comparison samples only.
means = frequencies[samples].mean(axis = 1).to_frame(name = 'mean')
standard_deviations = frequencies[samples].std(axis = 1).to_frame(name = 'std')
z_scores = (frequencies - means.values) / standard_deviations.values

test = z_scores[[unknown]]
corpus = z_scores[samples]
differences = (corpus - test.values).abs()

# Mean absolute difference and root of summed squares, per sample.
manhattan_row = differences.mean(axis = 0).to_frame(unknown).T
euclidean_row = np.sqrt((differences ** 2).sum(axis = 0)).to_frame(unknown).T
manhattan_row

Unnamed: 0,Gratian1,dePen,Gratian2
psAug,2.6456,1.7373,3.4318


### Euclidean distance

In [14]:
# Display the Euclidean row computed in the preceding cell for the current `unknown`.
euclidean_row

Unnamed: 0,Gratian1,dePen,Gratian2
psAug,8.3227,6.1961,8.154


### Manhattan distance

In [15]:
path = './corpus/a/'

# author candidates, e.g. Gratian 1, the Master of Penance, Gratian 2, etc.
candidates = ['Gratian0', 'Gratian1', 'dePen', 'Gratian2']
limit = 4 # 4 most frequent words (MFWs)
# Rows are collected in lists and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
manhattan_rows = []
euclidean_rows = []
# Leave-one-out: each candidate in turn is treated as the unknown text and
# compared against all remaining candidates.
for candidate in candidates:
    unknown = candidate
    samples = candidates[:]
    samples.remove(unknown)
    features = get_features(samples)
    mfws = list(features.keys())[:limit]
    counts = get_counts(mfws, [unknown] + samples)
    lengths = get_lengths([unknown] + samples)
    frequencies = (counts / lengths.values) * 1000 # occurrences per 1,000 words
    # Mean and standard deviation come from the comparison samples only.
    means = frequencies[samples].mean(axis = 1).to_frame('mean')
    standard_deviations = frequencies[samples].std(axis = 1).to_frame('std')
    z_scores = (frequencies - means.values) / standard_deviations.values
    test = z_scores[[unknown]]
    corpus = z_scores[samples]
    differences = (test.values - corpus).abs()
    manhattan_row = (differences.mean(axis = 0)).to_frame(unknown).transpose()
    euclidean_row = np.sqrt(np.square(differences).sum(axis = 0)).to_frame(unknown).transpose()
    manhattan_rows.append(manhattan_row)
    euclidean_rows.append(euclidean_row)
# Each row lacks its own candidate's column; reindexing restores the full
# `candidates` column order and leaves NaN on the diagonal.
manhattan_deltas = pd.concat(manhattan_rows).reindex(columns = candidates)
euclidean_deltas = pd.concat(euclidean_rows).reindex(columns = candidates)
manhattan_deltas

Unnamed: 0,Gratian0,Gratian1,dePen,Gratian2
Gratian0,,3.788,4.5586,3.7907
Gratian1,1.4361,,0.3628,0.5453
dePen,1.9873,0.4515,,0.7673
Gratian2,1.7185,0.6278,0.7905,


### Euclidean distance

In [16]:
# Display the Euclidean table built by the leave-one-out loop in the preceding cell.
euclidean_deltas

Unnamed: 0,Gratian0,Gratian1,dePen,Gratian2
Gratian0,,8.3074,9.9051,7.9382
Gratian1,2.949,,0.9617,1.1788
dePen,4.1299,1.2696,,1.5687
Gratian2,3.7632,1.4416,1.742,


### Manhattan distance

In [17]:
path = './corpus/a/'

# author candidates, e.g. pseudo-Augustine, Gratian 1, the Master of Penance, Gratian 2, etc.
candidates = ['psAug', 'Gratian1', 'dePen', 'Gratian2']
limit = 4 # 4 most frequent words (MFWs)
# Rows are collected in lists and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
manhattan_rows = []
euclidean_rows = []
# Leave-one-out: each candidate in turn is treated as the unknown text and
# compared against all remaining candidates.
for candidate in candidates:
    unknown = candidate
    samples = candidates[:]
    samples.remove(unknown)
    features = get_features(samples)
    mfws = list(features.keys())[:limit]
    counts = get_counts(mfws, [unknown] + samples)
    lengths = get_lengths([unknown] + samples)
    frequencies = (counts / lengths.values) * 1000 # occurrences per 1,000 words
    # Mean and standard deviation come from the comparison samples only.
    means = frequencies[samples].mean(axis = 1).to_frame('mean')
    standard_deviations = frequencies[samples].std(axis = 1).to_frame('std')
    z_scores = (frequencies - means.values) / standard_deviations.values
    test = z_scores[[unknown]]
    corpus = z_scores[samples]
    differences = (test.values - corpus).abs()
    manhattan_row = (differences.mean(axis = 0)).to_frame(unknown).transpose()
    euclidean_row = np.sqrt(np.square(differences).sum(axis = 0)).to_frame(unknown).transpose()
    manhattan_rows.append(manhattan_row)
    euclidean_rows.append(euclidean_row)
# Each row lacks its own candidate's column; reindexing restores the full
# `candidates` column order and leaves NaN on the diagonal.
manhattan_deltas = pd.concat(manhattan_rows).reindex(columns = candidates)
euclidean_deltas = pd.concat(euclidean_rows).reindex(columns = candidates)
manhattan_deltas

Unnamed: 0,psAug,Gratian1,dePen,Gratian2
psAug,,2.6456,1.7373,3.4318
Gratian1,1.0228,,0.4653,0.9325
dePen,0.5178,0.4733,,1.3453
Gratian2,5.2005,3.3574,4.2857,


### Euclidean distance

In [18]:
# Display the Euclidean table built by the leave-one-out loop in the preceding cell.
euclidean_deltas

Unnamed: 0,psAug,Gratian1,dePen,Gratian2
psAug,,8.3227,6.1961,8.154
Gratian1,2.4423,,1.0498,2.0896
dePen,1.4376,1.0847,,2.9794
Gratian2,11.8009,8.5068,10.3475,


In [19]:
path = './corpus/b/'

candidates = ['cases', 'laws', 'orders1', 'orders2', 'simony', 'procedure', 'other1', 'other2', 'monastic', 'other3', 'heresy', 'marriage', 'penance', 'second']
limit = 30 # 30 most frequent words (MFWs)
# Rows are collected in a list and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
rows = []
# Leave-one-out: each candidate in turn is treated as the unknown text and
# compared against all remaining candidates.
for candidate in candidates:
    unknown = candidate
    samples = candidates[:]
    samples.remove(unknown)
    features = get_features(samples)
    mfws = list(features.keys())[:limit]
    counts = get_counts(mfws, [unknown] + samples)
    lengths = get_lengths([unknown] + samples)
    frequencies = (counts / lengths.values) * 1000 # occurrences per 1,000 words
    # Mean and standard deviation come from the comparison samples only.
    means = frequencies[samples].mean(axis = 1).to_frame('mean')
    standard_deviations = frequencies[samples].std(axis = 1).to_frame('std')
    z_scores = (frequencies - means.values) / standard_deviations.values
    test = z_scores[[unknown]]
    corpus = z_scores[samples]
    differences = (test.values - corpus).abs()
    # Euclidean distance between the unknown's z-scores and each sample's.
    row = np.sqrt(np.square(differences).sum(axis = 0)).to_frame(unknown).transpose()
    rows.append(row)
# Each row lacks its own candidate's column; reindexing restores the full
# `candidates` column order and leaves NaN on the diagonal.
deltas = pd.concat(rows).reindex(columns = candidates)
deltas


Unnamed: 0,cases,laws,orders1,orders2,simony,procedure,other1,other2,monastic,other3,heresy,marriage,penance,second
cases,,14.6019,13.2056,14.4962,13.4154,13.5849,10.7948,14.7518,11.8344,15.6256,12.2294,13.0306,13.1382,11.0728
laws,13.9939,,9.6632,10.7901,10.0657,9.2939,10.4763,9.9209,8.6599,10.3204,8.9439,9.4009,11.6705,8.8951
orders1,10.7041,7.8557,,7.6818,6.6466,6.126,6.8093,7.2477,6.1265,8.3502,5.7058,6.6936,5.3351,4.8999
orders2,13.784,11.0253,9.3599,,10.2015,11.2141,10.2688,11.0625,10.2378,11.0477,11.1951,10.0888,10.6071,9.1915
simony,10.9824,8.8534,6.6313,8.5617,,5.9158,7.455,7.7858,6.8106,7.687,4.0542,6.2257,6.6891,7.4424
procedure,10.3231,8.182,6.1729,9.4305,5.7996,,7.5749,8.3385,5.9809,7.5729,4.7521,6.56,5.8297,6.8002
other1,8.7694,9.0749,6.8342,8.423,7.4598,7.4129,,8.8537,7.1084,8.4529,6.981,7.0376,8.2278,5.4962
other2,12.2993,8.991,7.3836,10.9217,8.4732,8.5793,9.5622,,7.2651,7.6606,7.4187,5.0866,5.889,7.9417
monastic,9.2623,6.9956,5.9968,8.4198,6.5372,5.7905,6.787,6.1113,,7.1754,5.011,4.4376,5.17,5.4533
other3,13.1289,9.0575,8.7168,9.7941,7.8804,7.7782,8.9226,7.1316,7.7824,,7.5955,5.4524,7.8675,8.6804
