In [None]:
# Imports

from cltkreaders.lat import LatinTesseraeCorpusReader
from collections import defaultdict
from pprint import pprint
import pickle

# Imports; for plotting

import seaborn as sns
import matplotlib.pyplot as plt
# seaborn's "colorblind" palette; passed as `palette=` to the bar plot at the end of the notebook.
pal = sns.color_palette("colorblind")

In [None]:
# Setup corpus

# Reader over the Latin Tesserae corpus; every later cell iterates its file ids.
T = LatinTesseraeCorpusReader()

# Fetch the file ids once and reuse them for both summary lines.
corpus_fileids = T.fileids()

print(f'There are {len(corpus_fileids)} files in this corpus.\n')
print(f'Here are the first five (5) files: {corpus_fileids[:5]}')

In [None]:
# Collect matches in all texts

# Number of lines containing 'orbis', keyed by file id (in T.fileids() order).
orbis_counts = defaultdict(int)

for file in T.fileids():
    # Only the first object yielded by doc_rows is consumed; it is used as a
    # mapping of citation -> line.
    lines = next(T.doc_rows(file))
    # A single `+=` per file creates the key even when the mapping is empty, so
    # every file id gets an entry and the positional pairing with word_counts
    # in the normalization cell cannot silently shift.
    # NOTE(review): `in` is a substring test if `line` is a str, so this counts
    # lines (not tokens) and also matches 'orbis' inside longer words — confirm.
    orbis_counts[file] += sum('orbis' in line for _citation, line in lines.items())

# print, not pprint: pprint on a str emits its quoted (and possibly wrapped) repr.
print(f'The first five (5) files are {T.fileids()[:5]}.')
print()
print(f'The first five (5) orbis counts are {list(orbis_counts.values())[:5]}.')

In [None]:
# Collect matches in all texts

# Number of lines containing 'et', keyed by file id (in T.fileids() order).
et_counts = defaultdict(int)

for file in T.fileids():
    # Only the first object yielded by doc_rows is consumed; it is used as a
    # mapping of citation -> line.
    lines = next(T.doc_rows(file))
    # A single `+=` per file creates the key even when the mapping is empty, so
    # every file id gets an entry regardless of how many rows it has.
    # NOTE(review): `in` is a substring test if `line` is a str, so this counts
    # lines (not tokens) and also matches 'et' inside longer words such as
    # 'habet' — confirm that is the intended measure.
    et_counts[file] += sum('et' in line for _citation, line in lines.items())

print(f'The first five (5) et counts are {list(et_counts.values())[:5]}.')

In [None]:
# Get word counts for all texts

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load pickle files that you created or otherwise trust.
# The `with` block closes the file handle as soon as loading finishes.
with open('../data/latin_word_counts.pickle', 'rb') as pickle_file:
    word_counts = pickle.load(pickle_file)

print(f'The first five (5) word counts are {list(word_counts.values())[:5]}.')

In [None]:
# Normalize counts

# 'orbis' line counts scaled to a rate per 1000 words, keyed by file id.
orbis_norm_counts = defaultdict(float)

# Build the list of word counts once; rebuilding it inside the loop made the
# cell quadratic in the number of files.
# NOTE(review): the pairing below is positional — it assumes word_counts is
# ordered the same way as orbis_counts (i.e. T.fileids() order). Confirm
# against how the pickle was produced.
word_count_values = list(word_counts.values())

for i, (text, orbis_count) in enumerate(orbis_counts.items()):
    word_count = word_count_values[i]
    norm_count = (orbis_count / word_count) * 1000
    orbis_norm_counts[text] = norm_count

# Last expression of the cell: preview the first five (file id, rate) pairs.
list(orbis_norm_counts.items())[:5]

In [None]:
# Make bar plot of normalized data

# Rank works by normalized count, highest first, and keep the top ten.
ranked_pairs = sorted(orbis_norm_counts.items(), key=lambda pair: pair[1], reverse=True)
orbis_norm_counts_sorted = dict(ranked_pairs[:10])
keys = list(orbis_norm_counts_sorted.keys())
vals = list(orbis_norm_counts_sorted.values())

# Explicit figure/axes pair; all labelling below goes through `ax`.
fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)

ax.set_title('orbis normalized counts in Latin Tesserae', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count per 1000 words', fontsize=24)

# File ids are long, so slant the tick labels and right-align them under each bar.
ax.set_xticklabels(labels=keys, rotation=45, horizontalalignment='right', fontsize=18)

plt.show()