In [None]:
# Imports

from cltkreaders.grc import GreekTesseraeCorpusReader
from collections import defaultdict
from pprint import pprint
import pickle

# Imports; for plotting

import seaborn as sns
import matplotlib.pyplot as plt
# seaborn's "colorblind" palette; handed to sns.barplot as the bar colors in
# the plotting cell further down
pal = sns.color_palette("colorblind")

In [None]:
# Setup corpus: instantiate the reader, then report how many files it exposes
# and show a small sample of their ids

T = GreekTesseraeCorpusReader()

print(
    f'There are {len(T.fileids())} files in this corpus.\n',
    f'Here are the first five (5) files: {T.fileids()[:5]}',
    sep='\n',
)

In [None]:
# Collect matches in all texts
#
# For every file, count the rows (citation -> text) whose text contains the
# search term, and store the count under the file id.
#
# NOTE(review): this is a plain substring test, so it only finds this exact
# spelling of the word; presumably other inflected forms and differently
# normalized Unicode (e.g. tonos vs. oxia accents) are not matched -- confirm
# that this is the intended scope.

kosmos_counts = defaultdict(int)

for file in T.fileids():
    # Only the first item yielded by doc_rows is read. Falling back to an
    # empty mapping means a file that yields nothing is counted as 0 instead
    # of aborting the whole cell with StopIteration.
    rows = next(T.doc_rows(file), {})
    # Plain assignment guarantees that every file gets an entry (0 when there
    # are no rows or no matches), so the number and order of entries always
    # follow T.fileids().
    kosmos_counts[file] = sum('κόσμος' in row for row in rows.values())

print(f'The first five (5) kosmos counts are {list(kosmos_counts.values())[:5]}.')

In [None]:
# Get word counts for all texts
#
# SECURITY NOTE: unpickling can execute arbitrary code. Only load this file if
# it was produced by a trusted source (e.g. generated locally by this project).

# The context manager closes the file handle as soon as loading is done; the
# bare open(...) passed straight to pickle.load left it open.
with open('../data/greek_word_counts.pickle', 'rb') as word_counts_file:
    word_counts = pickle.load(word_counts_file)

print(f'The first five (5) word counts are {list(word_counts.values())[:5]}.')

In [None]:
# Normalize counts
#
# Convert each raw match count into a rate per 1000 words of its text.
#
# NOTE(review): match counts and word counts are paired purely by position
# (i-th entry of kosmos_counts with i-th value of word_counts). This assumes
# both mappings list the texts in the same order -- TODO confirm; if
# word_counts is keyed by file id, a lookup by key would be safer.

kosmos_norm_counts = defaultdict(float)

# Materialize the values once instead of rebuilding the list on every
# iteration of the loop.
word_count_values = list(word_counts.values())

for i, (text, kosmos_count) in enumerate(kosmos_counts.items()):
    word_count = word_count_values[i]
    # A text with a word count of 0 cannot contain a match; report a rate of
    # 0.0 rather than failing with ZeroDivisionError.
    if word_count:
        kosmos_norm_counts[text] = (kosmos_count / word_count) * 1000
    else:
        kosmos_norm_counts[text] = 0.0

list(kosmos_norm_counts.items())[:5]

In [None]:
# Make bar plot of normalized data

# Keep only the ten texts with the highest normalized count, highest first
top_ten = sorted(kosmos_norm_counts.items(), key=lambda pair: pair[1], reverse=True)[:10]
kosmos_norm_counts_sorted = dict(top_ten)
keys = list(kosmos_norm_counts_sorted)
vals = list(kosmos_norm_counts_sorted.values())

# One wide figure; every label and tick is set on the Axes object itself
fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title('κόσμος normalized counts in Greek Tesserae', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count per 1000 words', fontsize=24)

# Slant the file names so that they do not overlap
ax.set_xticklabels(labels=keys, rotation=45, horizontalalignment='right', fontsize=18)
plt.show()