In [None]:
# Imports

from cltkreaders.grc import GreekTesseraeCorpusReader
from pprint import pprint
import pickle

In [None]:
# Setup corpus: create the Greek Tesserae reader; `T` is the handle every later cell queries

T = GreekTesseraeCorpusReader()

In [None]:
# Get text: choose one file id from the corpus and ask the reader for its words

clouds = 'aristophanes.clouds.tess'
clouds_words = T.words(clouds)

In [None]:
# Find word in text: report whether the exact token 'κόσμος' occurs among the
# words; scanning stops at the first match.

if any(token == 'κόσμος' for token in clouds_words):
    print('Found one!')

In [None]:
# Take the first item yielded by doc_rows for this file and keep its
# (key, value) pairs as a list.
clouds_lines = list(next(T.doc_rows(clouds)).items())

In [None]:
# Preview the first five (citation, line) pairs
clouds_lines[:5]

In [None]:
# Show the first line of the text that contains 'κόσμος' (substring match), if any
first_match = next(
    ((ref, text_line) for ref, text_line in clouds_lines if 'κόσμος' in text_line),
    None,
)
if first_match is not None:
    ref, text_line = first_match
    print('-----')
    print(ref)
    print(text_line)
    print('-----')

In [None]:
# For each of the first five files, print the citation of every line
# that contains 'κόσμος'.
for fileid in T.fileids()[:5]:
    rows = next(T.doc_rows(fileid))
    matching_refs = [ref for ref, row_text in rows.items() if 'κόσμος' in row_text]
    for ref in matching_refs:
        print(ref)

In [None]:
from collections import defaultdict

# Number of lines containing 'κόσμος' per file, for the first five files only.
# A file gets an entry only when at least one of its lines matches.
kosmos_counts = defaultdict(int)

for fileid in T.fileids()[:5]:
    rows = next(T.doc_rows(fileid))
    n_hits = sum(1 for _ref, row_text in rows.items() if 'κόσμος' in row_text)
    if n_hits:
        kosmos_counts[fileid] += n_hits

pprint(kosmos_counts)

In [None]:
# Count, for every file in the corpus, the lines that contain 'κόσμος'.
# Files without a matching line never get an entry.
kosmos_counts = defaultdict(int)

for fileid in T.fileids():
    rows = next(T.doc_rows(fileid))
    for _ref, row_text in rows.items():
        if 'κόσμος' not in row_text:
            continue
        kosmos_counts[fileid] += 1

In [None]:
# Does this work have an entry (i.e. at least one matching line) in the tally?
'aristophanes.clouds.tess' in kosmos_counts

In [None]:
# Plotting libraries, plus seaborn's "colorblind" palette, which the charts below share
import seaborn as sns
import matplotlib.pyplot as plt
pal = sns.color_palette("colorblind")

In [None]:
# Bar chart of the raw counts: one bar per work, in the dictionary's own order
keys = list(kosmos_counts)
vals = [kosmos_counts[work] for work in keys]

plt.figure(figsize=(32, 8))
ax = sns.barplot(x=keys, y=vals, palette=pal)
ax.set_xlabel('Works')
ax.set_ylabel('Count')
ax.set_title('κόσμος raw counts in Greek Tesserae')
ax.set_xticklabels(labels=keys, rotation=90)
plt.show()

In [None]:
# Bar chart of the raw counts with works ordered from most to fewest matches.
kosmos_counts_sorted = dict(sorted(kosmos_counts.items(), key=lambda x: x[1], reverse=True))
# Take the labels from the sorted mapping itself, so that every tick label
# names the bar drawn at that position.
sorted_works = list(kosmos_counts_sorted.keys())
sorted_counts = list(kosmos_counts_sorted.values())

plt.figure(figsize=(32, 8))
ax = sns.barplot(x=sorted_works, y=sorted_counts, palette=pal);
plt.xlabel('Works')
plt.ylabel('Count')
plt.title('κόσμος raw counts in Greek Tesserae')
ax.set_xticklabels(labels=sorted_works, rotation=90);
plt.show()


In [None]:
# Bar chart of the ten works with the highest raw counts, highest first.
kosmos_counts_sorted = dict(sorted(kosmos_counts.items(), key=lambda x: x[1], reverse=True)[:10])
keys = list(kosmos_counts_sorted.keys())
vals = list(kosmos_counts_sorted.values())

plt.figure(figsize=(32, 8))
ax = sns.barplot(x=keys, y=vals, palette=pal);
plt.xlabel('Works')
plt.ylabel('Count')
plt.title('κόσμος raw counts in Greek Tesserae')
ax.set_xticklabels(labels=keys, rotation=45, horizontalalignment='right');
# range() excludes its stop value, so stop at max + 1 to keep a tick at the
# tallest bar when the maximum is even.
ax.set_yticks(range(0, max(vals) + 1, 2))
plt.show()

In [None]:
# Total word count per file, keyed by file id. The counts were computed once
# with the commented-out code below and cached on disk:
#
# word_counts = defaultdict(int)
#
# for file in T.fileids():
#     word_count = len(list(T.words(file)))
#     word_counts[file] = word_count
#
# with open('../data/word_counts.pickle', 'wb') as cache_file:
#     pickle.dump(word_counts, cache_file)

# NOTE: unpickling can execute arbitrary code; only load a cache file you
# generated yourself.
# The context manager closes the file handle once the cache has been read.
with open('../data/word_counts.pickle', 'rb') as cache_file:
    word_counts = pickle.load(cache_file)

In [None]:
# Preview the first five (file id, total word count) entries
pprint(list(word_counts.items())[:5])

In [None]:
# Preview the first five (file id, matching-line count) entries
pprint(list(kosmos_counts.items())[:5])

In [None]:
# Worked example of the normalization, using the first work in kosmos_counts.
for text, kosmos_count in kosmos_counts.items():
    print(text)
    print(f'κόσμος appears {kosmos_count} times...')
    # Look the total up by file id. kosmos_counts only has entries for works
    # with at least one match, so its positions do not line up with the
    # positions in word_counts.
    word_count = word_counts[text]
    print(f'...in {word_count} total words')
    norm_count = (kosmos_count / word_count) * 1000
    # '.02' with no type letter formats to two significant digits
    print(f'The normalized count is {norm_count:.02} per 1000 words')
    break  # one example is enough

In [None]:
# Matches per 1000 words for every work that has an entry in kosmos_counts.
kosmos_norm_counts = defaultdict(float)

for text, kosmos_count in kosmos_counts.items():
    # Pair each work with its own total by file id. Pairing by position would
    # be wrong because kosmos_counts omits works without a match, and it would
    # rebuild a list of all totals on every iteration.
    word_count = word_counts[text]
    norm_count = (kosmos_count / word_count) * 1000
    kosmos_norm_counts[text] = norm_count

In [None]:
# Preview the first five (file id, count per 1000 words) entries
list(kosmos_norm_counts.items())[:5]

In [None]:
# Bar chart of the ten works with the highest normalized counts, highest first
top_norm = sorted(kosmos_norm_counts.items(), key=lambda pair: pair[1], reverse=True)[:10]
kosmos_norm_counts_sorted = dict(top_norm)
keys = [work for work, _rate in top_norm]
vals = [rate for _work, rate in top_norm]

plt.figure(figsize=(32, 8))
ax = sns.barplot(x=keys, y=vals, palette=pal)
ax.set_xlabel('Works')
ax.set_ylabel('Count per 1000 words')
ax.set_title('κόσμος normalized counts in Greek Tesserae')
ax.set_xticklabels(labels=keys, rotation=45, horizontalalignment='right')
# The y ticks are left at matplotlib's defaults here.
plt.show()

In [None]:
# Raw count and total word count for a single work.
# Note: kosmos_counts is a defaultdict(int), so indexing a work that has no
# entry returns 0 and also adds that work to the dictionary.
text = 'new_testament.i_john.tess'
print(kosmos_counts[text])
print(word_counts[text])

In [None]:
# Work with the highest raw count (the earliest one if several tie),
# followed by its raw count and its total word count.
max_kosmos = max(kosmos_counts, key=kosmos_counts.get)
print(max_kosmos)
print(kosmos_counts[max_kosmos])
print(word_counts[max_kosmos])