In [None]:
# Imports

from cltkreaders.grc import GreekTesseraeCorpusReader
from collections import defaultdict
from pprint import pprint
import pickle

In [None]:
# Setup corpus
# Instantiate the Greek Tesserae corpus reader with its default arguments;
# later cells read file ids, words and rows through this shared `T` object.

T = GreekTesseraeCorpusReader()

In [None]:
# Get text
# `clouds` is the corpus file id used for both the word-level and the
# line-level lookups in the cells that follow.
# NOTE(review): if T.words returns a one-shot iterator, `clouds_words` is
# consumed by the first loop over it -- confirm before reusing it.

clouds = 'aristophanes.clouds.tess'
clouds_words = T.words(clouds)

In [None]:
# Find word in text

search_word = 'κόσμος'

# any() stops scanning at the first token that equals the search word, so the
# message is printed at most once.
if any(token == search_word for token in clouds_words):
    print('Found one!')

In [None]:
# Get lines

# Take the first mapping produced by doc_rows and flatten it into a list of
# (citation, line) pairs so it can be sliced and re-scanned.
clouds_rows = next(T.doc_rows(clouds))
clouds_lines = list(clouds_rows.items())
clouds_lines[:5]

In [None]:
# Search text

# Fetch only the first line that contains the search word (None if no line does).
first_match = next(
    ((citation, line) for citation, line in clouds_lines if search_word in line),
    None,
)

if first_match is not None:
    citation, line = first_match
    print('-----', citation, line, '-----', sep='\n')

In [None]:
# Search "all" texts

# Only the first five file ids are scanned here.
for file in T.fileids()[:5]:
    rows = next(T.doc_rows(file))
    matching_citations = [citation for citation, line in rows.items() if search_word in line]
    for citation in matching_citations:
        print(citation)

In [None]:
# Collect matches in "all" texts

# Number of lines containing the search word, keyed by file id.
# (defaultdict is already imported in the notebook's first cell.)
kosmos_counts = defaultdict(int)

for file in T.fileids()[:5]:
    rows = next(T.doc_rows(file))
    # Adding the per-file total in one step registers every scanned file,
    # including files with no matching lines or no lines at all.
    kosmos_counts[file] += sum(search_word in line for line in rows.values())

pprint(kosmos_counts)

In [None]:
# Collect matches in all texts

# Number of lines containing the search word, keyed by file id.
kosmos_counts = defaultdict(int)

for file in T.fileids():
    rows = next(T.doc_rows(file))
    # Adding the per-file total in one step registers every file, including
    # files with no matching lines or no lines at all.
    kosmos_counts[file] += sum(search_word in line for line in rows.values())

In [None]:
# Check assumptions

# Membership on a dict tests its keys, so no explicit .keys() call is needed.
'aristophanes.clouds.tess' in kosmos_counts

In [None]:
# Imports; for plotting

import seaborn as sns
import matplotlib.pyplot as plt
# Shared palette handed to each barplot call in the cells below.
pal = sns.color_palette("colorblind")

In [None]:
# Make barplot of counts

# Keep only works with at least one match. NOTE: this rebinds kosmos_counts to
# a plain dict that no longer contains the zero-count entries.
kosmos_counts = {work: count for work, count in kosmos_counts.items() if count > 0}

keys = list(kosmos_counts)
vals = list(kosmos_counts.values())

fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title('κόσμος raw counts in Greek Tesserae', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18)
plt.show()


In [None]:
# Make barplot, descending

# Rank the works from highest to lowest count before plotting.
ranked_counts = sorted(kosmos_counts.items(), key=lambda pair: pair[1], reverse=True)
kosmos_counts_sorted = dict(ranked_counts)

keys = [work for work, _ in ranked_counts]
vals = [count for _, count in ranked_counts]

fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title('κόσμος raw counts in Greek Tesserae', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18)
plt.show()



In [None]:
# Get detail of barplot, only left 10

# Rank all works by count, then keep the ten highest for a readable close-up.
ranked_counts = sorted(kosmos_counts.items(), key=lambda pair: pair[1], reverse=True)
kosmos_counts_sorted_left = dict(ranked_counts[:10])

keys = list(kosmos_counts_sorted_left)
vals = list(kosmos_counts_sorted_left.values())

fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title('κόσμος raw counts in Greek Tesserae', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18)
plt.show()

In [None]:
# Get counts again (i.e. replace 0 values)

# An earlier plotting cell dropped zero-count works from kosmos_counts, so the
# full table (one entry per file id) is rebuilt here for normalization.
kosmos_counts = defaultdict(int)

for file in T.fileids():
    rows = next(T.doc_rows(file))
    # Adding the per-file total in one step registers every file, including
    # files with no matching lines or no lines at all.
    kosmos_counts[file] += sum(search_word in line for line in rows.values())

In [None]:
# Get word counts for all texts
# MB: Cached; rebuilding takes about ~1 min.

WORD_COUNTS_PATH = '../data/greek_word_counts.pickle'

try:
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # was produced by this notebook or another trusted source.
    with open(WORD_COUNTS_PATH, 'rb') as cache_file:
        word_counts = pickle.load(cache_file)
except FileNotFoundError:
    # No cache yet: count the words of every text, then save for next time.
    word_counts = defaultdict(int)
    for file in T.fileids():
        word_counts[file] = len(list(T.words(file)))
    with open(WORD_COUNTS_PATH, 'wb') as cache_file:
        pickle.dump(word_counts, cache_file)


In [None]:
# Compare lists

# Show the first five entries of each table, separated by a blank line.
word_count_sample = list(word_counts.items())[:5]
kosmos_count_sample = list(kosmos_counts.items())[:5]

pprint(word_count_sample)
print()
pprint(kosmos_count_sample)


In [None]:
# Workflow for normalizing

# Walk through the calculation for the first text only (note the break).
for text, kosmos_count in kosmos_counts.items():
    print(text)
    print(f'κόσμος appears {kosmos_count} times...')
    # Look the total up by file id rather than by position, so the two tables
    # cannot silently fall out of step if their ordering differs.
    word_count = word_counts[text]
    print(f'...in {word_count} total words')
    norm_count = (kosmos_count / word_count) * 1000
    # Fixed-point with two decimals; a bare '.02' would mean two significant
    # digits and switch to exponent notation for larger values.
    print(f'The normalized count is {norm_count:.2f} per 1000 words')
    break

In [None]:
# Normalize counts

# Matches per 1000 words, keyed by file id.
kosmos_norm_counts = defaultdict(float)

for text, kosmos_count in kosmos_counts.items():
    # Pair the two tables by file id, not by position: positional pairing gives
    # wrong ratios whenever the cached word counts are ordered differently.
    if text not in word_counts:
        raise KeyError(f'{text} has no entry in word_counts; rebuild the word-count cache')
    word_count = word_counts[text]
    # A text with no words cannot contain the search word; avoid dividing by zero.
    kosmos_norm_counts[text] = (kosmos_count / word_count) * 1000 if word_count else 0.0

list(kosmos_norm_counts.items())[:5]

In [None]:
# Make bar plot of normalized data

# Number of highest-ranked works to show; the title is derived from it so the
# two cannot disagree.
top_n = 10

ranked_norm_counts = sorted(kosmos_norm_counts.items(), key=lambda pair: pair[1], reverse=True)
kosmos_norm_counts_sorted = dict(ranked_norm_counts[:top_n])

keys = list(kosmos_norm_counts_sorted.keys())
vals = list(kosmos_norm_counts_sorted.values())

plt.figure(figsize=(32, 8))
ax = sns.barplot(x=keys, y=vals, palette=pal)
plt.title(f'κόσμος normalized counts in Greek Tesserae (top {top_n})', fontsize=36)
plt.xlabel('Works', fontsize=24)
# The plotted values are rates (matches per 1000 words), not raw counts.
plt.ylabel('Count per 1000 words', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18)
plt.show()

In [None]:
# Show example

# Print the match count, then the total word count, for one text.
text = 'theocritus.idylls.part.18.tess'
for table in (kosmos_counts, word_counts):
    print(table[text])

In [None]:
# Show example

# Print the match count, then the total word count, for one text.
text = 'new_testament.i_john.tess'
for table in (kosmos_counts, word_counts):
    print(table[text])

In [None]:
# Show example

# Find the work with the highest raw count. Step by step:
#   1. iterating over a dict yields its keys (the file ids);
#   2. key=kosmos_counts.get ranks each file id by its stored count;
#   3. max() returns the first file id whose count is the largest.
max_kosmos = max(kosmos_counts, key=kosmos_counts.get)

print(max_kosmos)
print(kosmos_counts[max_kosmos])
print(word_counts[max_kosmos])