In [None]:
# Imports

from cltkreaders.lat import LatinTesseraeCorpusReader
import string
from collections import defaultdict
from pprint import pprint
import pickle

# Imports; for plotting

import seaborn as sns
import matplotlib.pyplot as plt
# Colorblind-friendly seaborn palette, passed as `palette=` to the bar plots below
pal = sns.color_palette("colorblind")

In [None]:
# Set up the corpus reader used by every cell below (file ids, rows, texts)
# NOTE(review): presumably reads the CLTK Latin Tesserae corpus from its
# default install location — confirm the corpus is downloaded before running.

T = LatinTesseraeCorpusReader()

In [None]:
# Set constant: the word form to search for.
# Matching is exact token equality (no lemmatization), and the preprocessed
# search compares against lower-cased tokens, so keep this value lower case.

TERM = 'orbis'

In [None]:
# Naive first pass: exact-token match on the raw text (no case folding, no
# punctuation stripping).
# NOTE: both `break`s are deliberate; only the first file is inspected and the
# search stops at its first matching line, which is pretty-printed as a sample.

term_counts = defaultdict(int)

for file in T.fileids():
    lines = list(next(T.doc_rows(file)).items())
    for citation, line in lines:
        tokens = line.split()
        if TERM not in tokens:
            continue
        pprint(tokens)
        term_counts[file] += 1
        break
    break

In [None]:
# Why raw matching falls short; both lines print False.
# Substring test is case-sensitive: lower-case 'et' does not match 'Et'.
print('et' in "Et tu, Brute.")
# Whitespace splitting leaves punctuation attached: the token is 'tu,', not 'tu'.
print('tu' in "Et tu, Brute".split())

In [None]:
def preprocess(text):
    """Normalize a string for token matching.

    Every character in ``string.punctuation`` (ASCII punctuation) is replaced
    by a single space and the result is lower-cased. Whitespace is not
    collapsed, so callers are expected to ``split()`` the result.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Single-pass translation table: each punctuation character -> one space
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return text.translate(punct_to_space).lower()

In [None]:
# Count occurrences of TERM in every text, after preprocessing
#
# Each row is normalized with preprocess() (punctuation -> space, lower case)
# and split on whitespace; every token equal to TERM is counted, so a row that
# contains the term twice contributes 2.

term_counts = defaultdict(int)

for file in T.fileids():
    # Register every file up front so works with no match (or no rows at all)
    # still appear with an explicit 0 in the later tables and plots.
    term_counts[file] = 0

    # NOTE(review): only the first mapping yielded by doc_rows() is read —
    # presumably there is one per file; confirm against the reader's API.
    rows = next(T.doc_rows(file), {})
    for citation, line in rows.items():
        tokens = preprocess(line).split()
        term_counts[file] += tokens.count(TERM)

In [None]:
# Show sample: the first five (file id, count) pairs in insertion order

print(f'The first five {TERM} counts are:\n{list(term_counts.items())[:5]}')

In [None]:
# Bar plot of the raw counts: one bar per work, in term_counts order

keys = list(term_counts)
vals = list(term_counts.values())

# Explicit figure/axes pair; all labelling goes through the Axes object
fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title(f'"{TERM}" raw counts in Latin Tesserae', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=90, fontsize=12)
plt.show()

In [None]:
# Detail of the raw-count bar plot: only the 50 works with the highest counts
# (the left-most bars once the data is sorted in descending order)

ranked_counts = sorted(term_counts.items(), key=lambda pair: pair[1], reverse=True)
term_counts_sorted_left = dict(ranked_counts[:50])
keys = list(term_counts_sorted_left.keys())
vals = list(term_counts_sorted_left.values())

plt.figure(figsize=(32, 8))
ax = sns.barplot(x=keys, y=vals, palette=pal);
# Title names the subset so this figure is not mistaken for the full-corpus plot
plt.title(f'"{TERM}" raw counts in Latin Tesserae (top 50)',fontsize=36)
plt.xlabel('Works', fontsize=24)
plt.ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=45, horizontalalignment='right', fontsize=18);
plt.show()

In [None]:
# Get word counts for all texts
# MB: Counting takes about ~2-3 min. to run, so the result is cached on disk.
#
# The cache is loaded when present and rebuilt (then saved) when missing, so
# the notebook still runs from a fresh checkout.
# NOTE: unpickling can execute arbitrary code; only load a cache file that was
# produced by this notebook.

word_counts_path = '../data/latin_word_counts.pickle'

try:
    # `with` closes the file handle even if unpickling fails
    with open(word_counts_path, 'rb') as cache_file:
        word_counts = pickle.load(cache_file)
except FileNotFoundError:
    # Cache miss: count the words of every file, keyed by file id
    word_counts = defaultdict(int)
    for file in T.fileids():
        word_counts[file] = sum(1 for _ in T.words(file))

    with open(word_counts_path, 'wb') as cache_file:
        pickle.dump(word_counts, cache_file)


In [None]:
# Compare lists: eyeball the first five entries of each dict to see whether
# both list the same files in the same order

pprint(list(word_counts.items())[:5])
print()
pprint(list(term_counts.items())[:5])

In [None]:
# Workflow for normalizing, shown step by step for the first text only

for text, term_count in term_counts.items():
    print(text)
    print(f'{TERM} appears {term_count} times...')
    # Look the total up by file id rather than by position: the two dicts are
    # only guaranteed to describe the same text when matched on their key.
    # .get() avoids inserting a key into word_counts if it is a defaultdict.
    word_count = word_counts.get(text, 0)
    print(f'...in {word_count} total words')
    # A text with no words has no occurrences; report 0 instead of dividing by 0
    norm_count = (term_count / word_count) * 1000 if word_count else 0.0
    print(f'The normalized count is {round(norm_count, 3)} per 1000 words')
    break  # one worked example is enough

In [None]:
# Normalize counts: occurrences of TERM per 1000 words, keyed by file id

term_norm_counts = defaultdict(float)

for text, term_count in term_counts.items():
    # Match the word total on the file id instead of on list position, which
    # mispairs texts whenever the two dicts differ in order or length (and
    # rebuilt the whole values list on every iteration).
    # .get() avoids inserting a key into word_counts if it is a defaultdict.
    word_count = word_counts.get(text, 0)
    if word_count:
        term_norm_counts[text] = (term_count / word_count) * 1000
    else:
        # Empty or uncounted text: record 0.0 rather than dividing by zero
        term_norm_counts[text] = 0.0

In [None]:
# Bar plot of the normalized data: the 50 works with the highest rate

ranked_norm = sorted(term_norm_counts.items(), key=lambda pair: pair[1], reverse=True)
term_norm_counts_sorted = dict(ranked_norm[:50])
keys = list(term_norm_counts_sorted)
vals = list(term_norm_counts_sorted.values())

# Explicit figure/axes pair; all labelling goes through the Axes object
fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title(f'"{TERM}" normalized counts in Latin Tesserae (top 50)', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18)
plt.show()

In [None]:
# Check! Print the text of one work and recompute its normalized count by hand

print(next(T.texts('horace.carmen_saeculare.tess')))
print()

for text, term_count in term_counts.items():
    if 'horace.carmen_saeculare' not in text:
        continue
    # Fetch the total by file id; a positional lookup can pair this work's
    # term count with another work's word count.
    word_count = word_counts.get(text, 0)
    # Guard against a zero/missing word total instead of dividing by zero
    ratio = term_count / word_count if word_count else 0.0
    print(text)
    print(term_count)
    print(word_count)
    print(ratio)
    print(ratio * 1000)

In [None]:
# Bar plot of the normalized data: the 50 works with the lowest rate
# (still drawn in descending order, since the slice is taken from the tail of
# the descending sort)

ranked_norm = sorted(term_norm_counts.items(), key=lambda pair: pair[1], reverse=True)
term_norm_counts_sorted = dict(ranked_norm[-50:])

keys = list(term_norm_counts_sorted)
vals = list(term_norm_counts_sorted.values())

# Explicit figure/axes pair; all labelling goes through the Axes object
fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title(f'"{TERM}" normalized counts in Latin Tesserae (bottom 50)', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18)
plt.show()

In [None]:
# Bar plot of the normalized data: the 50 lowest rates among works where TERM
# actually occurs (works with a rate of 0 are dropped first)

# NOTE: this rebinds term_norm_counts to a plain dict without the zero-valued
# entries, so cells run after this one see the filtered data. Re-run the
# "Normalize counts" cell to get the full mapping back.
term_norm_counts = {k: v for k, v in term_norm_counts.items() if v > 0}
ranked_nonzero = sorted(term_norm_counts.items(), key=lambda pair: pair[1], reverse=True)
term_norm_counts_sorted = dict(ranked_nonzero[-50:])

keys = list(term_norm_counts_sorted.keys())
vals = list(term_norm_counts_sorted.values())

plt.figure(figsize=(32, 8))
ax = sns.barplot(x=keys, y=vals, palette=pal);
# Title states the filter so this figure is distinguishable from the
# unfiltered bottom-50 plot
plt.title(f'"{TERM}" normalized counts in Latin Tesserae (bottom 50, nonzero)',fontsize=36)
plt.xlabel('Works', fontsize=24)
plt.ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18);
plt.show()