In [None]:
# Imports

from cltkreaders.lat import LatinTesseraeCorpusReader
from collections import defaultdict
from latintools import preprocess
from pprint import pprint
from tqdm import tqdm
import pickle

In [None]:
# Setup corpus
# Build the Tesserae reader with its default arguments; the cells below
# reach the corpus through this single `T` object.

T = LatinTesseraeCorpusReader()

In [None]:
# Get text
# `mena` is the file id of the text used as the running example below.
# NOTE(review): `T.words` presumably yields the text's tokens after applying
# `preprocess` — confirm against the reader's documentation.

mena = 'plautus.menaechmi.tess'
mena_words = T.words(mena, preprocess=preprocess)

In [None]:
# Find word in text

search_word = 'ego'

# Report the first exact token match; any() stops scanning as soon as one is found.
if any(token == search_word for token in mena_words):
    print('Found one!')

In [None]:
# Get lines

# Take the first item yielded by doc_rows() (a mapping, since it supports
# .items()) and flatten it to a list of pairs so it can be sliced and re-read.
mena_lines = list(next(T.doc_rows(mena)).items())
mena_lines[:5]

In [None]:
# Search text

# Pull out the first (citation, line) pair whose line contains the search term, if any.
first_match = next((pair for pair in mena_lines if search_word in pair[1]), None)

if first_match is not None:
    citation, line = first_match
    print('-----', citation, line, '-----', sep='\n')

In [None]:
# Search "all" texts

# "all" is in quotes because only the first two file ids are scanned here.
for fileid in T.fileids()[:2]:
    rows = next(T.doc_rows(fileid))
    hits = [citation for citation, line in rows.items() if search_word in line]
    for citation in hits:
        print(citation)

In [None]:
# Collect matches in "all" texts

# Maps file id -> number of lines containing 'ego', for the first five texts only.
# defaultdict is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
ego_counts = defaultdict(int)

for file in T.fileids()[:5]:
    lines = next(T.doc_rows(file))
    for citation, line in lines.items():
        # Adding 0 or 1 on every line also registers texts with no match at 0.
        # NOTE(review): if `line` is a str this is a substring test, so longer
        # words that contain "ego" match too, and a line is counted once however
        # many times the word occurs in it — confirm this is intended.
        ego_counts[file] += int('ego' in line)

pprint(ego_counts)

In [None]:
# Collect matches in all texts

# file id -> number of lines containing 'ego', over the whole corpus
ego_counts = defaultdict(int)

for fileid in T.fileids():
    rows = list(next(T.doc_rows(fileid)).items())
    for citation, line in rows:
        # The zero branch still touches the key, so texts without a match are recorded as 0.
        ego_counts[fileid] += 1 if 'ego' in line else 0

In [None]:
# Check assumptions

# Membership on the dict itself tests its keys and, unlike indexing a
# defaultdict, never inserts a missing one.
'plautus.menaechmi.tess' in ego_counts

In [None]:
# Imports; for plotting

import seaborn as sns
import matplotlib.pyplot as plt
# Shared palette, passed as `palette=` to the barplots below
pal = sns.color_palette("colorblind")

In [None]:
# Make barplot of counts

# Keep only the texts with at least one match (this rebinds ego_counts to a plain dict).
ego_counts = {fileid: count for fileid, count in ego_counts.items() if count > 0}

keys = list(ego_counts)
vals = list(ego_counts.values())

# One wide figure with a single axes; everything is drawn on `ax` explicitly.
fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title('ego raw counts in Latin Tesserae', fontsize=36)
ax.set_xlabel('Works', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18)
plt.show()


In [None]:
# Get detail of barplot, only left 10

# Rank texts by count, highest first, and keep the ten largest.
ranked = sorted(ego_counts.items(), key=lambda pair: pair[1], reverse=True)
ego_counts_sorted_left = dict(ranked[:10])

keys = list(ego_counts_sorted_left)
vals = list(ego_counts_sorted_left.values())

fig, ax = plt.subplots(figsize=(32, 8))
sns.barplot(x=keys, y=vals, palette=pal, ax=ax)
ax.set_title('ego raw counts in Latin Tesserae', fontsize=36)
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Count', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18)
plt.show()

In [None]:
# Get counts again (i.e. replace 0 values)

# Start from a fresh defaultdict so texts with no match are present again, at 0.
ego_counts = defaultdict(int)

for fileid in T.fileids():
    for citation, line in list(next(T.doc_rows(fileid)).items()):
        # int(True) == 1 and int(False) == 0; either way the key gets created.
        ego_counts[fileid] += int('ego' in line)

In [None]:
# Get word counts for all texts
# MB: Cached, takes a long(!) time to run
#
# The commented-out code below presumably built the cache; uncomment it to rebuild.
# NOTE(review): it writes to 'data/...' while the load below reads '../data/...'
# — confirm which path is right for the notebook's working directory.

# word_counts = defaultdict(int)

# for file in tqdm(T.fileids()):
#     word_count = len(list(T.words(file, preprocess=preprocess)))
#     word_counts[file] = word_count

# with open('data/latin_word_counts.pickle', 'wb') as cache_file:
#     pickle.dump(word_counts, cache_file)

# Load inside a `with` block so the file handle is closed as soon as the
# data is read, instead of being left open for the life of the kernel.
# pickle can execute arbitrary code while loading: only load a cache file
# you generated yourself.
with open('../data/latin_word_counts.pickle', 'rb') as cache_file:
    word_counts = pickle.load(cache_file)

In [None]:
# Compare lists

# First five (file id, value) pairs of each mapping, separated by a blank line.
word_counts_head = list(word_counts.items())[:5]
ego_counts_head = list(ego_counts.items())[:5]

pprint(word_counts_head)
print()
pprint(ego_counts_head)

In [None]:
# Workflow for normalizing

# Walk through the calculation for the first text only (note the `break`).
for text, ego_count in ego_counts.items():
    print(text)
    print(f'ego appears {ego_count} times...')
    # Look the total up by file id. Pairing the two dicts by position would
    # silently attach the wrong total if they differ in order or membership.
    word_count = word_counts[text]
    print(f'...in {word_count} total words')
    norm_count = (ego_count / word_count) * 1000
    # `.2f` prints two decimal places; the bare `.02` spec means two
    # significant digits and can switch to scientific notation.
    print(f'The normalized count is {norm_count:.2f} per 1000 words')
    break

In [None]:
# Normalize counts

# file id -> matches per 1000 words
ego_norm_counts = defaultdict(float)

for text, ego_count in ego_counts.items():
    # Pair the two dicts by file id, not by position: positional pairing
    # misattributes totals if the dicts differ in order or membership, and
    # rebuilding list(word_counts.values()) on every pass is quadratic.
    # Membership is tested explicitly because word_counts may itself be a
    # defaultdict, which would hand back 0 for a missing text.
    if text not in word_counts:
        raise KeyError(f'no cached word count for {text!r}')
    word_count = word_counts[text]
    # A text with no words cannot be normalized; record 0.0 rather than dividing by zero.
    ego_norm_counts[text] = (ego_count / word_count) * 1000 if word_count else 0.0

list(ego_norm_counts.items())[:5]

In [None]:
# Make bar plot of normalized data

# Number of texts shown. The title interpolates the same value, so the
# caption cannot drift from the slice (it previously read "top 50" while
# only 10 bars were drawn).
top_n = 10

ego_norm_counts_sorted = dict(sorted(ego_norm_counts.items(), key=lambda x: x[1], reverse=True)[:top_n])

keys = list(ego_norm_counts_sorted.keys())
vals = list(ego_norm_counts_sorted.values())

plt.figure(figsize=(32, 8))
ax = sns.barplot(x=keys, y=vals, palette=pal);
plt.title(f'ego normalized counts in Latin Tesserae (top {top_n})', fontsize=36)
plt.xlabel('Works', fontsize=24)
# The plotted values are rates, not raw counts, so the axis label states the unit.
plt.ylabel('Count per 1000 words', fontsize=24)
ax.set_xticklabels(labels=keys, rotation=75, horizontalalignment='right', fontsize=18);
plt.show()

In [None]:
# Show example

text = 'plautus.menaechmi.tess'

# Raw match count first, then total word count, for the same text.
for counts in (ego_counts, word_counts):
    print(counts[text])

In [None]:
# Show example

# The key with the largest value; on a tie the earliest key in the dict wins.
max_ego = max(ego_counts, key=ego_counts.get)

# "diagrammed": the same lookup, one step at a time
texts = list(ego_counts.keys())
counts = list(ego_counts.values())
highest_count = max(counts)
position = counts.index(highest_count)  # index of the first occurrence of the maximum
max_ego = texts[position]


print(max_ego)
for lookup in (ego_counts, word_counts):
    print(lookup[max_ego])