In [None]:
# Imports

from cltkreaders.lat import LatinTesseraeCorpusReader
from collections import defaultdict
from pprint import pprint
import pickle

# Imports; for plotting

import seaborn as sns
import matplotlib.pyplot as plt
# Seaborn's "colorblind" palette.
# NOTE(review): `pal` is not referenced in the visible cells — confirm it is used later.
pal = sns.color_palette("colorblind")

In [None]:
# Instantiate the Tesserae Latin corpus reader and preview what it contains

T = LatinTesseraeCorpusReader()

# Fetch the file ids once and reuse them for both the count and the preview
corpus_files = T.fileids()
print(f'There are {len(corpus_files)} files in this corpus.\n')
print(f'Here are the first five (5) files: {corpus_files[:5]}')

In [None]:
# Keep only the corpus files whose id mentions Plautus, then list them
plautus = [fileid for fileid in T.fileids() if 'plautus' in fileid]
pprint(plautus)

In [None]:
# Report how many corpus files matched the 'plautus' filter
print(f'There are {len(plautus)} works by Plautus in this text collection.')

In [None]:
# File id of the play examined in the cells that follow
menaechmi = 'plautus.menaechmi.tess'

![](img/perseus_menaechmi_prologus.png)

In [None]:
# T.words() is consumed as an iterator here: pull just the first six items
menaechmi_words = T.words(menaechmi)

for position in range(6):
    token = next(menaechmi_words)
    print(token)

![](img/perseus_menaechmi_prologus_count.png)

In [None]:
# Materialise the play's words as a list so it can be sliced and reused
menaechmi_words = list(T.words(menaechmi))

# Preview the first 22 items
print(menaechmi_words[:22])

In [None]:
from latintools import preprocess

In [None]:
# Keep two views of the play: the unprocessed output of T.words() ...
menaechmi_tokens = list(T.words(menaechmi))
# ... and the same text passed through latintools' `preprocess`.
# NOTE(review): presumably this strips punctuation / normalises the text — confirm in latintools.
menaechmi_words = list(T.words(menaechmi, preprocess=preprocess))

# Preview the first 22 preprocessed items
print(menaechmi_words[:22])

In [None]:
# Total number of items in the current `menaechmi_words` list
menaechmi_word_count = len(menaechmi_words)

print(f"There are {menaechmi_word_count} words in Plautus's *Menaechmi*.")

In [None]:
# Collapse repeated words into a set to count distinct word forms
menaechmi_vocabulary = set(menaechmi_words)
menaechmi_unique_word_count = len(menaechmi_vocabulary)

print(f"There are {menaechmi_unique_word_count} unique words in Plautus's *Menaechmi*.")

**ANALOGUE CHALLENGE**

- Which words do you think occur most frequently in the *Menaechmi*? In groups of 3–4 people, make a list of the ten (10) words that you all expect to appear most often in the play.

In [None]:
from collections import Counter

# Tally how often each item in `menaechmi_words` occurs
word_counter = Counter(menaechmi_words)

# Show the 25 most frequent words together with their counts
print(word_counter.most_common(25))

In [None]:
# TODO: Build a single counter over every text available in the corpus.
# Cache the result, since a full pass over the corpus will take some time to run.

# Worth doing: it makes an interesting comparison with the single-play counts.

In [None]:
# Print the ten most frequent preprocessed words for the first three entries of `plautus`.
# The list was previously named `top_five` although it holds ten words.
for play in plautus[:3]:
    play_counts = Counter(T.words(play, preprocess=preprocess))
    # most_common(10) yields (word, count) pairs; keep only the words
    top_words = [word for word, _ in play_counts.most_common(10)]
    print(f'{play}: {", ".join(top_words)}')

In [None]:
# Print the ten most frequent preprocessed words for the first three corpus files
# whose id contains 'cicero'.
# The list was previously named `top_five` although it holds ten words.
cicero_works = [fileid for fileid in T.fileids() if 'cicero' in fileid]

for work in cicero_works[:3]:
    work_counts = Counter(T.words(work, preprocess=preprocess))
    # most_common(10) yields (word, count) pairs; keep only the words
    top_words = [word for word, _ in work_counts.most_common(10)]
    print(f'{work}: {", ".join(top_words)}')

In [None]:
from tqdm import tqdm

# Length of every Plautus play, in the same order as `plautus`.
# These lengths are charted as "Word count", so each play goes through the same
# `preprocess` step used to build `menaechmi_words`; unprocessed T.words() output
# is what the notebook keeps separately as `menaechmi_tokens`.
plautus_lens = []

for play in tqdm(plautus):
    # Count lazily instead of materialising each play as a full list
    play_len = sum(1 for _ in T.words(play, preprocess=preprocess))
    plautus_lens.append(play_len)


In [None]:
# Lengths only, in the same order as the `plautus` file list
print(plautus_lens)

In [None]:
# Pair each file id with its length: a list of (work, length) tuples
data = list(zip(plautus, plautus_lens))

In [None]:
from tabulate import tabulate

# Render the (work, length) pairs as a plain-text table
print(tabulate(data))

In [None]:
# Order the (work, length) pairs from longest to shortest
data = sorted(data, key=lambda pair: pair[1], reverse=True)

In [None]:
# Print the table again to show the current ordering of `data`
print(tabulate(data))

In [None]:
# Unpack the (work, length) pairs into parallel tuples for the bar plot:
# xs holds the file ids, ys the lengths (absolute counts — nothing is normalised here)

xs, ys = zip(*data)

In [None]:
import matplotlib.pyplot as plt

# One wide figure whose single axes spans the whole canvas
fig = plt.figure(figsize=(32, 8))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(xs, ys)

# Pin the current tick positions before replacing their labels
ax.set_xticks(ax.get_xticks())
ax.set_xticklabels(xs, rotation=45, ha="right", fontsize=20)

y_ticks = ax.get_yticks()
ax.set_yticks(y_ticks)
ax.set_yticklabels([int(tick) for tick in y_ticks], fontsize=16)

# Axis labels and title
ax.set_xlabel('Works', fontsize=24)
ax.set_ylabel('Word count', fontsize=24)
ax.set_title("Lengths of Plautus's plays (in words)", fontsize=36)
plt.show()