In [None]:
### NB: Much of what is included below draws on examples from the "NLTK Book" (https://www.nltk.org/book/ch01.html) by Steven Bird, Ewan Klein, and Edward Loper. The book is highly recommended for continuing with the kinds of work introduced in the workshop.

In [None]:
## Imports

from cltkreaders.lat import LatinTesseraeCorpusReader
from pprint import pprint
from natsort import natsorted
from latintools import preprocess

In [None]:
# Set up the corpus reader; later cells list file ids, read texts, and
# fetch tokenized sentences through `T`
# NOTE(review): presumably needs the Tesserae Latin corpus available locally — confirm

T = LatinTesseraeCorpusReader()

In [None]:
# Sample text: collect the corpus file ids containing 'aeneid' (sorted with
# natsorted), then join the first text yielded for each file with newlines

aeneid = natsorted(fileid for fileid in T.fileids() if 'aeneid' in fileid)
aeneid_text = "\n".join(
    next(T.texts(fileid, preprocess=preprocess)) for fileid in aeneid
)

In [None]:
# Sanity check: show the first and the last 100 characters of the joined text
print('Beginning of Aeneid...', aeneid_text[:100], sep='\n')

print()

print('End of Aeneid...', aeneid_text[-100:], sep='\n')

In [None]:
# Make a Text object with NLTK
# The joined text is split on whitespace into tokens, the token list is
# wrapped in nltk.Text, and the resulting object is printed

from nltk import Text
aeneid_tokens = aeneid_text.split()
aeneid_Text = Text(aeneid_tokens)
print(aeneid_Text)

In [None]:
# Check type of Text object (shown as the cell's output value)

type(aeneid_Text)

In [None]:
# Check methods of Text object
# dir() lists every attribute name, including underscore-prefixed ones

pprint(dir(aeneid_Text))

In [None]:
# Same listing again, this time restricted to names that do not start
# with an underscore

pprint([name for name in dir(aeneid_Text) if not name.startswith('_')])

In [None]:
# Make a KWIC (keyword-in-context) list
# Text.concordance() prints its results and returns None, so binding its
# result to a variable stores nothing. Text.concordance_list() returns the
# matching lines themselves, so `concordance` holds real data that can be
# inspected or reused; each entry's `.line` is the formatted context line.

concordance = aeneid_Text.concordance_list('aeneas', width=50, lines=5)
print('\n'.join(hit.line for hit in concordance))

In [None]:
# Get collocations: ask for ten and print each one on its own line,
# with its words separated by single spaces

collocations = aeneid_Text.collocation_list(num=10)

for words in collocations:
    print(*words)

In [None]:
# Make a dispersion plot for three character names in the unlemmatized text
# NOTE(review): the call's result is bound to `displot`; if Text.dispersion_plot
# returns nothing in the installed NLTK, `displot` is simply None — confirm

displot = aeneid_Text.dispersion_plot(['aeneas', 'dido', 'turnus'])

In [None]:
# Get tokenized sents
# The result of T.tokenized_sents(aeneid) is consumed with next(), so `sample`
# is the first item it yields; only the first five elements are printed

tokenized_sents = T.tokenized_sents(aeneid)
sample = next(tokenized_sents)
print(sample[:5])

In [None]:
# Get list of lemmas
# token[1] is treated as the token's lemma. Each lemma is normalized once with
# preprocess(..., remove_spaces=True) and kept only when the normalized form is
# non-empty, so the emptiness check is made on exactly the string that gets
# stored, and preprocess runs once per token instead of twice.

tokenized_sents = T.tokenized_sents(aeneid)

all_lemmas = []
for sent in tokenized_sents:
    for token in sent:
        raw_lemma = token[1]
        if not raw_lemma:
            continue  # nothing at the lemma position for this token
        lemma = preprocess(raw_lemma, remove_spaces=True)
        if lemma:
            all_lemmas.append(lemma)
        

In [None]:
# Make lemmatized Text object from the list of lemmas collected above in `all_lemmas`

aeneid_lemmatized_Text = Text(all_lemmas)

In [None]:
# Print the lemmatized Text object's string form
print(aeneid_lemmatized_Text)

In [None]:
# Before, unlemmatized: dispersion plot of the three names over the
# whitespace-token Text, for comparison with the lemmatized version

displot = aeneid_Text.dispersion_plot(['aeneas', 'dido', 'turnus'])

In [None]:
# After, lemmatized: the same three names plotted over the lemma-based Text

displot = aeneid_lemmatized_Text.dispersion_plot(['aeneas', 'dido', 'turnus'])

In [None]:
# Other ideas: dispersion of the lemmas 'arma' and 'uir'

displot = aeneid_lemmatized_Text.dispersion_plot(['arma', 'uir'])

In [None]:
# Other ideas: dispersion of the lemmas 'bonus' and 'malus'

displot = aeneid_lemmatized_Text.dispersion_plot(['bonus', 'malus'])

In [None]:
# Other ideas: dispersion of the lemmas 'albus' and 'ater'

displot = aeneid_lemmatized_Text.dispersion_plot(['albus', 'ater'])

In [None]:
# Find similar words for 'aeneas' in the lemmatized Text (up to 10 requested)
# NOTE(review): Text.similar appears to print its results rather than return
# them, in which case `similars` is None — confirm against the NLTK docs

similars = aeneid_lemmatized_Text.similar('aeneas', num=10)

In [None]:
# Find similar words for 'pater' in the lemmatized Text (up to 10 requested)
# NOTE(review): Text.similar appears to print its results rather than return
# them, in which case `similars` is None — confirm against the NLTK docs

similars = aeneid_lemmatized_Text.similar('pater', num=10)

In [None]:
# Get frequencies
# Count every token of the lemmatized Text and print the 15 most common
# (lemma, count) pairs

from collections import Counter
counts = Counter(aeneid_lemmatized_Text)
print(counts.most_common(15))

In [None]:
# Make "stops": use the ten most frequent lemmas as an ad hoc stop list

stops = [lemma for lemma, _count in counts.most_common(10)]
print(stops)

In [None]:
# Remove stops: keep only the lemmas that are not in the stop list, then
# wrap the filtered lemmas in a new Text object

stopped_lemmas = [token for token in all_lemmas if token not in stops]
aeneid_lemmatized_stopped_Text = Text(stopped_lemmas)

In [None]:
# Find similar words for 'pater' again, now with the stop lemmas removed
# NOTE(review): Text.similar appears to print its results rather than return
# them, in which case `similars` is None — confirm against the NLTK docs
similars = aeneid_lemmatized_stopped_Text.similar('pater', num=10)

In [None]:
# Generate text ?!?!
# Text.generate() prints the generated passage itself and also returns it, so
# wrapping the call in print() shows the passage twice. Call it directly; the
# trailing semicolon keeps the notebook from echoing the returned string.

aeneid_Text.generate(text_seed='aeneas est'.split());

In [None]:
# Get frequencies; similar to the Counter built earlier, but using NLTK's
# FreqDist over the same lemmatized Text

from nltk import FreqDist
freqs = FreqDist(aeneid_lemmatized_Text)

In [None]:
# Plot freqs: first 50 samples, non-cumulative
# The trailing ';' suppresses the notebook's echo of the call's return value

freqs.plot(50, cumulative=False);

In [None]:
# Plot freqs, cumulative: same 50 samples with cumulative=True
# The trailing ';' suppresses the notebook's echo of the call's return value

freqs.plot(50, cumulative=True);

In [None]:
# Length of Text: number of tokens in the lemmatized Text (shown as cell output)

len(aeneid_lemmatized_Text)