In [None]:
### NB: Much of what is included below draws on examples from the "NLTK Book" (https://www.nltk.org/book/ch01.html) by Steven Bird, Ewan Klein, and Edward Loper. The book is highly recommended for continuing with the kinds of work introduced in the workshop.

In [None]:
## Imports

from pprint import pprint

from cltkreaders.readers import PerseusTreebankCorpusReader
from nltk import FreqDist, Text

from texttools import preprocess

In [None]:
# Set up the corpus reader over the XML files in ../data/texts/

# The file pattern is written as a raw string so that `\.` is passed through
# unchanged as an escaped dot; in a plain string literal `\.` is an invalid
# escape sequence and triggers a warning on recent Python versions.
PT = PerseusTreebankCorpusReader('../data/texts/', r'.*\.xml')

In [None]:
# Print an excerpt of the raw XML of the first document in the corpus

sample_doc = next(PT.docs())

# Character offsets of the excerpt to display
excerpt_start, excerpt_end = 8273, 9182
print(sample_doc[excerpt_start:excerpt_end])

In [None]:
# Print the first five words yielded by the corpus reader

words = PT.words()

for _word_number in range(5):
    print(next(words))

In [None]:
# Join all words of the corpus into one space-separated string

word_separator = " "
iliad_text = word_separator.join(PT.words())

# Preview the first 150 characters
iliad_text[:150]

In [None]:
# Preprocess text
# (`preprocess` is imported in the Imports cell at the top of the notebook.)

# NOTE: this rebinds `iliad_text`, so re-running this cell on its own passes
# the already-preprocessed text through `preprocess` again.
iliad_text = preprocess(iliad_text)

# Preview the first 150 characters of the preprocessed text
iliad_text[:150]

In [None]:
# Split the text on whitespace to get a flat list of tokens
iliad_tokens = iliad_text.split()

# Preview the first tokens
preview_size = 15
print(iliad_tokens[:preview_size])

In [None]:
# Wrap the token list in an NLTK Text object
# (`Text` is imported in the Imports cell at the top of the notebook.)

iliad_Text = Text(iliad_tokens)

# Display the object's repr as the cell output
iliad_Text

In [None]:
# Check the type of the Text object (shown as the cell output)

type(iliad_Text)

In [None]:
# List five collocations from the text, one per line

collocations = iliad_Text.collocation_list(num=5)

# Each collocation is joined with a single space before printing
for collocation_line in map(' '.join, collocations):
    print(collocation_line)

In [None]:
# Draw a dispersion plot for three names in the unlemmatized text

names_to_plot = ['ἀχιλλεύς', 'πρίαμος', 'πάτροκλος']
displot = iliad_Text.dispersion_plot(names_to_plot)

In [None]:
# Make lemmatized Text object

tokenized_sents = PT.tokenized_sents()

# Collect one cleaned lemma per token. Each lemma is preprocessed exactly once
# (with spaces removed) and the emptiness check is made on that same value, so
# the string that is tested is the string that is stored.
# NOTE(review): assumes token[1] holds the token's lemma — confirm against the reader.
all_lemmas = []
for sent in tokenized_sents:
    for token in sent:
        # Skip tokens that carry no lemma at all
        if not token[1]:
            continue
        lemma = preprocess(token[1], remove_spaces=True)
        # Skip lemmas that are empty after preprocessing
        if lemma:
            all_lemmas.append(lemma)

iliad_lemmatized_Text = Text(all_lemmas)

In [None]:
# Before: dispersion of the three names in the unlemmatized text

names_to_plot = ['ἀχιλλεύς', 'πρίαμος', 'πάτροκλος']
displot = iliad_Text.dispersion_plot(names_to_plot)

In [None]:
# After: dispersion of the same three names in the lemmatized text

names_to_plot = ['ἀχιλλεύς', 'πρίαμος', 'πάτροκλος']
displot = iliad_lemmatized_Text.dispersion_plot(names_to_plot)

In [None]:
# Other ideas: compare a pair of contrasting lemmas ("good" vs. "bad")

contrast_lemmas = ['ἀγαθός', 'κακός']
displot = iliad_lemmatized_Text.dispersion_plot(contrast_lemmas)

In [None]:
# Other ideas: compare a pair of colour lemmas ("black" vs. "white")

colour_lemmas = ['μέλας', 'λευκός']
displot = iliad_lemmatized_Text.dispersion_plot(colour_lemmas)

In [None]:
# Plot lemma frequencies
# (`FreqDist` is imported in the Imports cell at the top of the notebook.)

# Count how often each lemma occurs in the lemmatized text
freqs = FreqDist(iliad_lemmatized_Text)

# Plot 50 samples, non-cumulative; the trailing `;` suppresses the plot's return-value repr
freqs.plot(50, cumulative=False);

In [None]:
# Plot the same frequency distribution as a cumulative curve

samples_to_plot = 50
freqs.plot(samples_to_plot, cumulative=True);