In [1]:
import random
import re
import ipywidgets as widgets
from IPython.display import display

In [2]:
from dicesapi import DicesAPI
# NotebookPBar is only needed so that progress bars render in Jupyter
from dicesapi.jupyter import NotebookPBar

# address of the DICES API instance to query (here: served from localhost, port 8000)
DICES_API_URL = 'http://localhost:8000/api'

api = DicesAPI(dices_api=DICES_API_URL)
api._ProgressClass = NotebookPBar

In [3]:
from cltk.corpus.utils.importer import CorpusImporter

# name templates of the corpora to import; '{}' is replaced by the language name
corpora = [
    '{}_models_cltk',
    '{}_text_perseus',
    '{}_treebank_perseus',
    '{}_lexica_perseus',
]

print('Importing corpora:')

for lang in ['latin', 'greek']:
    downloader = CorpusImporter(lang)
    for corpus in corpora:
        # fill in the template once and reuse it for both the log line and the import
        corpus_name = corpus.format(lang)
        print(f" - {corpus_name}")
        downloader.import_corpus(corpus_name)

# imported only after the corpora above have been fetched, as in the original cell order
from cltk.tokenize.word import WordTokenizer

# one word tokenizer per language, keyed by language name
tokenizer = {name: WordTokenizer(name) for name in ('greek', 'latin')}

Importing corpora:
 - latin_models_cltk
 - latin_text_perseus
 - latin_treebank_perseus
 - latin_lexica_perseus
 - greek_models_cltk
 - greek_text_perseus
 - greek_treebank_perseus
 - greek_lexica_perseus


# Download some speeches

In [None]:
# fetch the speeches of each poem separately ...
iliad_speeches = api.getSpeeches(work_title='Iliad', progress=True)
odyssey_speeches = api.getSpeeches(work_title='Odyssey', progress=True)

# ... then concatenate them and sort the combined collection in place
speeches = iliad_speeches + odyssey_speeches
speeches.sort()

# Download the text of the speeches

In [None]:
# one slot per speech; filled in below with the text of the matching CTS passage
passages = [None] * len(speeches)

# create a progress bar
pbar = widgets.IntProgress(
    value = 0,
    min = 0,
    max = len(speeches),
    bar_style='info',
    orientation='horizontal'
)
pbar_label = widgets.Label(value = f'Downloading {pbar.value}/{len(speeches)}')
display(widgets.HBox([pbar, pbar_label]))

# download text of speeches
for i, s in enumerate(speeches):
    cts_passage = s.getCTS()
    text = cts_passage.text
    passages[i] = text
    # i is 0-based, so i + 1 speeches are done at this point; using i + 1 keeps
    # the bar in step with the label and lets it reach `max` on the last speech
    pbar.value = i + 1
    pbar_label.value = f'Downloading {i+1}/{len(speeches)}'

# Lemmatizing

In [None]:
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer

# one backoff lemmatizer per language, keyed by language name
lemmatizer = {
    'greek': BackoffGreekLemmatizer(),
    'latin': BackoffLatinLemmatizer(),
}

# (pattern, replacement) pairs used to tidy up perseus texts for cltk
raw_replacements = {
    'greek': [
        (r'·', ','),           # FIXME: raised dot?
        (chr(700), chr(8217)), # two different apostrophes that look alike
    ],
    'latin': [],
}

# compile every pattern once, so lemmatize() can apply them directly
replacements = {
    lang_name: [(re.compile(pat), repl) for pat, repl in pairs]
    for lang_name, pairs in raw_replacements.items()
}
    

# generic clean-up + tokenize + lemmatize helper
def lemmatize(text, lang):
    '''Tidy, tokenize and lemmatize a string.

    text -- the string to process
    lang -- key into the module-level `replacements`, `tokenizer` and
            `lemmatizer` tables ('greek' or 'latin' in this notebook)

    Returns the output of `lemmatizer[lang].lemmatize(...)` unchanged; the
    rest of this notebook unpacks each element as a (token, lemma) pair.
    '''
    cleaned = text
    # apply the language-specific substitutions in the order they are listed
    for pattern, substitute in replacements[lang]:
        cleaned = pattern.sub(substitute, cleaned)

    return lemmatizer[lang].lemmatize(tokenizer[lang].tokenize(cleaned))

In [None]:
# one slot per passage; filled in below with the lemmatizer output for that passage
lemmata = [None] * len(passages)

# create a progress bar (sized by the list that is actually iterated below)
pbar = widgets.IntProgress(
    value = 0,
    min = 0,
    max = len(passages),
    bar_style='info',
    orientation='horizontal'
)
pbar_label = widgets.Label(value = f'Lemmatizing {pbar.value}/{len(passages)}')
display(widgets.HBox([pbar, pbar_label]))

# lemmatize the text of each speech
for i, p in enumerate(passages):
    # passages[i] belongs to speeches[i]; the language comes from the speech's work
    lang = speeches[i].work.lang
    lemmatized = lemmatize(p.lower(), lang)
    lemmata[i] = lemmatized
    # i is 0-based, so i + 1 passages are done at this point; using i + 1 keeps
    # the bar in step with the label and lets it reach `max` on the last passage
    pbar.value = i + 1
    pbar_label.value = f'Lemmatizing {i+1}/{len(passages)}'

# Compare two speeches to see whether they share lemmata

In [None]:
def lem_comp(lemmatized_a, lemmatized_b, inc_punc=False):
    '''Return the set of lemmata that occur in both lemmatized texts.

    lemmatized_a, lemmatized_b -- iterables of (token, lemma) pairs
    inc_punc -- when False (the default) the lemma 'punc' is removed
                from the result

    Only the lemma of each pair is used; the tokens are ignored.
    '''
    lemmas_a = {lemma for _token, lemma in lemmatized_a}
    lemmas_b = {lemma for _token, lemma in lemmatized_b}

    common = lemmas_a & lemmas_b

    if not inc_punc:
        # discard() is a no-op when 'punc' is not among the shared lemmata
        common.discard('punc')

    return common

In [None]:
# Compare every speech whose `part` is greater than 1 (presumably a reply within
# a conversation -- confirm against the DICES data model) with the speech
# immediately before it, and record what they share. `nshared_reply` is what the
# boxplot at the end of the notebook reads.
shared_lems_reply = []
nshared_reply = []

for i in range(1, len(lemmata)):
    if speeches[i].part > 1:
        shared = lem_comp(lemmata[i-1], lemmata[i])
        shared_lems_reply.append(shared)
        nshared_reply.append(len(shared))

        # one tab-separated row: both speeches, both text lengths, shared lemmata
        print('\t'.join([
            str(speeches[i-1]),
            str(speeches[i]),
            str(len(passages[i-1])),
            str(len(passages[i])),
            str(len(shared)),
            str(shared)
        ]))

In [None]:
# Baseline: pair every speech whose `part` is greater than 1 with a randomly
# chosen other speech and record what they share. `nshared_no_reply` is what the
# boxplot at the end of the notebook reads.
shared_lems_no_reply = []
nshared_no_reply = []

# dedicated, seeded generator so the random baseline is the same on every run
BASELINE_SEED = 0
rng = random.Random(BASELINE_SEED)

n_speeches = len(speeches)

for i in range(n_speeches):
    # a random partner other than i only exists when there are at least 2 speeches
    if speeches[i].part > 1 and n_speeches > 1:
        # draw uniformly from all indices except i: comparing a speech with
        # itself would trivially share every lemma and inflate the baseline
        j = rng.randrange(n_speeches - 1)
        if j >= i:
            j += 1

        shared = lem_comp(lemmata[j], lemmata[i])
        shared_lems_no_reply.append(shared)
        nshared_no_reply.append(len(shared))

        # one tab-separated row describing the pair that was actually compared
        # (random partner first, then speech i)
        print('\t'.join([
            str(speeches[j]),
            str(speeches[i]),
            str(len(passages[j])),
            str(len(passages[i])),
            str(len(shared)),
            str(shared)
        ]))

In [None]:
from matplotlib import pyplot
%matplotlib inline

In [None]:
# Distribution of the number of shared lemmata: speech paired with its
# predecessor vs. speech paired with a random other speech.
fig, ax = pyplot.subplots()
ax.boxplot([nshared_reply, nshared_no_reply])
# boxplot places the boxes at x = 1, 2, ...; label them so the figure stands alone
ax.set_xticks([1, 2])
ax.set_xticklabels(['preceding speech', 'random speech'])
ax.set_ylabel('number of shared lemmata')
ax.set_title('Shared lemmata by pairing')
pyplot.show()