## Preliminaries

In [1]:
# this lets me change the api while the notebook is open
%load_ext autoreload
%autoreload 2

# import statements
import pandas as pd
import re
import ipywidgets as widgets
from IPython.display import display

In [2]:
# Build the client object (`api`) that the later cells use to fetch speeches.
# The DICES endpoint points at localhost:8000, so a DICES server has to be
# running on this machine; the recorded output of this cell shows a
# ModuleNotFoundError, i.e. the `dicesapi` package must be importable too.
from dicesapi import DicesAPI
api = DicesAPI(
    dices_api = 'http://localhost:8000/api',
    cts_api = 'http://cts.perseids.org/api/cts/',
)

ModuleNotFoundError: No module named 'dicesapi'

In [10]:
# this is just to provide progress bars in Jupyter
# NOTE(review): this overwrites an underscore-prefixed attribute on `api`;
# presumably the client uses this class when called with progress=True — confirm.
from dicesapi.jupyter import NotebookPBar
api._ProgressClass = NotebookPBar

In [3]:
# Import the CLTK corpora needed for Latin and Greek.
# Each entry in `corpora` is a name template; '{}' is filled with the language.
# NOTE(review): import_corpus presumably fetches data from the network on each
# run of this cell — confirm, and consider guarding it if re-runs are slow.
from cltk.corpus.utils.importer import CorpusImporter
corpora = [
    '{}_models_cltk',
    '{}_text_perseus',
    '{}_treebank_perseus',
    '{}_lexica_perseus',
]

print('Importing corpora:')

# One importer per language; every template is imported for every language.
for lang in ['latin', 'greek']:
    downloader = CorpusImporter(lang)
    for corpus in corpora:
        print(" - " + corpus.format(lang))
        downloader.import_corpus(corpus.format(lang))

from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer

# Lookup tables keyed by language name, so that callers can pick the right
# tool with tokenizer[lang] / lemmatizer[lang].
tokenizer = {language: WordTokenizer(language) for language in ('greek', 'latin')}
lemmatizer = dict(
    greek=BackoffGreekLemmatizer(),
    latin=BackoffLatinLemmatizer(),
)

# Per-language clean-up rules applied to Perseus texts before they reach CLTK.
# Every rule is a (regular expression, replacement string) pair.
replacements = {
    'greek': [
        (r'·', ','),            # FIXME: is this the raised dot?
        (chr(700), chr(8217)),  # U+02BC -> U+2019: two look-alike apostrophes
    ],
    'latin': [],
}

# Swap each pattern string for its compiled form, one language at a time.
# Only the values of existing keys are reassigned, so iterating the dict is safe.
for lang in replacements:
    replacements[lang] = [
        (re.compile(pattern), substitute)
        for pattern, substitute in replacements[lang]
    ]
    

# generic tokenize-lemmatize function
def lemmatize(text, lang):
    '''Clean up, tokenize and lemmatize a string.

    text -- the string to process
    lang -- key into the module-level `replacements`, `tokenizer` and
            `lemmatizer` tables ('greek' or 'latin' as defined in this notebook)

    Every (pattern, replacement) rule registered for `lang` is applied in
    order, the result is tokenized, and the token list is handed to the
    language's lemmatizer.  Whatever that lemmatizer returns is returned
    unchanged (presumably (token, lemma) pairs, one per token -- confirm
    against the CLTK version in use).
    '''
    cleaned = text
    for regex, substitute in replacements[lang]:
        cleaned = regex.sub(substitute, cleaned)

    words = tokenizer[lang].tokenize(cleaned)
    result = lemmatizer[lang].lemmatize(words)
    return result

Importing corpora:
 - latin_models_cltk
 - latin_text_perseus
 - latin_treebank_perseus
 - latin_lexica_perseus
 - greek_models_cltk
 - greek_text_perseus
 - greek_treebank_perseus
 - greek_lexica_perseus


## Download some speeches

In [4]:
# Fetch the speeches selected by spkr_name='Achilles'.
# NOTE: `speeches` is rebound to an unfiltered fetch in a later cell, so the
# cells in between rely on this Achilles-only list.
speeches = api.getSpeeches(spkr_name='Achilles', progress=True)

In [5]:
# Show the fetched speeches (the notebook renders the list's repr).
speeches

[<Speech: Homer Iliad 1.59-1.67>,
 <Speech: Homer Iliad 1.85-1.91>,
 <Speech: Homer Iliad 1.122-1.129>,
 <Speech: Homer Iliad 1.149-1.171>,
 <Speech: Homer Iliad 1.202-1.205>,
 <Speech: Homer Iliad 1.216-1.218>,
 <Speech: Homer Iliad 1.225-1.244>,
 <Speech: Homer Iliad 1.293-1.303>,
 <Speech: Homer Iliad 1.334-1.344>,
 <Speech: Homer Iliad 1.352-1.356>,
 <Speech: Homer Iliad 1.365-1.412>,
 <Speech: Homer Iliad 9.197-9.198>,
 <Speech: Homer Iliad 9.202-9.204>,
 <Speech: Homer Iliad 9.308-9.429>,
 <Speech: Homer Iliad 9.607-9.619>,
 <Speech: Homer Iliad 9.644-9.655>,
 <Speech: Homer Iliad 11.608-11.615>,
 <Speech: Homer Iliad 16.7-16.19>,
 <Speech: Homer Iliad 16.49-16.100>,
 <Speech: Homer Iliad 16.126-16.129>,
 <Speech: Homer Iliad 16.200-16.209>,
 <Speech: Homer Iliad 16.233-16.248>,
 <Speech: Homer Iliad 16.839-16.841>,
 <Speech: Homer Iliad 18.6-18.14>,
 <Speech: Homer Iliad 18.79-18.93>,
 <Speech: Homer Iliad 18.98-18.126>,
 <Speech: Homer Iliad 18.182-18.182>,
 <Speech: Homer Ilia

## Look for quotation marks

In [6]:
# Retrieve the CTS passage for the last speech in the list.
# NOTE: the name `s` is reused as a loop variable over speeches in later
# cells, so after those run it no longer refers to this passage.
s = speeches[-1].getCTS()

In [7]:
# List the tag of every element in the passage's XML, in iteration order.
# NOTE: the loop variable `e` stays bound after the loop and is read by the
# following cell (`e.nsmap`); do not rename or remove it in isolation.
for e in s.xml.iter():
    print(e.tag)

{http://www.tei-c.org/ns/1.0}TEI
{http://www.tei-c.org/ns/1.0}text
{http://www.tei-c.org/ns/1.0}body
{http://www.tei-c.org/ns/1.0}div
{http://www.tei-c.org/ns/1.0}div
{http://www.tei-c.org/ns/1.0}l
{http://www.tei-c.org/ns/1.0}l
{http://www.tei-c.org/ns/1.0}l
{http://www.tei-c.org/ns/1.0}l


In [8]:
# Look for a <q> element anywhere in the passage.  The namespace map is taken
# from the passage's own XML rather than from `e`, a loop variable left over
# from another cell, so this cell no longer depends on that cell having run.
# find() returns None when there is no match, in which case nothing is shown.
s.xml.find('.//q', namespaces=s.xml.nsmap)

## How many speeches have a `<q>` tag?

In [11]:
# Fetch speeches again, this time with no speaker filter.  This rebinds
# `speeches`, replacing the Achilles-only list from the earlier section.
speeches = api.getSpeeches(progress=True)

HBox(children=(IntProgress(value=0, bar_style='info', max=2362), Label(value='0/2362')))

In [12]:
# Attach the CTS passage to every speech as `s.cts`.  Speeches whose passage
# cannot be retrieved are reported, collected in `cts_failed`, and get
# `s.cts = None` so later cells can skip them.
cts_failed = []
for s in speeches:
    try:
        s.cts = s.getCTS()
    except Exception:
        # Catch Exception instead of using a bare `except:` so that
        # KeyboardInterrupt and SystemExit still stop the loop.
        print(f'CTS error for: {s}')
        cts_failed.append(s)
        s.cts = None

CTS error for: <Speech: Homer Odyssey 9.2-9.1472>
CTS error for: <Speech: Homer Odyssey 11.378-11.1093>
CTS error for: <Speech: Apollonius Argonautica 3.727-3.739>
CTS error for: <Speech: Virgil Aeneid 2.3-2.3,715>
CTS error for: <Speech: Virgil Aeneid 7.68b-7.70>
CTS error for: <Speech: Ovid Metamorphoses 1.545-1.546>
CTS error for: <Speech: Ovid Metamorphoses 4.793-4.803>
CTS error for: <Speech: Ovid Metamorphoses 10.62-10.62b>
CTS error for: <Speech: Ovid Metamorphoses 14.383-14.385>


In [None]:
# Sort every speech that has a CTS passage into one of three buckets:
#   tag_succeeded -- the passage XML contains a <q> element
#   sq_succeeded  -- no <q>, but the passage text contains a ‘ character
#   failed        -- neither of the above
tag_succeeded, sq_succeeded, failed = [], [], []

for s in speeches:
    # speeches without a passage (s.cts is None) go in no bucket
    if s.cts is None:
        continue

    if s.cts.xml.find('.//q', namespaces=s.cts.xml.nsmap) is not None:
        tag_succeeded.append(s)
    elif '‘' in s.cts.text:
        sq_succeeded.append(s)
    else:
        failed.append(s)

In [None]:
# Bucket sizes, in the order: <q> tag found, ‘ found, neither.
print(*map(len, (tag_succeeded, sq_succeeded, failed)))

In [None]:
# Show the speeches whose passage has neither a <q> element nor a ‘ character.
failed

In [None]:
# NOTE(review): `passage` is not assigned in any visible cell of this
# notebook; unless it is defined elsewhere, this cell relies on leftover
# kernel state and raises NameError on a fresh run.  Define it or drop the cell.
passage

In [None]:
# NOTE(review): `passage` is not assigned in any visible cell of this
# notebook; unless it is defined elsewhere, this cell relies on leftover
# kernel state and raises NameError on a fresh run.
passage.text

In [None]:
# `s` is whatever an earlier cell last bound it to: when the cells run in
# order, that is the final item visited by the loop over `speeches`.
print(s.urn)