## Preliminaries

In [None]:
# this lets me change the api while the notebook is open
%load_ext autoreload
%autoreload 2

# import statements
import pandas as pd
import re
import ipywidgets as widgets
from IPython.display import display

In [None]:
from dicesapi import DicesAPI
from dicesapi.jupyter import NotebookPBar

# Collect the client settings in one place, then build the API object from
# them: the DICES endpoint is a server on localhost:8000, the CTS endpoint is
# the Perseids service, NotebookPBar is passed as the progress class, and
# 'dices.log' is passed as the log file.
api_settings = {
    'dices_api': 'http://localhost:8000/api',
    'cts_api': 'http://cts.perseids.org/api/cts/',
    'progress_class': NotebookPBar,
    'logfile': 'dices.log',
}
api = DicesAPI(**api_settings)

In [None]:
from cltk.corpus.utils.importer import CorpusImporter
# Name templates for the CLTK corpora to import; '{}' takes the language name.
corpora = [
    '{}_models_cltk',
    '{}_text_perseus',
    '{}_treebank_perseus',
    '{}_lexica_perseus',
]

print('Importing corpora:')

# One importer per language; each template is filled in, announced on stdout
# and then imported.
for lang in ('latin', 'greek'):
    downloader = CorpusImporter(lang)
    for corpus in corpora:
        corpus_name = corpus.format(lang)
        print(f" - {corpus_name}")
        downloader.import_corpus(corpus_name)

from cltk.tokenize.word import WordTokenizer
# One WordTokenizer per supported language, keyed by the language name.
tokenizer = {name: WordTokenizer(name) for name in ('greek', 'latin')}
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer
# One backoff lemmatizer per supported language, keyed by the language name.
lemmatizer = dict(
    greek=BackoffGreekLemmatizer(),
    latin=BackoffLatinLemmatizer(),
)

# Regular-expression replacements that tidy up Perseus texts before they are
# handed to CLTK. Each entry is a (pattern, replacement) pair, applied in order.
replacements = {
    'greek': [
        # Raised dot: U+00B7 MIDDLE DOT and U+0387 GREEK ANO TELEIA render
        # alike and either may occur in Greek text, so match both and turn
        # them into a comma.
        (r'[\u00b7\u0387]', ','),
        # Two apostrophes that look alike: U+02BC MODIFIER LETTER APOSTROPHE
        # (chr(700)) becomes U+2019 RIGHT SINGLE QUOTATION MARK (chr(8217)).
        (chr(700), chr(8217)),
    ],
    'latin': [
        # no clean-up rules for Latin
    ],
}

# Compile every pattern once, up front, so that callers can use .sub()
# directly on the stored pattern objects.
for lang in ['greek', 'latin']:
    replacements[lang] = [(re.compile(pat), repl) for pat, repl in replacements[lang]]
    

# generic tokenize-lemmatize function
def lemmatize(text, lang):
    '''Clean, tokenize and lemmatize a string.

    text: the string to process.
    lang: key into the module-level `replacements`, `tokenizer` and
          `lemmatizer` dicts ('greek' or 'latin' as defined in this notebook).

    Returns whatever the language's lemmatizer produces for the token list
    (presumably (token, lemma) pairs -- confirm against CLTK).
    '''

    # apply the language's clean-up substitutions in order
    cleaned = text
    for pattern, substitute in replacements[lang]:
        cleaned = pattern.sub(substitute, cleaned)

    words = tokenizer[lang].tokenize(cleaned)
    return lemmatizer[lang].lemmatize(words)

## Download some speeches

In [None]:
# Ask the API for speeches with spkr_name='Achilles', with progress=True.
speeches = api.getSpeeches(spkr_name='Achilles', progress=True)

In [None]:
# Bare expression: the notebook shows `speeches` as this cell's output.
speeches

## Look for quotation marks

In [None]:
# Take the last speech in the list and keep the result of its getCTS() call.
s = speeches[-1].getCTS()

In [None]:
# Print the tag of every element yielded by iterating over the passage XML.
# `e` stays bound to the last element after the loop finishes.
for e in s.xml.iter():
    print(e.tag)

In [None]:
# Search the passage XML for a `q` element. The namespace map is taken from
# the XML object itself rather than from a loop variable left over from
# another cell, so this cell does not fail with a NameError when run alone.
# NOTE(review): assumes the relevant default namespace is declared on the
# root element -- confirm against a real passage.
s.xml.find('.//q', namespaces=s.xml.nsmap)

## How many speeches have a `<q>` tag?

In [None]:
# Rebind `speeches` to the result of a getSpeeches() call with no speaker
# filter, with progress=True.
speeches = api.getSpeeches(progress=True)

In [None]:
# Call getCTS() on every speech and store the result on the speech as `.cts`.
# Speeches whose call raises are reported, collected in `cts_failed`, and get
# `.cts = None` so later cells can tell them apart.
cts_failed = []
for s in speeches:
    try:
        s.cts = s.getCTS()
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop this long-running loop instead of being
        # recorded as a failed speech.
        print(f'CTS error for: {s}')
        cts_failed.append(s)
        s.cts = None

In [None]:
# Sort every speech that has a CTS passage into exactly one of three lists:
#   tag_succeeded - the passage XML contains a `q` element
#   sq_succeeded  - no `q` element, but the passage text contains '‘'
#   failed        - neither of the above
# Speeches whose `.cts` is None are left out of all three.
tag_succeeded, sq_succeeded, failed = [], [], []

for s in speeches:
    if s.cts is None:
        continue

    if s.cts.xml.find('.//q', namespaces=s.cts.xml.nsmap) is not None:
        target = tag_succeeded
    elif '‘' in s.cts.text:
        target = sq_succeeded
    else:
        target = failed
    target.append(s)

In [None]:
# Sizes of the three result lists, printed on one line separated by spaces.
counts = [len(group) for group in (tag_succeeded, sq_succeeded, failed)]
print(*counts)

In [None]:
# Bare expression: the notebook shows the `failed` list as this cell's output.
failed

In [None]:
# NOTE(review): `passage` is not assigned in any cell visible here; this
# raises NameError unless it was defined elsewhere in the session -- confirm.
passage

In [None]:
# Show the `text` attribute of `passage` as the cell output.
# NOTE(review): `passage` is not assigned in any cell visible here -- confirm
# where it comes from.
passage.text

In [None]:
# `s` is whatever an earlier cell bound last; when the cells run in order and
# `speeches` is non-empty, that is the last speech iterated over.
print(s.urn)