# Preliminaries

### Import statements

In [None]:
# utils
import os
import re
import json
import git
import requests

# DICES packages
from dicesapi import DicesAPI, SpeechGroup
from dicesapi.text import CtsAPI, spacy_load
import dicesapi.text

# for working with local CTS repositories
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver
from MyCapytain.resources.prototypes.metadata import UnknownCollection

# for analysis
import pandas as pd

# verbose output
DEBUG = False

### Set up local text repositories

Here we clone Christopher's fork of the Perseus Greek and Latin texts, so that we can use a local CTS resolver instead of querying the Perseus server.

In [None]:
# names of the GitHub repositories (under the cwf2 account) to mirror in data/
repo_names = ['canonical-greekLit', 'canonical-latinLit']

print('Checking for local text repositories...')

for name in repo_names:
    target = os.path.join('data', name)
    source = f'https://github.com/cwf2/{name}.git'

    # clone only when nothing exists at the target path yet
    if not os.path.exists(target):
        print(f' - retrieving {source}')
        git.Repo.clone_from(source, target)
        continue

    print(f' - {target} exists!')

### Connection to DICES

In [None]:
# DICES API client, configured with a log file name and logdetail 0
api = DicesAPI(logfile='dices.log', logdetail=0)

### Set up local CTS connection

The CTS API allows us to retrieve texts by URN. In this example, we not only instantiate a default CTS API but also create a local resolver that can serve texts from the local repositories we cloned above.

We have to do a little surgery to overwrite the default CTS API object's resolver with the local one.

<div class="alert alert-warning" style="margin:1em 2em">
    <p><strong>Note:</strong> The resolver will generate a lot of errors; these can be ignored unless they pertain to a text you want to retrieve.</p>
</div>

In [None]:
# each repository listed in repo_names sits in its own subdirectory of data/
repo_paths = [os.path.join('data', name) for name in repo_names]

# resolver that reads from those directories and reuses the DICES API's logger
local_resolver = CtsCapitainsLocalResolver(repo_paths, logger=api.log)

# default CTS API object, tied to the DICES connection
cts = CtsAPI(dices_api=api)

# replace the CTS object's resolver table with one holding only the local resolver
cts._resolvers = {None: local_resolver}

# Data

### Download the entire DICES dataset

We'll start by downloading records for all the speeches in DICES. Then we can select the mother speeches locally.

In [None]:
# request speech records from the DICES API with no filter arguments
all_speeches = api.getSpeeches()

#### ⚠️ Workaround for certain Perseus texts

These texts have an extra hierarchical level inserted into their loci on Perseus' CTS server. This is a temporary workaround to convert our loci to a form that the server understands.

Because `all_speeches` and `mother_speeches` just contain pointers to the same object pool, we can do this modification once on `all_speeches` and the mother speeches will also be affected.

In [None]:
# works whose "book.line" loci are rewritten as "book.1.line"
adj_book_line = [
    'De Raptu Proserpinae',
    'In Rufinum',
]
# works whose bare "line" loci are rewritten as "1.line"
adj_line = [
    'Panegyricus de consulatu Manlii Theodori',
    'Panegyricus de Tertio Consulatu Honorii Augusti',
    'Panegyricus de Sexto Consulatu Honorii Augusti',
    'Epithalamium de Nuptiis Honorii Augusti',
    'De Bello Gothico',
    'Psychomachia',
]

# each rule: (pattern the whole locus must match, template filled from its groups)
book_line_rule = (re.compile(r'(\d+)\.(\d+)'), '{0}.1.{1}')
line_rule = (re.compile(r'(\d+)'), '1.{0}')

for s in all_speeches:
    title = s.work.title
    if title in adj_book_line:
        pattern, template = book_line_rule
    elif title in adj_line:
        pattern, template = line_rule
    else:
        continue

    # first and last locus are rewritten independently; a locus that does not
    # match the expected shape is left as it is
    for attr in ('l_fi', 'l_la'):
        found = pattern.fullmatch(getattr(s, attr))
        if found:
            setattr(s, attr, template.format(*found.groups()))

In [None]:
# adjust loci for perseus editions

errata_file = os.path.join('data', 'changed_loci.txt')
errata_table = pd.read_csv(errata_file, sep='\t', dtype=str)

# lookup from "author work old_first-old_last" to the (new_first, new_last) pair;
# the key has the same layout as the string built for each speech below
errata = {
    f'{row.author} {row.work} {row.l_fi_old}-{row.l_la_old}': (row.l_fi_new, row.l_la_new)
    for row in errata_table.itertuples()
}

for s in all_speeches:
    correction = errata.get(f'{s.author.name} {s.work.title} {s.l_range}')
    if correction is None:
        continue

    # print the speech before and after the change on a single output line
    print(f'Corrected {s}', end=' ')
    s.l_fi, s.l_la = correction
    print(f'to {s}')

### Get the text

Because we're retrieving the texts from a local repository, I've turned off caching to save memory.

In [None]:
# speeches for which no passage could be retrieved
failed = []
total = len(all_speeches)

for i, s in enumerate(all_speeches):
    # refresh the in-place progress line every 200 speeches and on the last one
    if (i % 200 == 0) or (i == total - 1):
        print(f'\r{round(i * 100 /total)} % complete', end='')

    # only fetch when the speech has no passage yet (attribute missing or None)
    if getattr(s, 'passage', None) is None:
        try:
            s.passage = cts.getPassage(s, cache=False)
        except Exception:
            # Best effort: a retrieval error marks the speech as failed.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit still stop this long loop.
            s.passage = None

    if s.passage is None:
        failed.append(s)

print()
if DEBUG:
    print (f'{len(failed)} failed:')
    for s in failed:
        print(f'\t{s.author.name} {s.work.title} {s.l_range}')

### Add supplementary text for speeches not in Perseus

In [None]:
path = os.path.join('data', 'supp_mother_speeches.txt')

# Decode explicitly as UTF-8 so the passage text is read the same way on every
# platform, instead of depending on the locale's default encoding.
with open(path, encoding='utf-8') as f:
    supplement = json.load(f)

# Index the speeches by id once instead of rescanning all of them for every
# record; setdefault keeps the first speech seen for an id, which is the one
# a first-match linear search would select.
speech_by_id = {}
for s in all_speeches:
    speech_by_id.setdefault(s.id, s)

for rec in supplement:
    s = speech_by_id.get(rec['id'])
    if s is None:
        # no speech with this id: nothing to attach the text to
        continue

    # build a Passage by hand from the supplementary record
    passage = dicesapi.text.Passage()
    s.passage = passage
    passage.line_array = rec['line_array']

    # _line_index[i] is the offset at which line i starts in the joined text
    passage._line_index = []
    offset = 0
    for line in passage.line_array:
        passage._line_index.append(offset)
        offset += len(line['text']) + 1  # +1 for the joining space

    passage.text = ' '.join(line['text'] for line in passage.line_array)
    passage.speech = s

### Remove speeches with no text available

In [None]:
# keep only the speeches whose passage is not None, then call .sorted() on the result
test_speeches = all_speeches.advancedFilter(lambda s: s.passage is not None).sorted()

### Add book number to line array for multi-book speeches

We have to add book identifiers to the line numbers in `line_array` for any speech spanning multiple books, in order to make sure that each line has a unique id.

In [None]:
# speeches in which at least one line had no number and was given an inferred one
flagged = []

for speech in test_speeches:
    lines = speech.passage.line_array

    # locus without any '.' separator: the existing line labels are used as they are
    if '.' not in speech.l_fi:
        for line in lines:
            line['N'] = line['n']
        continue

    first_prefix, first_line = speech.l_fi.rsplit('.', 1)
    last_prefix, _ = speech.l_la.rsplit('.', 1)

    if first_prefix != last_prefix:
        # first and last locus have different prefixes: treat the prefix as a
        # number and advance it whenever the line number drops by more than 100
        book = int(first_prefix)
        previous = int(first_line)

        for line in lines:
            current = int(line['n'])
            if previous - current > 100:
                book += 1
            line['N'] = f'{book}.{current}'
            previous = current
        continue

    # same prefix at both ends: prepend it to every label that lacks one
    counter = int(first_line) - 1

    for line in lines:
        label = line['n']
        if label is None:
            # unnumbered line: continue counting from the last number seen
            counter += 1
            line['N'] = f'{first_prefix}.{counter}'
            if speech not in flagged:
                flagged.append(speech)
        elif '.' in label:
            # label already contains a '.': keep it unchanged
            line['N'] = label
        else:
            line['N'] = '.'.join([first_prefix, label])
            # remember the numeric part (any 'a' removed) for later unnumbered lines
            counter = int(label.replace('a', ''))

if DEBUG:
    for speech in flagged:
        print(speech)
        for line in speech.passage.line_array:
            print(f'{line["N"]}\t{line["text"]}')
        print()

### Create fake URNs for any texts that don't have them

In [None]:
for s in test_speeches:
    work = s.work
    urn = work.urn

    # the work already has a non-empty URN: leave it alone
    if urn is not None and urn != '':
        continue

    # otherwise use the work's id, formatted as a string, as a stand-in URN
    work.urn = f'{work.id}'