# Preliminaries

### Install prerequisites
This is necessary for Google Colab.

In [None]:
# python packages
!pip install -q GitPython MyCapytain
!pip install -q git+https://github.com/cwf2/dices-client

# language models
!pip install -q https://huggingface.co/latincy/la_core_web_lg/resolve/1973a08557127e1d306ab70239bfb73f560a8cb4/la_core_web_lg-any-py3-none-any.whl
!pip install -q https://huggingface.co/chcaa/grc_odycy_joint_trf/resolve/main/grc_odycy_joint_trf-any-py3-none-any.whl


### Import statements

In [None]:
# utils
import os
import re
import json
import git
import requests

# DICES packages
from dicesapi import DicesAPI, SpeechGroup
from dicesapi.text import CtsAPI, spacy_load
import dicesapi.text

# for working with local CTS repositories
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver
from MyCapytain.resources.prototypes.metadata import UnknownCollection

# for analysis
import pandas as pd

# verbose output
DEBUG = False

### Download data files

This is also necessary for Google Colab.

In [None]:
# Fetch the auxiliary data files from the dices-mta repository, unless a local
# copy already exists in data/.
os.makedirs("data", exist_ok=True)
for filename in ["changed_loci.txt", "supp_mother_speeches.txt"]:
    path = os.path.join('data', filename)
    if not os.path.exists(path):
        res = requests.get(
            f'https://raw.githubusercontent.com/cwf2/dices-mta/main/data/{filename}',
            timeout=60,  # do not hang forever on a stalled connection
        )
        # Fail loudly on an HTTP error: otherwise the body of the error page
        # would be saved as the data file and, because the file then exists,
        # never be downloaded again on later runs.
        res.raise_for_status()
        with open(path, 'wb') as f:
            f.write(res.content)

### Set up local text repositories

Here we clone Christopher's fork of the Perseus Greek and Latin texts, so that we can use a local CTS resolver instead of querying the Perseus server.

In [None]:
# repositories to mirror under data/ (the list is reused when the local
# CTS resolver is created)
repo_names = ['canonical-greekLit', 'canonical-latinLit']

print('Checking for local text repositories...')

for repo_name in repo_names:
    clone_dir = os.path.join('data', repo_name)
    clone_url = f'https://github.com/cwf2/{repo_name}.git'

    # only clone when no directory of that name is present yet
    if not os.path.exists(clone_dir):
        print(f' - retrieving {clone_url}')
        git.Repo.clone_from(clone_url, clone_dir)
        continue

    print(f' - {clone_dir} exists!')

### Connection to DICES

In [None]:
# DICES client shared by the rest of the notebook
api = DicesAPI(logfile='dices.log', logdetail=0)

### Set up local CTS connection

This is the CTS API, allowing us to retrieve texts by URN. In this example, we not only instantiate a default CTS API, but we also create a local resolver that can serve texts from the local repositories we cloned in the “Set up local text repositories” step above.

We have to do a little surgery to overwrite the default CTS API object's resolver with the local one.

<div class="alert alert-warning" style="margin:1em 2em">
    <p><strong>Note:</strong> The resolver will generate a lot of errors; these can be ignored unless they pertain to a text you want to retrieve.</p>
</div>

In [None]:
# where each cloned repository lives on disk
repo_paths = [os.path.join('data', name) for name in repo_names]

# resolver that serves texts from those directories; it reports through
# the DICES client's logger
local_resolver = CtsCapitainsLocalResolver(
    repo_paths,
    logger=api.log,
)

# CTS API bound to the DICES connection
cts = CtsAPI(dices_api=api)

# replace the API's resolver table so that the local resolver is the only entry
cts._resolvers = {
    None: local_resolver,
}

# Data

### Download the entire DICES dataset

We'll start by downloading records for all the speeches in DICES. Then we can select the mother speeches locally.

In [None]:
# getSpeeches() is called without any filter arguments; the resulting
# collection is the pool of speech records that all later cells work on
all_speeches = api.getSpeeches()

#### ⚠️ Workaround for certain Perseus texts

These texts have an extra hierarchical level inserted into their loci on Perseus' CTS server. This is a temporary workaround to convert our loci to a form that the server understands.

Because `all_speeches` and `mother_speeches` just contain pointers to the same object pool, we can do this modification once on `all_speeches` and the mother speeches will also be affected.

In [None]:
# works whose loci of the form 'B.L' are rewritten to 'B.1.L'
adj_book_line = [
    'De Raptu Proserpinae',
    'In Rufinum',
]
# works whose loci consisting of a bare number 'L' are rewritten to '1.L'
adj_line = [
    'Panegyricus de consulatu Manlii Theodori',
    'Panegyricus de Tertio Consulatu Honorii Augusti',
    'Panegyricus de Sexto Consulatu Honorii Augusti',
    'Epithalamium de Nuptiis Honorii Augusti',
    'De Bello Gothico',
    'Psychomachia',
]

# one rule per title list: (titles, pattern a locus must fully match,
# function building the replacement locus from the match)
locus_rules = (
    (adj_book_line, r'(\d+)\.(\d+)', lambda m: f'{m.group(1)}.1.{m.group(2)}'),
    (adj_line, r'(\d+)', lambda m: '1.' + m.group(1)),
)

for s in all_speeches:
    for titles, pattern, rewrite in locus_rules:
        if s.work.title not in titles:
            continue

        # first and last locus are treated the same way; a locus that does
        # not match the pattern (e.g. one already rewritten) is left alone
        for attr in ('l_fi', 'l_la'):
            m = re.fullmatch(pattern, getattr(s, attr))
            if m:
                setattr(s, attr, rewrite(m))

        # only the first applicable rule is used
        break

In [None]:
# adjust loci for perseus editions

errata_file = os.path.join('data', 'changed_loci.txt')

# every column is read as a string so that loci are not parsed as numbers
errata_table = pd.read_csv(errata_file, sep='\t', dtype=str)

# lookup: 'author work old_first-old_last' -> (new_first, new_last)
errata = {
    f'{row.author} {row.work} {row.l_fi_old}-{row.l_la_old}': (row.l_fi_new, row.l_la_new)
    for row in errata_table.itertuples()
}

for s in all_speeches:
    # NOTE(review): the lookup only hits if s.l_range renders as 'l_fi-l_la'
    key = f'{s.author.name} {s.work.title} {s.l_range}'
    corrected = errata.get(key)
    if corrected is None:
        continue

    # the speech is printed before and after the change, on one output line
    print(f'Corrected {s}', end=' ')
    s.l_fi, s.l_la = corrected
    print(f'to {s}')


## Get the text

Because we're retrieving the texts from a local repository, I've turned off caching to save memory.

In [None]:
# speeches for which no passage could be retrieved
failed = []

for i, s in enumerate(all_speeches):
    # progress, rewritten in place every 200 speeches and on the last one
    if (i % 200 == 0) or (i == len(all_speeches) - 1):
        print(f'\r{round(i * 100 /len(all_speeches))} % complete', end='')

    # speeches that already carry a passage (e.g. from an earlier run of
    # this cell) are not fetched again
    if not hasattr(s, 'passage') or s.passage is None:
        try:
            s.passage = cts.getPassage(s, cache=False)
        except Exception:
            # A failed retrieval is recorded as "no passage". Catching
            # Exception rather than using a bare except keeps Ctrl-C /
            # "interrupt kernel" working during this long loop.
            s.passage = None
    if s.passage is None:
        failed.append(s)

# terminate the progress line
print()
if DEBUG:
    print (f'{len(failed)} failed:')
    for s in failed:
        print(f'\t{s.author.name} {s.work.title} {s.l_range}')

### Add supplementary text for speeches not in Perseus

In [None]:
path = os.path.join('data', 'supp_mother_speeches.txt')

# Decode explicitly as UTF-8: without an encoding argument open() falls back
# to a locale-dependent default, which can fail on or garble non-ASCII text
# on some platforms.
with open(path, encoding='utf-8') as f:
    supplement = json.load(f)

# Each record supplies the lines of one speech, identified by its DICES id;
# a Passage object is built by hand for the first speech with that id.
for rec in supplement:
    for s in all_speeches:
        if s.id == rec['id']:
            if DEBUG:
                print(s)
            s.passage = dicesapi.text.Passage()
            s.passage.line_array = rec['line_array']

            # _line_index[i] is the character offset at which line i starts
            # in the joined text below; the +1 is the space between lines
            s.passage._line_index = []
            cumsum = 0
            for i in range(len(s.passage.line_array)):
                s.passage._line_index.append(cumsum)
                cumsum += len(s.passage.line_array[i]['text']) + 1
            s.passage.text = ' '.join([l['text'] for l in s.passage.line_array])
            s.passage.speech = s
            break

### Remove speeches with no text available

In [None]:
# keep only the speeches that have a passage attached; the order of the
# result is whatever `.sorted()` defines for the collection
test_speeches = all_speeches.advancedFilter(lambda s: s.passage is not None).sorted()

### Add book number to line array for multi-book speeches

We have to add book identifiers to the line numbers in `line_array` for any speech spanning multiple books, in order to make sure that each line has a unique id.

In [None]:
# Give every record in each passage's line_array a key 'N' holding the full
# line identifier, derived from the record's own 'n' and the speech's loci.
# `flagged` collects speeches in which a line had no 'n' and was numbered here.
flagged = []
for s in test_speeches:
    # first locus has no '.'-separated prefix: N is a plain copy of n
    if '.' not in s.l_fi:
        for rec in s.passage.line_array:
            rec['N'] = rec['n']
    else:
        # split each locus into prefix (everything before the last '.') and final number
        pref_fi, n_fi = s.l_fi.rsplit('.', 1)
        pref_la, n_la = s.l_la.rsplit('.', 1)

        if pref_fi == pref_la:
            # first and last locus share a prefix, so it is applied to every line;
            # n tracks the last line number seen (starting one below the first
            # line) so that unnumbered lines can be numbered by counting upwards
            n = int(n_fi) - 1

            for rec in s.passage.line_array:
                if rec['n'] is None:
                    # unnumbered line: continue counting from the previous line
                    n = n + 1
                    rec['N'] = pref_fi + '.' + str(n)
                    if s not in flagged:
                        flagged.append(s)
                elif '.' not in rec['n']:
                    # bare line number: prepend the shared prefix
                    rec['N'] = pref_fi + '.' + rec['n']
                    # an 'a' in the number (as in '12a') is stripped before converting
                    n = int(rec['n'].replace('a', ''))
                else:
                    # n already contains a '.', so it is used unchanged
                    rec['N'] = rec['n']
        else:
            # prefixes differ: the prefix is treated as an integer counter
            # NOTE(review): int() below assumes a purely numeric prefix and
            # purely numeric, non-missing rec['n'] values -- confirm for all texts
            pref = int(pref_fi)
            old_n = int(n_fi)

            for rec in s.passage.line_array:
                n = int(rec['n'])
                # a line number more than 100 below the previous one advances
                # the prefix by one (presumably the start of the next book)
                if n < (old_n - 100):
                    pref = pref + 1
                rec['N'] = f'{pref}.{n}'
                old_n = n

# in debug mode, list the renumbered lines of every flagged speech
if DEBUG:
    for s in flagged:
        print(s)
        for rec in s.passage.line_array:
            print(f'{rec["N"]}\t{rec["text"]}')
        print()

### Create fake URNs for any texts that don't have them

In [None]:
# a work with no URN (None or empty string) gets its DICES work id,
# rendered as a string, as a stand-in
for s in test_speeches:
    work = s.work
    if work.urn is not None and work.urn != '':
        continue
    work.urn = f'{work.id}'

# Run NLP
## Spacy

In [None]:
# initialize spacy models
spacy_load(
    latin_model = 'la_core_web_lg',
    greek_model = 'grc_odycy_joint_trf',
)

In [None]:
# speeches that still have no spaCy document after the pipeline was run
failed = []

for i, s in enumerate(test_speeches):
    # progress, rewritten in place every 50 speeches and on the last one
    if (i % 50 == 0) or (i == len(test_speeches) - 1):
        print(f'\r{round(i * 100 /len(test_speeches))} % complete', end='')

    # passages that already have a document are not parsed again
    if s.passage.spacy_doc is None:
        s.passage.runSpacyPipeline()
    if s.passage.spacy_doc is None:
        failed.append(s)

# terminate the progress line so that the summary below starts on its own line
print()

if len(failed) > 0:
    print(f'SpaCy failed for {len(failed)} speeches:')
    for s in failed:
        print(f' - {s.work.urn}\t{s.work.title}\t{s.l_range}')

### Generate tabular data

In [None]:
# one dict per token; converted to a data frame in a single step below
spacy_rows = []

# extract features
for i, s in enumerate(test_speeches):
    # progress
    if (i % 200 == 0) or (i == len(test_speeches) - 1):
        print(f'\r{round(i * 100 /len(test_speeches))} % complete', end='')

    # speech-level columns are the same for every token of the speech, so
    # they are assembled once per speech instead of once per token
    speech_cols = dict(
        speech_id = s.id,
        lang = s.lang,
        author = s.author.name,
        work = s.work.title,
        urn = s.work.urn,
        l_fi = s.l_fi,
        l_la = s.l_la,
        nlines = len(s.passage.line_array),
        spkr = ','.join([inst.name for inst in s.spkr]),
        addr = ','.join([inst.name for inst in s.addr]),
        part = s.part,
        level = s.level,
    )

    # process all tokens in speech
    for tok in s.passage.spacy_doc:
        # look the line up once; None means no line was found for the token
        line_idx = s.passage.getLineIndex(tok)
        line_n = s.passage.line_array[line_idx]['N'] if line_idx is not None else None

        # dict(mapping, **kwargs) keeps the speech columns first, so the
        # column order of the resulting frame is unchanged
        spacy_rows.append(dict(
            speech_cols,
            line_n = line_n,
            # NOTE(review): when line_n is None this yields '<urn>:None',
            # whereas the CLTK table stores None in that case
            line_id = f'{s.work.urn}:{line_n}',
            token = tok.text,
            tok_id = f'{s.id}:{s.passage.getTextPos(tok)}',
            lemma = tok.lemma_,
            pos = tok.pos_,
            mood = tok.morph.get('Mood'),
            tense = tok.morph.get('Tense'),
            voice = tok.morph.get('Voice'),
            person = tok.morph.get('Person'),
            number = tok.morph.get('Number'),
            case = tok.morph.get('Case'),
            gender = tok.morph.get('Gender'),
            verbform = tok.morph.get('VerbForm'),
            degree = tok.morph.get('Degree'),
            prontype = tok.morph.get('PronType'),
        ))

# terminate the progress line
print()

# convert to data frame
spacy_tokens = pd.DataFrame(spacy_rows)

# simplify list cells: empty list -> None, otherwise comma-joined values
cols = ['mood', 'tense', 'voice', 'person', 'number', 'case', 'gender', 'verbform', 'degree', 'prontype']
spacy_tokens[cols] = spacy_tokens[cols].map(lambda x: None if len(x) == 0 else ','.join(x))

# display
display(spacy_tokens)

## CLTK

In [None]:
# speeches that still have no CLTK document after the pipeline was run
failed = []

for i, s in enumerate(test_speeches):
    # progress, rewritten in place every 200 speeches and on the last one
    if (i % 200 == 0) or (i == len(test_speeches) - 1):
        print(f'\r{round(i * 100 /len(test_speeches))} % complete', end='')

    # passages that already have a document are not parsed again
    if s.passage.cltk_doc is None:
        try:
            s.passage.runCltkPipeline()
        except Exception as err:
            # Best effort: report the offending speech, its text and the
            # reason, then carry on. Catching Exception rather than using a
            # bare except keeps Ctrl-C / "interrupt kernel" working.
            print()
            print(s)
            print(s.passage.text)
            print(f'{type(err).__name__}: {err}')
    if s.passage.cltk_doc is None:
        failed.append(s)

# terminate the progress line so that the summary below starts on its own line
print()

if len(failed) > 0:
    print(f'CLTK failed for {len(failed)} speeches:')
    for s in failed:
        print(f' - {s.work.urn}\t{s.work.title}\t{s.l_range}')

In [None]:
# Helper function to extract CLTK features as strings

def getCltkFeature(token, feature, default=None):
    '''Look up one feature of a token by the string form of its name.

    The keys of `token.features` are compared via `str()`, so `feature`
    must be the string rendering of the wanted key.

    Returns a list with the string form of every value stored under that
    key, or `default` when the key is absent (or maps to None).
    '''
    bundle = token.features
    by_name = {str(key): values for key, values in zip(bundle.keys(), bundle.values())}

    found = by_name.get(feature)
    return default if found is None else [str(value) for value in found]

In [None]:
# one dict per token; converted to a data frame in a single step below
cltk_rows = []

# extract features
for i, s in enumerate(test_speeches):
    # progress
    if (i % 200 == 0) or (i == len(test_speeches) - 1):
        print(f'\r{round(i * 100 /len(test_speeches))} % complete', end='')

    # speech-level columns are the same for every token of the speech, so
    # they are assembled once per speech instead of once per token
    speech_cols = dict(
        speech_id = s.id,
        lang = s.lang,
        author = s.author.name,
        work = s.work.title,
        urn = s.work.urn,
        l_fi = s.l_fi,
        l_la = s.l_la,
        nlines = len(s.passage.line_array),
        spkr = ','.join([inst.name for inst in s.spkr]),
        addr = ','.join([inst.name for inst in s.addr]),
        part = s.part,
        level = s.level,
    )

    # process all tokens in speech
    for tok in s.passage.cltk_doc:
        # look the line up once; None means no line was found for the token
        line_idx = s.passage.getLineIndex(tok)
        if line_idx is not None:
            line_n = s.passage.line_array[line_idx]['N']
        else:
            # no line for this token: infer one from its position in the document
            tok_idx = s.passage.getCltkWordIndex(tok)
            if tok_idx == 0:
                # first word of the document -> first line
                line_n = s.passage.line_array[0]['N']
            elif tok_idx == len(s.passage.cltk_doc.words) - 1:
                # last word of the document -> last line
                line_n = s.passage.line_array[-1]['N']
            else:
                # interior word: accept a line only when both neighbours
                # have a line and agree on it
                left_tok = s.passage.cltk_doc[tok_idx-1]
                left_line_idx = s.passage.getLineIndex(left_tok)
                right_tok = s.passage.cltk_doc[tok_idx+1]
                right_line_idx = s.passage.getLineIndex(right_tok)
                if (left_line_idx is not None) and (right_line_idx is not None) and (left_line_idx == right_line_idx):
                    line_n = s.passage.line_array[left_line_idx]['N']
                else:
                    line_n = None

        # dict(mapping, **kwargs) keeps the speech columns first, so the
        # column order of the resulting frame is unchanged
        cltk_rows.append(dict(
            speech_cols,
            line_n = line_n,
            line_id = f'{s.work.urn}:{line_n}' if line_n is not None else None,
            token = tok.string,
            tok_id = f'{s.id}:{s.passage.getTextPos(tok)}',
            lemma = tok.lemma,
            pos = tok.upos,
            mood = getCltkFeature(tok, 'Mood'),
            tense = getCltkFeature(tok, 'Tense'),
            voice = getCltkFeature(tok, 'Voice'),
            aspect = getCltkFeature(tok, 'Aspect'),
            person = getCltkFeature(tok, 'Person'),
            number = getCltkFeature(tok, 'Number'),
            case = getCltkFeature(tok, 'Case'),
            gender = getCltkFeature(tok, 'Gender'),
            degree = getCltkFeature(tok, 'Degree'),
            verbform = getCltkFeature(tok, 'VerbForm'),
        ))

# terminate the progress line
print()

cltk_tokens = pd.DataFrame(cltk_rows)

# simplify list cells: None stays None, otherwise comma-joined values
cols = ['mood', 'tense', 'voice', 'aspect', 'person', 'number', 'case', 'gender', 'degree', 'verbform']
cltk_tokens[cols] = cltk_tokens[cols].map(lambda x: None if x is None else ','.join(x))

# display results
display(cltk_tokens)

## Add Greek question marks

In [None]:
# one row per ';' found in the text of a non-Latin speech
question_mark_rows = []

for s in test_speeches:
    # speeches tagged "latin" are not scanned
    if s.lang != "latin":
        for match in re.finditer("(;)", s.passage.text):
            char_pos = match.start()

            # line of the ';': the last line whose start offset is not
            # beyond the match (the first line if there is none)
            l_idx = 0
            for idx, line_start in enumerate(s.passage._line_index):
                if line_start > char_pos:
                    break
                l_idx = idx
            line_n = s.passage.line_array[l_idx]["N"]

            # same leading columns as the token tables; morphological
            # columns are left out and therefore come out as missing values
            question_mark_rows.append({
                'speech_id': s.id,
                'lang': s.lang,
                'author': s.author.name,
                'work': s.work.title,
                'urn': s.work.urn,
                'l_fi': s.l_fi,
                'l_la': s.l_la,
                'nlines': len(s.passage.line_array),
                'spkr': ','.join([inst.name for inst in s.spkr]),
                'addr': ','.join([inst.name for inst in s.addr]),
                'part': s.part,
                'level': s.level,
                'line_n': line_n,
                'line_id': f'{s.work.urn}:{line_n}',
                'token': ";",
                'tok_id': f'{s.id}:{char_pos}',
                'lemma': ";",
                'pos': "PUNCT",
            })

extra_rows = pd.DataFrame(question_mark_rows)

In [None]:
# Append the ';' rows to the spaCy table, then order all rows by speech and
# position.
# NOTE: this cell reads and reassigns `spacy_tokens`, so running it a second
# time appends `extra_rows` again.
combined = pd.concat([spacy_tokens, extra_rows], ignore_index=True)

# tok_id is '<speech id>:<text position>'; both parts are compared as integers
id_parts = combined["tok_id"].str.split(":")
sort_keys = pd.DataFrame({
    "left": id_parts.str[0].astype(int),
    "right": id_parts.str[1].astype(int),
})

spacy_tokens = (
    combined
    .loc[sort_keys.sort_values(by=["left", "right"]).index]
    .reset_index(drop=True)
)

## Deduplicate embedded lines

Replace any NA values in the **line_id** column of the CLTK table with a placeholder built from the work's URN and the token itself.

In [None]:
# tokens without a line id get '<urn>:<token>' as a stand-in
no_line = cltk_tokens['line_id'].isna()
orphans = cltk_tokens.loc[no_line, ['urn', 'token']]
cltk_tokens.loc[no_line, 'line_id'] = orphans['urn'] + ':' + orphans['token']

In [None]:
# highest `level` found among the CLTK tokens of each line id
# (this table is also used when the spaCy tokens are filtered)
max_levels = cltk_tokens.groupby('line_id').agg(level=('level', 'max'))

# keep a token only when its own level equals the maximum for its line
line_max = cltk_tokens['line_id'].map(max_levels['level'])
cltk_no_dups = cltk_tokens.loc[cltk_tokens['level'] == line_max]

cltk_no_dups.to_csv('cltk_tokens.csv', index=False)
display(cltk_no_dups)

In [None]:
# Same filter for the spaCy tokens, using the per-line maxima computed from
# the CLTK table; a line id that is absent from that table has no maximum,
# so its tokens are dropped.
line_max = spacy_tokens['line_id'].map(max_levels['level'])
spacy_no_dups = spacy_tokens.loc[spacy_tokens['level'] == line_max]

spacy_no_dups.to_csv('spacy_tokens.csv', index=False)
display(spacy_no_dups)

### Alignment

In [None]:
# CLTK columns to attach to the spaCy rows; tok_id is the join key
cols = ['tok_id', 'token', 'lemma', 'pos', 'mood', 'tense', 'voice', 'aspect', 'person', 'number', 'case', 'gender', 'degree', 'verbform']
cltk_side = cltk_no_dups[cols]

# left join: every spaCy row is kept; column names present on both sides
# receive the _spacy / _cltk suffixes
merged = pd.merge(
    spacy_no_dups,
    cltk_side,
    how='left',
    on='tok_id',
    suffixes=('_spacy', '_cltk'),
)

merged_path = os.path.join('data', 'merged.csv')
merged.to_csv(merged_path, index=False)
display(merged)