# 1. General notes

### Local CTS server

The public Scaife CTS server from Perseus doesn't provide Quintus. The text exists in the [canonical-greekLit](https://github.com/PerseusDL/canonical-greekLit) Git repo, but it's not configured for Nautilus to serve it. I created my own [canonical-greekLit fork](https://github.com/cwf2/canonical-greekLit) and edited Quintus until Nautilus was happy.

Run the following code in a terminal window to install and run the server locally.

```bash
    git clone https://github.com/cwf2/canonical-greekLit
    capitains-nautilus canonical-greekLit --port 5000
```

### Odyssey variant reading

The DICES database has a speech by Circe to Odysseus beginning at Od. 10.456; but in the Perseus edition, 456 is missing and the speech begins at 457. I've manually changed the speech start line here to agree with Perseus, avoiding an error when we download the text.

# 2. Steps for processing the speeches

## Import statements

In [None]:
from dicesapi import DicesAPI
from dicesapi.jupyter import NotebookPBar
from dicesapi.text import CtsAPI
import pandas as pd
import os
import pickle

## Initialize connections to DICES and CTS server

In [None]:
# Open the connection to the DICES database (logfile: dices.log).
api = DicesAPI(logfile='dices.log')

# Open the connection to the CTS text server. The entry keyed by None points
# at the local Nautilus server; to go back to the public Scaife endpoint use
# 'https://scaife-cts.perseus.org/api/cts' instead.
cts = CtsAPI(dices_api=api, servers={None: 'http://localhost:5000/cts'})

# Path templates; '{work}' is filled in later with str.format(work=...).
# - pickle_file: cache of the downloaded and parsed speeches
# - csv_file: token table exported for use in Excel
pickle_file = os.path.join('..', 'data', '{work}_speeches.pickle')
csv_file = os.path.join('..', 'data', '{work}.csv')

## Workflow

I'm setting up the steps as a series of function definitions so that it's easier to loop over the individual texts.

### Download the speech metadata

In [None]:
def dlSpeechData(work):
    '''Fetch the speech records for one work from the DICES API

    `work` is title-cased before being passed to the API as `work_title`.
    For the Odyssey only, two manual adjustments are applied to the results
    (see the comments below). Returns the list of speech objects.
    '''

    print(f'Retrieving speeches for {work}')

    title = work.title()
    speeches = api.getSpeeches(work_title=title)
    print('retrieved', len(speeches), 'results')

    # every other work is returned exactly as downloaded
    if title != 'Odyssey':
        return speeches

    # kludge for a textual variant: the speech recorded as starting at
    # 10.456 is moved to 10.457
    for speech in speeches:
        if speech.l_fi == '10.456':
            speech.l_fi = '10.457'

    def book(locus):
        # book number is the part of a 'book.line' locus before the first dot
        return locus.split('.')[0]

    # second kludge, to remove the apologia: keep only speeches whose first
    # and last lines fall in the same book
    return [speech for speech in speeches if book(speech.l_fi) == book(speech.l_la)]

### Download the text of the speeches

In [None]:
def dlSpeechText(speeches):
    '''Fetch the text of each speech from the CTS server

    The passage is stored on the speech object as `passage`. Speeches that
    already have a non-None `passage` are not downloaded again. The progress
    bar advances once per speech either way. Nothing is returned; the speech
    objects are modified in place.
    '''
    progress = NotebookPBar(max=len(speeches), prefix='Downloading text')

    for speech in speeches:
        # a missing attribute and an explicit None both mean "not yet fetched"
        needs_text = getattr(speech, 'passage', None) is None
        if needs_text:
            speech.passage = cts.getPassage(speech)
        progress.update()

### Parse the text

In [None]:
def parseSpeechText(speeches):
    '''Parse the text of every speech with the CLTK NLP pipeline

    A speech with no passage is reported with a printed message and left
    alone. A passage whose `cltk` attribute is missing or None is parsed via
    its runCltkPipeline method (with remove_punct=True); passages that
    already have a `cltk` value are not parsed again. Nothing is returned.
    '''

    progress = NotebookPBar(max=len(speeches), prefix='Running NLP')

    for speech in speeches:
        passage = getattr(speech, 'passage', None)
        if passage is None:
            print('no passage:', speech)
        elif getattr(passage, 'cltk', None) is None:
            passage.runCltkPipeline(remove_punct=True)
        progress.update()

### Format tokens as a table

In [None]:
def makeTokenTable(speeches):
    '''Create a DataFrame with one row per token

    Each row combines speech-level metadata (id, book, speaker and addressee
    names and genders) with token-level data (surface string, lemma, part of
    speech, features, and the line the token sits on). `is_voc` is True when
    the string form of the token's features contains 'vocative'.

    Speeches without a parsed passage (no `passage`, or a passage whose
    `cltk` is None) are reported and skipped rather than raising, since
    parseSpeechText leaves such speeches in place. An empty input yields an
    empty table that still has all the columns.

    Punctuation tokens are dropped from the result.
    '''
    columns = [
        'speech_id', 'book', 'line', 'l_ind', 'spkr', 'addr',
        'gend_spkr', 'gend_addr', 'string', 'lemma', 'upos',
        'is_voc', 'features',
    ]

    rows = []
    for s in speeches:
        passage = getattr(s, 'passage', None)
        tokens = getattr(passage, 'cltk', None)
        if tokens is None:
            print('skipping speech with no parsed text:', s)
            continue

        # values shared by every token of this speech: compute them once
        book = s.l_fi.split('.')[0]
        spkr = s.getSpkrString()
        addr = s.getAddrString()
        gend_spkr = ','.join(sorted(set(inst.gender for inst in s.spkr)))
        gend_addr = ','.join(sorted(set(inst.gender for inst in s.addr)))

        for w in tokens:
            # position of the token's line within the passage (0-based)
            line_index = passage.getLineIndex(w)
            features = str(w.features)
            rows.append(dict(
                speech_id = s.id,
                book = book,
                line = passage.line_array[line_index]['n'],
                l_ind = line_index + 1,
                spkr = spkr,
                addr = addr,
                gend_spkr = gend_spkr,
                gend_addr = gend_addr,
                string = w.string,
                lemma = w.lemma,
                upos = w.upos,
                is_voc = 'vocative' in features,
                features = features,
            ))

    # naming the columns explicitly keeps them present when there are no rows
    words = pd.DataFrame(rows, columns=columns)

    # filter out punctuation tokens
    words = words[(words.string != '.') & (words.upos != 'PUNCT')]

    return words

### Run the whole workflow on a specific text

In [None]:
def runWorkflow(work):
    '''Run all the previous functions in order on one text

    If a pickle cache for `work` exists, the speeches are loaded from it;
    otherwise they are downloaded, their text is fetched and parsed, and the
    result is written to the cache. In both cases the token table is then
    built, written to the CSV file for `work`, and returned.
    '''

    print('Processing', work.title())

    cache = pickle_file.format(work=work)
    output = csv_file.format(work=work)

    # create the target folders up front: discovering that the data folder
    # is missing only after the download and parse would throw that work away
    for path in (cache, output):
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder, exist_ok=True)

    # use cached data if present
    # NOTE: unpickling runs whatever the file contains; only load cache
    # files produced by this notebook
    if os.path.exists(cache):
        with open(cache, 'rb') as f:
            speeches = pickle.load(f)
        print('loaded', len(speeches), 'cached results')
    else:
        speeches = dlSpeechData(work)
        dlSpeechText(speeches)
        parseSpeechText(speeches)
        with open(cache, 'wb') as f:
            pickle.dump(speeches, f)
        print('saved', len(speeches), 'results to', cache)

    # generate tabular data
    words = makeTokenTable(speeches)

    # save output
    print(f'Writing {output}')
    words.to_csv(output, index=False)

    # return the table
    return words

# 3. Try it out

## process one text

### select work

Change this to one of `'iliad'`, `'odyssey'`, or `'posthomerica'`

In [None]:
# name of the text to process; runWorkflow() title-cases it for its status
# message and inserts it unchanged into the cache and CSV file names
work = 'iliad'

### run the workflow

In [None]:
# build the token table for the selected text (from cache if available);
# runWorkflow() also writes the table to a CSV file
words = runWorkflow(work)

### inspect the table of values

In [None]:
# bare expression at the end of a notebook cell: shows the table
words

## summary statistics

### vocatives by book

In [None]:
# Count the vocative tokens in each book and draw them as a bar chart.
# Summing is_voc over the full table (instead of filtering to vocatives
# first) keeps a zero-height bar for any book that has no vocatives.
# sort=False keeps the books in the order they first appear in the table.
words.groupby('book', sort=False)['is_voc'].sum().plot.bar(
    title = f'Vocatives in the {work.title()}',
    legend = False,
    rot = 0,
    ylabel = 'count',
    figsize = (10,4),
)

### Normalized for book length

In [None]:
# token counts per book, split into one column per value of is_voc
voc_book = words.pivot_table(
    values = 'speech_id',
    index = 'book',
    columns = 'is_voc',
    aggfunc = 'count',
    fill_value = 0,
    sort = False,
)
voc_book = voc_book.rename(columns={False:'other', True:'voc'})

# vocatives per 1000 tokens of each book
voc_book['prop'] = voc_book['voc'].div(voc_book['voc'] + voc_book['other']).mul(1000)

voc_book['prop'].plot(
    kind = 'bar',
    title = f'Vocatives in the {work.title()}',
    ylabel = 'count per 1000 words',
    legend = False,
    rot = False,
    figsize = (10,4),
)

### by speaker

In [None]:
# token counts per speaker, split into vocative ('voc') and other tokens
voc_spkr = words.pivot_table(
    values = 'speech_id',
    index = 'spkr',
    columns = 'is_voc',
    aggfunc = 'count',
    fill_value = 0,
).rename(columns={False:'other', True:'voc'})

# vocatives per 1000 tokens spoken, to two decimal places
voc_spkr['prop'] = (voc_spkr['voc'] / (voc_spkr['voc'] + voc_spkr['other']) * 1000).round(2)

#### greatest number

In [None]:
# the ten speakers with the most vocative tokens
display(voc_spkr.sort_values(by='voc', ascending=False).head(10))

#### highest proportion

In [None]:
# the ten speakers with the highest rate of vocatives per 1000 tokens
display(voc_spkr.sort_values(by='prop', ascending=False).head(10))

#### highest proportion among speakers of at least 1000 words

In [None]:
# same ranking, restricted to speakers with at least 1000 tokens in total
display(voc_spkr[voc_spkr.voc + voc_spkr.other >= 1000].sort_values(by='prop', ascending=False).head(10))

### by part of speech

In [None]:
# token counts per part-of-speech tag, split into vocative and other tokens
voc_pos = words.pivot_table(
    values = 'speech_id',
    index = 'upos',
    columns = 'is_voc',
    aggfunc = 'count',
    fill_value = 0,
)
voc_pos = voc_pos.rename(columns={False:'other', True:'voc'})

# show the tags with the most vocatives first
voc_pos.sort_values(by='voc', ascending=False)