# 1. General notes

### Local CTS server

The public Scaife CTS server from Perseus doesn't serve Quintus of Smyrna's *Posthomerica*. The text exists in the [canonical-greekLit](https://github.com/PerseusDL/canonical-greekLit) Git repo, but it isn't configured in a way that lets Nautilus serve it. I created my own [canonical-greekLit fork](https://github.com/cwf2/canonical-greekLit) and edited the Quintus files until Nautilus accepted them.

This notebook should come with cached data, so you don't need to reprocess the texts. If you do want to replicate everything from scratch, run the following commands in a terminal window to download the texts and start the CTS server locally (this assumes the `capitains-nautilus` command is already installed).

```bash
    git clone https://github.com/cwf2/canonical-greekLit
    capitains-nautilus canonical-greekLit --port 5000
```

### Odyssey variant reading

The DICES database has a speech by Circe to Odysseus beginning at Od. 10.456; but in the Perseus edition, 456 is missing and the speech begins at 457. I've manually changed the speech start line here to agree with Perseus, avoiding an error when we download the text.

# 2. Steps for processing the speeches

## Import statements

In [1]:
from dicesapi import DicesAPI
from dicesapi.jupyter import NotebookPBar
from dicesapi.text import CtsAPI
from IPython.display import display
from ipywidgets import interactive, widgets
import pandas as pd
import os
import pickle

In [2]:
from cltk.morphology.morphosyntax import Case

In [12]:
!pip install https://huggingface.co/chcaa/grc_odycy_joint_trf/resolve/main/grc_odycy_joint_trf-any-py3-none-any.whl

Collecting grc-odycy-joint-trf==any
  Downloading https://huggingface.co/chcaa/grc_odycy_joint_trf/resolve/main/grc_odycy_joint_trf-any-py3-none-any.whl (497.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 497.3/497.3 MB 4.6 MB/s eta 0:00:00
Collecting spacy<3.6.0,>=3.5.0
  Using cached spacy-3.5.4-cp310-cp310-macosx_10_9_x86_64.whl (6.9 MB)
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.2
    Uninstalling spacy-3.7.2:
      Successfully uninstalled spacy-3.7.2
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
la-core-web-lg 3.6.0 requires spacy<3.7.0,>=3.6.0, but you have spacy 3.5.4 which is incompatible.
grc-proiel-lg 3.6.0 requires spacy<3.7.0,>=3.6.0, but you have spacy 3.5.4 which is incompatible.
Successfully i

## Initialize connections to DICES and CTS server

In [3]:
# open a connection to the DICES database
api = DicesAPI(
    dices_api='http://localhost:8000/api',
    logfile='dices.log',
)

# open a connection to the digital library (CTS) servers;
# the public default would be 'https://scaife-cts.perseus.org/api/cts',
# but here the default (None) entry points at the local server instead
cts = CtsAPI(dices_api=api, servers={None: 'http://localhost:5000/cts'})

## Workflow

In [4]:
# the works to process, plus per-work containers keyed by these names
works = ['iliad', 'odyssey', 'posthomerica']
speeches, words = {}, {}

### Download the speech metadata

In [5]:
for work in works:
    print(f'Retrieving speeches for {work}')

    speeches[work] = api.getSpeeches(work_title=work.title())
    print('retrieved', len(speeches[work]), 'results')

    if work.title() == 'Odyssey':
        # kludge for a textual variant: the speech recorded as starting at
        # 10.456 is moved to 10.457 to agree with the Perseus edition
        for s in speeches[work]:
            if s.l_fi == '10.456':
                s.l_fi = '10.457'

        # second kludge: remove the apologia by keeping only those speeches
        # whose first and last lines fall in the same book
        speeches[work] = [
            s for s in speeches[work]
            if s.l_fi.split('.')[0] == s.l_la.split('.')[0]
        ]

Retrieving speeches for iliad
retrieved 698 results
Retrieving speeches for odyssey
retrieved 673 results
Retrieving speeches for posthomerica
retrieved 175 results


### Download the text of the speeches

In [6]:
for work in works:

    pbar = NotebookPBar(max=len(speeches[work]), prefix=f'{work}: ')

    for s in speeches[work]:
        # only fetch text for speeches that don't already carry a passage
        if getattr(s, 'passage', None) is None:
            s.passage = cts.getPassage(s)

            # report speeches whose text still could not be retrieved
            if s.passage is None:
                print(f'Text download failed: {s}')

        pbar.update()

HBox(children=(IntProgress(value=0, bar_style='info', max=698), Label(value='iliad: 0/698')))

HBox(children=(IntProgress(value=0, bar_style='info', max=671), Label(value='odyssey: 0/671')))

HBox(children=(IntProgress(value=0, bar_style='info', max=175), Label(value='posthomerica: 0/175')))

### Parse the text

In [7]:
for work in works:

    pbar = NotebookPBar(max=len(speeches[work]), prefix=f'{work}: ')

    for s in speeches[work]:
        pbar.update()

        # nothing to parse when the text download failed
        if getattr(s, 'passage', None) is None:
            print('no passage:', s)
            continue

        # run both NLP pipelines on the passage
        s.passage.runCltkPipeline()
        s.passage.runSpacyPipeline()

        # report any pipeline that left no parsed document behind
        if s.passage.cltk_doc is None:
            print(f'CLTK failed: {s}')
        if s.passage.spacy_doc is None:
            print(f'SpaCy failed: {s}')


HBox(children=(IntProgress(value=0, bar_style='info', max=698), Label(value='iliad: 0/698')))



HBox(children=(IntProgress(value=0, bar_style='info', max=671), Label(value='odyssey: 0/671')))

HBox(children=(IntProgress(value=0, bar_style='info', max=175), Label(value='posthomerica: 0/175')))

### Format tokens as a table

In [8]:
def makeTokenTable(speeches):
    '''Create a DataFrame with one row per token of the CLTK parse.

    Speeches without a downloaded text (no `passage`, or `passage` is None)
    and speeches without a CLTK parse (`passage.cltk_doc` is None) are
    skipped. Punctuation rows (token '.' or upos 'PUNCT') are dropped.

    Args:
        speeches: iterable of speech objects, as produced by the download
            and parse steps.

    Returns:
        DataFrame with speech-level columns (speech_id, book, line, l_ind,
        spkr, addr, gend_spkr, gend_addr, being_spkr, being_addr, disg,
        type) and token-level columns (token, lemma, upos, case, is_voc,
        features). The columns exist even when there are no rows.
    '''

    # fixed column list, so an empty result still has every column
    columns = [
        'speech_id', 'book', 'line', 'l_ind', 'spkr', 'addr',
        'gend_spkr', 'gend_addr', 'being_spkr', 'being_addr',
        'disg', 'type', 'token', 'lemma', 'upos', 'case', 'is_voc',
        'features',
    ]

    rows = []
    for s in speeches:
        # guard on the same attribute that is iterated below; a failed
        # download leaves passage as None, a failed parse leaves cltk_doc None
        passage = getattr(s, 'passage', None)
        doc = getattr(passage, 'cltk_doc', None)
        if doc is None:
            continue

        # values shared by every token of this speech, computed once
        speech_info = {
            'speech_id': s.id,
            'book': s.l_fi.split('.')[0],
            'spkr': s.getSpkrString(),
            'addr': s.getAddrString(),
            'gend_spkr': ';'.join(sorted(set(inst.gender for inst in s.spkr))),
            'gend_addr': ';'.join(sorted(set(inst.gender for inst in s.addr))),
            'being_spkr': ';'.join(sorted(set(inst.being for inst in s.spkr))),
            'being_addr': ';'.join(sorted(set(inst.being for inst in s.addr))),
            'disg': ';'.join([spkr.disg for spkr in s.spkr if spkr.disg is not None]),
            'type': ';'.join([t['type'] for t in s._attributes['tags']]),
        }

        for w in doc:
            # look the line up once per token
            line_index = passage.getLineIndex(w)
            row = dict(speech_info)
            row.update({
                'line': passage.line_array[line_index]['n'],
                'l_ind': line_index + 1,
                'token': w.string,
                'lemma': w.lemma,
                'upos': w.upos,
                'case': w.features[Case][0].name if Case in w.features.keys() else None,
                'is_voc': 'vocative' in str(w.features),
                'features': str(w.features),
            })
            rows.append(row)

    words = pd.DataFrame(rows, columns=columns)

    # filter out punctuation tokens
    words = words[(words.token != '.') & (words.upos != 'PUNCT')]

    return words

In [9]:
def makeSpacyTable(speeches):
    '''Create a DataFrame with one row per token of the spaCy parse.

    Speeches without a downloaded text (no `passage`, or `passage` is None)
    and speeches without a spaCy parse (`passage.spacy_doc` is None) are
    skipped. Punctuation rows (token '.' or upos 'PUNCT') are dropped.

    Args:
        speeches: iterable of speech objects, as produced by the download
            and parse steps.

    Returns:
        DataFrame with speech-level columns (speech_id, book, line, l_ind,
        spkr, addr, gend_spkr, gend_addr, being_spkr, being_addr, disg,
        type) and token-level columns (token, lemma, upos, case, is_voc,
        features). The columns exist even when there are no rows.
    '''

    # fixed column list, so an empty result still has every column
    columns = [
        'speech_id', 'book', 'line', 'l_ind', 'spkr', 'addr',
        'gend_spkr', 'gend_addr', 'being_spkr', 'being_addr',
        'disg', 'type', 'token', 'lemma', 'upos', 'case', 'is_voc',
        'features',
    ]

    rows = []
    for s in speeches:
        # a failed download leaves passage as None, a failed parse leaves
        # spacy_doc as None; neither can contribute rows
        passage = getattr(s, 'passage', None)
        doc = getattr(passage, 'spacy_doc', None)
        if doc is None:
            continue

        # values shared by every token of this speech, computed once
        speech_info = {
            'speech_id': s.id,
            'book': s.l_fi.split('.')[0],
            'spkr': s.getSpkrString(),
            'addr': s.getAddrString(),
            'gend_spkr': ';'.join(sorted(set(inst.gender for inst in s.spkr))),
            'gend_addr': ';'.join(sorted(set(inst.gender for inst in s.addr))),
            'being_spkr': ';'.join(sorted(set(inst.being for inst in s.spkr))),
            'being_addr': ';'.join(sorted(set(inst.being for inst in s.addr))),
            'disg': ';'.join([spkr.disg for spkr in s.spkr if spkr.disg is not None]),
            'type': ';'.join([t['type'] for t in s._attributes['tags']]),
        }

        for w in doc:
            # look up the line and the morphology once per token
            line_index = passage.getLineIndex(w)
            case = w.morph.to_dict().get('Case')
            row = dict(speech_info)
            row.update({
                'line': passage.line_array[line_index]['n'],
                'l_ind': line_index + 1,
                'token': w.text,
                'lemma': w.lemma_,
                'upos': w.pos_,
                'case': case,
                'is_voc': case == 'Voc',
                'features': str(w.morph),
            })
            rows.append(row)

    words = pd.DataFrame(rows, columns=columns)

    # filter out punctuation tokens
    words = words[(words.token != '.') & (words.upos != 'PUNCT')]

    return words

In [10]:
# one token table per work from each of the two NLP pipelines
cltk, spacy = {}, {}

for work in works:
    cltk[work] = makeTokenTable(speeches[work])
    spacy[work] = makeSpacyTable(speeches[work])

In [11]:
# write the Iliad token table from each pipeline to a CSV file in the
# working directory, leaving out the DataFrame index
cltk['iliad'].to_csv('iliad_cltk.csv', index=False)
spacy['iliad'].to_csv('iliad_spacy.csv', index=False)

# 3. Run the whole workflow

In [None]:
# make sure the cache/output directory exists up front, so that a fresh
# download and parse is not lost when the results are written
os.makedirs('data', exist_ok=True)

for work in works:
    print('Processing', work.title())
    
    # local file paths
    cache = os.path.join('data', f'{work}_speeches.pickle')
    output = os.path.join('data', f'{work}.csv')
    
    # use cached data if present
    # NOTE: unpickling can execute arbitrary code; only load caches you trust
    if os.path.exists(cache):
        with open(cache, 'rb') as f:
            speeches[work] = pickle.load(f)
        print('loaded', len(speeches[work]), 'cached results')
    else:    
        # NOTE(review): dlSpeechData, dlSpeechText and parseSpeechText are not
        # defined in the cells above; confirm they exist before running this
        # branch without a cache
        speeches[work] = dlSpeechData(work)
        dlSpeechText(speeches[work])
        parseSpeechText(speeches[work])
        with open(cache, 'wb') as f:
            pickle.dump(speeches[work], f)
        print('saved', len(speeches[work]), 'results to', cache)
    
    # generate tabular data
    words[work] = makeTokenTable(speeches[work])
    
    # save output
    print(f'Writing {output}')
    words[work].to_csv(output, index=False)

### inspect the table of values

In [None]:
# show the token table stored for the Iliad by the workflow cell
words['iliad']

# 4. Summary statistics

## helper functions

### Simple count of vocatives by book

In [None]:
def getTableByBook(work):
    '''Count the vocative tokens in each book of a work.

    Reads the token table stored under `work` in the global `words` dict,
    keeps the rows flagged `is_voc`, and counts them per `book`.
    '''
    tokens = words[work]
    vocatives = tokens[tokens.is_voc]

    # sort=False is passed through so pandas does not reorder the books
    return vocatives.pivot_table(
        index='book',
        values='speech_id',
        aggfunc='count',
        sort=False,
        fill_value=0,
    )

def getPlotByBook(work):
    '''Draw a bar chart of the vocative count in each book of a work.

    Args:
        work: key of the work in the global `words` dict, e.g. 'iliad'.

    Returns:
        The object returned by `DataFrame.plot.bar` for the per-book table.
    '''
    # the per-book helper is called getTableByBook; the previous name
    # (tableByBook) does not exist and raised a NameError
    df = getTableByBook(work)
    plot = df.plot.bar(
        title = f'Vocatives in the {work.title()}',
        legend = False,
        rot = 0,  # keep the tick labels horizontal
        ylabel = 'count',
        figsize = (10,4),
    )
    return plot

### Normalized for book length

In [None]:
def getTableByBookNorm(work):
    '''Count vocative and other tokens per book, with a normalized rate.

    Args:
        work: key of the work in the global `words` dict, e.g. 'iliad'.

    Returns:
        DataFrame indexed by book with columns 'voc' and 'other' (token
        counts) and 'prop' (vocatives per 1000 tokens).
    '''

    df = words[work].pivot_table(
        index = 'book',
        values = 'speech_id',
        columns = 'is_voc',
        aggfunc = 'count',
        sort = False,
        fill_value = 0,
    ).rename(columns={True:'voc', False:'other'})

    # if the work has no vocative tokens at all (or nothing else), the pivot
    # lacks that column entirely; add it as zeros so the rate is still defined
    for name in ('voc', 'other'):
        if name not in df.columns:
            df[name] = 0

    df['prop'] = df['voc'] / (df['voc'] + df['other']) * 1000

    return df

def getPlotByBookNorm(work):
    '''Draw a bar chart of vocatives per 1000 words in each book of a work.'''
    rates = getTableByBookNorm(work)['prop']

    return rates.plot.bar(
        title=f'Vocatives in the {work.title()}',
        legend=False,
        ylabel='count per 1000 words',
        rot=False,
        figsize=(10, 4),
    )

### by speaker

In [None]:
def getTableBySpeaker(work):
    '''Count vocative and other tokens per speaker, with a normalized rate.

    Args:
        work: key of the work in the global `words` dict, e.g. 'iliad'.

    Returns:
        DataFrame indexed by speaker string with columns 'voc' and 'other'
        (token counts) and 'prop' (vocatives per 1000 tokens, rounded to
        two decimals).
    '''
    
    df = words[work].pivot_table(
        index = 'spkr',
        values = 'speech_id',
        columns = 'is_voc',
        aggfunc = 'count',
        fill_value = 0,
    )

    df = df.rename(columns={True:'voc', False:'other'})

    # if the work has no vocative tokens at all (or nothing else), the pivot
    # lacks that column entirely; add it as zeros so the rate is still defined
    for name in ('voc', 'other'):
        if name not in df.columns:
            df[name] = 0

    df['prop'] = (df['voc'] / (df['voc'] + df['other']) * 1000).round(2)
    
    return df

### by part of speech

In [None]:
def getTableByPOS(work):
    '''Count vocative and other tokens per part of speech.

    Args:
        work: key of the work in the global `words` dict, e.g. 'iliad'.

    Returns:
        DataFrame indexed by `upos` with columns 'voc' and 'other' (token
        counts), sorted by 'voc' in descending order.
    '''

    df = words[work].pivot_table(
        index = 'upos',
        values = 'speech_id',
        columns = 'is_voc',
        aggfunc = 'count',
        fill_value = 0,
    ).rename(columns={True:'voc', False:'other'})

    # if the work has no vocative tokens at all (or nothing else), the pivot
    # lacks that column entirely; add it as zeros so sorting still works
    for name in ('voc', 'other'):
        if name not in df.columns:
            df[name] = 0

    # sort_values returns a new table; return it instead of discarding it
    return df.sort_values('voc', ascending=False)

## Display results

### Normalized vocatives by book

In [None]:
# interactive bar chart of vocatives per 1000 words by book,
# with `work` selected from the `works` list
display(interactive(getPlotByBookNorm, work=works))

### By Speaker

In [None]:
def view(work):
    '''Display the per-speaker vocative table for the selected work.'''
    display(getTableBySpeaker(work))

interactive(view, work=works)

#### greatest number

In [None]:
def view(work, n):
    '''Display the first n speakers when ranked by number of vocatives.'''
    ranked = getTableBySpeaker(work).sort_values('voc', ascending=False)
    display(ranked[:n])

display(interactive(
    view,
    work=works,
    n=widgets.IntSlider(min=10, max=100, step=5, value=10),
))

#### highest proportion

In [None]:
def view(work, n):
    '''Display the first n speakers when ranked by vocatives per 1000 words.'''
    ranked = getTableBySpeaker(work).sort_values('prop', ascending=False)
    display(ranked[:n])

display(interactive(
    view,
    work=works,
    n=widgets.IntSlider(min=10, max=100, step=5, value=10),
))

#### highest proportion among speakers of at least 1000 words

In [None]:
def view(work, min_words, n_results):
    '''Display the top speakers by vocative rate among the wordier speakers.

    Only speakers with at least `min_words` tokens in total are kept; the
    first `n_results` of them, ranked by 'prop' descending, are displayed.
    '''
    table = getTableBySpeaker(work)
    has_enough_words = (table.other + table.voc) >= min_words
    ranked = table[has_enough_words].sort_values('prop', ascending=False)
    display(ranked[:n_results])

# sliders: number of rows to show, and the minimum token count per speaker
display(interactive(
    view,
    work=works,
    n_results=widgets.IntSlider(min=10, max=100, step=5, value=10),
    min_words=widgets.IntSlider(min=100, max=2000, step=100, value=1000),
))