In [None]:
# load the DICES interface
from dicesapi import DicesAPI
from dicesapi.jupyter import NotebookPBar
from dicesapi.text import CtsAPI
import pandas as pd
import os
import pickle

In [None]:
# Open the DICES database client; the log file name is handed over as `logfile`.
dices_logfile = 'dices.log'
api = DicesAPI(logfile=dices_logfile)

# Open the CTS client; later cells use it to call cts.getPassage().
cts = CtsAPI()

In [None]:
# Location of the pickled speech list that is reused between runs.
# NOTE(review): no visible cell writes this file, so it is presumably
# produced elsewhere — confirm.
cache_file = os.path.join('data', 'iliad_parsed.pickle')

if not os.path.exists(cache_file):
    # no cache on disk: ask the DICES database for the speeches of the Iliad
    speeches = api.getSpeeches(work_title='Iliad')
    print('retrieved', len(speeches), 'results')
else:
    # cache present: restore the pickled speech list
    # (unpickling can run arbitrary code — only load a cache you created yourself)
    with open(cache_file, 'rb') as cache_handle:
        speeches = pickle.load(cache_handle)
    print('loaded', len(speeches), 'cached results')

In [None]:
# progress bar sized to one tick per speech
pbar = NotebookPBar(max=len(speeches))

for speech in speeches:
    # speeches that already carry a `passage` attribute are left untouched
    needs_passage = not hasattr(speech, 'passage')
    if needs_passage:
        speech.passage = cts.getPassage(speech)
    pbar.update()

In [None]:
# progress bar sized to one tick per speech
pbar = NotebookPBar(max=len(speeches))

for speech in speeches:
    # only run the pipeline on passages that do not yet expose a `cltk` attribute
    already_parsed = hasattr(speech.passage, 'cltk')
    if not already_parsed:
        speech.passage.runCltkPipeline(index=False)
    pbar.update()

In [None]:
def _word_rows(speeches):
    """Yield one record (dict) per token in `s.passage.cltk` for every speech.

    The speech-level values (id, book, speaker/addressee strings and genders)
    are identical for all tokens of a speech, so they are built once per
    speech — on its first token — and merged into every token record, instead
    of being recomputed for each token.
    """
    for s in speeches:
        speech_fields = None
        for w in s.passage.cltk:
            if speech_fields is None:
                speech_fields = dict(
                    speech_id = s.id,
                    # text before the first '.' of l_fi
                    # (presumably the book number of the first line — confirm)
                    book = s.l_fi.split('.')[0],
                    spkr = s.getSpkrString(),
                    addr = s.getAddrString(),
                    # distinct genders, sorted and comma-joined
                    gend_spkr = ','.join(sorted(set(inst.gender for inst in s.spkr))),
                    gend_addr = ','.join(sorted(set(inst.gender for inst in s.addr))),
                )
            yield dict(
                speech_fields,
                string = w.string,
                lemma = w.lemma,
                upos = w.upos,
                # stored as text so it can be searched with .str methods later
                features = str(w.features),
            )


# One row per token. Passing `columns` keeps the schema (and column order)
# even when no rows are produced, so later cells can still select columns.
words = pd.DataFrame(
    _word_rows(speeches),
    columns = [
        'speech_id', 'book', 'spkr', 'addr', 'gend_spkr', 'gend_addr',
        'string', 'lemma', 'upos', 'features',
    ],
)

display(words)

### Remove punctuation

In [None]:
# Drop punctuation: keep rows that are neither a literal '.' nor tagged PUNCT.
# .copy() makes the result an independent frame, so adding columns to `words`
# in later cells does not trigger pandas' SettingWithCopyWarning.
words = words[(words.string != '.') & (words.upos != 'PUNCT')].copy()

### Vocatives

In [None]:
# Flag tokens whose stringified features mention 'vocative'.
# regex=False: 'vocative' is matched as a literal substring, not as a pattern.
# assign() returns a new frame rather than writing a column into `words`,
# which at this point is a boolean-filtered selection of an earlier frame
# (in-place column assignment there raises SettingWithCopyWarning).
words = words.assign(
    is_voc = words['features'].str.contains('vocative', regex=False),
)

In [None]:
# show only the rows flagged as vocative
vocative_rows = words.loc[words['is_voc']]
display(vocative_rows)

### By book

In [None]:
# number of vocative tokens per book (sort=False: books are not re-sorted)
voc_counts_by_book = pd.pivot_table(
    words.loc[words['is_voc']],
    index='book',
    values='speech_id',
    aggfunc='count',
    sort=False,
    fill_value=0,
)

# bar chart of the raw counts
voc_counts_by_book.plot(
    kind='bar',
    title='Vocatives in the Iliad',
    legend=False,
    rot=False,
    figsize=(10, 4),
)

### Normalize by book length

In [None]:
# Count vocative vs. other tokens per book (sort=False: books are not re-sorted).
# reindex() guarantees that both the False and the True column exist even when
# one of the two classes never occurs, so the rename and the arithmetic below
# cannot fail with a KeyError; a missing class is counted as 0.
voc_book = words.pivot_table(
    index = 'book',
    values = 'speech_id',
    columns = 'is_voc',
    aggfunc = 'count',
    sort = False,
    fill_value = 0,
).reindex(
    columns = [False, True],
    fill_value = 0,
).rename(columns={True:'voc', False:'other'})

# vocatives per 1,000 tokens of the book
voc_book['prop'] = voc_book['voc'] / (voc_book['voc'] + voc_book['other']) * 1000

# The y-axis label distinguishes this normalized chart from the raw-count one.
voc_book['prop'].plot.bar(
    title = 'Vocatives in the Iliad',
    ylabel = 'vocatives per 1,000 words',
    legend = False,
    rot = False,
    figsize = (10,4),
)

## By speaker

In [None]:
# Count vocative vs. other tokens per speaker.
# reindex() guarantees that both the False and the True column exist even when
# one of the two classes never occurs, so 'voc' and 'other' are always
# available below; a missing class is counted as 0.
voc_spkr = words.pivot_table(
    index = 'spkr',
    values = 'speech_id',
    columns = 'is_voc',
    aggfunc = 'count',
    fill_value = 0,
).reindex(
    columns = [False, True],
    fill_value = 0,
)

voc_spkr = voc_spkr.rename(columns={True:'voc', False:'other'})

# vocatives per 1,000 tokens spoken, rounded to two decimals
voc_spkr['prop'] = round(voc_spkr['voc'] / (voc_spkr['voc'] + voc_spkr['other']) * 1000, 2)

#### Greatest number

In [None]:
# ten speakers with the most vocative tokens
by_voc_count = voc_spkr.sort_values('voc', ascending=False)
display(by_voc_count.head(10))

#### Highest proportion

In [None]:
# ten speakers with the highest vocative rate
by_voc_rate = voc_spkr.sort_values('prop', ascending=False)
display(by_voc_rate.head(10))

#### Highest proportion among speakers of at least 1000 words

In [None]:
# keep speakers whose total token count (other + vocative) exceeds 999,
# then show the ten with the highest vocative rate
total_words = voc_spkr['other'] + voc_spkr['voc']
major_speakers = voc_spkr.loc[total_words > 999]
display(major_speakers.sort_values('prop', ascending=False).head(10))

## By part of speech

In [None]:
# Count vocative vs. other tokens per part-of-speech tag (upos).
# reindex() guarantees that both the False and the True column exist even when
# one of the two classes never occurs, so sorting by 'voc' below cannot fail
# with a KeyError; a missing class is counted as 0.
voc_pos = words.pivot_table(
    index = 'upos',
    values = 'speech_id',
    columns = 'is_voc',
    aggfunc = 'count',
    fill_value = 0,
).reindex(
    columns = [False, True],
    fill_value = 0,
).rename(columns={True:'voc', False:'other'})

# tags with the most vocative tokens first
voc_pos.sort_values('voc', ascending=False)