### Import statemtents

In addition to the usual DICES code, we're also importing **re** the regular expression module for Python. This provides more advanced functionality for searching and replacing text, and we'll use it to adjust the loci in Claudian.

In [1]:
from dicesapi import DicesAPI, SpeechGroup
from dicesapi.jupyter import NotebookPBar
from dicesapi.text import CtsAPI
import re
import pandas as pd

### Connections to DICES and Perseus

In [2]:
api = DicesAPI(
    logfile = 'dices.log', 
    progress_class = NotebookPBar,
)
cts = CtsAPI(
    dices_api = api,
)

### Download all of Claudian

In [3]:
speeches = api.getSpeeches(author_name='Claudian', progress=True)

HBox(children=(IntProgress(value=0, bar_style='info', max=112), Label(value='0/112')))

### Adjust line numbers for some Claudian texts (and Prudentius)

This is necessary to match the somewhat unorthodox format used by Perseus for loci in poems with a preface.

In [4]:
adj_book_line = [
    'De Raptu Proserpinae',
    'In Rufinum',
]
adj_line = [
    'Panegyricus de consulatu Manlii Theodori',
    'Panegyricus de Tertio Consulatu Honorii Augusti',
    'Panegyricus de Sexto Consulatu Honorii Augusti',
    'Epithalamium de Nuptiis Honorii Augusti',
    'De Bello Gothico',
    'Psychomachia',    
]

for s in speeches:
    if s.work.title in adj_book_line:
        m = re.fullmatch(r'(\d+)\.(\d+)', s.l_fi)
        if m:
            s.l_fi = f'{m.group(1)}.1.{m.group(2)}'

        m = re.fullmatch(r'(\d+)\.(\d+)', s.l_la)
        if m:
            s.l_la = f'{m.group(1)}.1.{m.group(2)}'

    elif s.work.title in adj_line:
        m = re.fullmatch(r'(\d+)', s.l_fi)
        if m:
            s.l_fi = '1.' + m.group(1)

        m = re.fullmatch(r'(\d+)', s.l_la)
        if m:
            s.l_la = '1.' + m.group(1)

### Download the text

In [5]:
pbar = NotebookPBar(max=len(speeches))

for i, s in enumerate(speeches):
    pbar.update()
    if not hasattr(s, 'passage') or s.passage is None:
        s.passage = cts.getPassage(s)
    if s.passage is None:
        print(f'Failed: {s.work.urn}\t{s.work.title}\t{s.l_range}')

HBox(children=(IntProgress(value=0, bar_style='info', max=112), Label(value='0/112')))

Failed: urn:cts:latinLit:stoa0089.stoa005.perseus-lat2	De Raptu Proserpinae	2.praef.33-2.praef.48


### NLP with SpaCy

In [6]:
pbar = NotebookPBar(max=len(speeches))

for i, s in enumerate(speeches):
    pbar.update()
    if hasattr(s, 'passage') and s.passage is not None:
        if not hasattr(s.passage, 'spacy_doc') or s.passage.spacy_doc is None:
            s.passage.runSpacyPipeline()
        if s.passage.spacy_doc is None:
            print(f'Failed: {s.work.urn}\t{s.work.title}\t{s.l_range}')

HBox(children=(IntProgress(value=0, bar_style='info', max=112), Label(value='0/112')))

### Drop speeches for which there is no text

In [7]:
dropped = speeches.advancedFilter(lambda s: s.passage is None)
print('dropped:')
for s in dropped:
    print(s)
    
speeches = speeches - dropped

dropped:
<Speech 4660: De Raptu Proserpinae 2.praef.33-2.praef.48>


### Make a table of tokens

In [8]:
rows = []

for s in speeches:
    for w in s.passage.spacy_doc:        
        morph = w.morph.to_dict()
    
        this_row = dict(
            speech_id = s.id,
            author = s.author.name,
            work = s.work.title,
            loci = s.l_range,
            spkr = [inst.name for inst in s.spkr],
            addr = [inst.name for inst in s.spkr],
            tags = [tag['type'] for tag in s._attributes['tags']],
            nlines = len(s.passage.line_array),      
            token = w.text,
            lemma = w.lemma_,
            line = s.passage.getLineIndex(w),
            pos = w.pos_,
            mood = morph.get('Mood', pd.NA),
            voice = morph.get('Voice', pd.NA),
            tense = morph.get('Tense', pd.NA),
            person = morph.get('Person', pd.NA),
            number = morph.get('Number', pd.NA),
            case = morph.get('Case', pd.NA),
            gender = morph.get('Gender', pd.NA),
        )
        
        rows.append(this_row)

tokens = pd.DataFrame(rows)
display(tokens)

Unnamed: 0,speech_id,author,work,loci,spkr,addr,tags,nlines,token,lemma,line,pos,mood,voice,tense,person,number,case,gender
0,4578,Claudian,De bello Gildonico,28-127,[Roma],[Roma],[und],100,Si,si,0,SCONJ,,,,,,,
1,4578,Claudian,De bello Gildonico,28-127,[Roma],[Roma],[und],100,mea,meus,0,ADJ,,,,,Sing,Nom,Fem
2,4578,Claudian,De bello Gildonico,28-127,[Roma],[Roma],[und],100,mansuris,mansura,0,NOUN,,,,,Sing,Gen,Neut
3,4578,Claudian,De bello Gildonico,28-127,[Roma],[Roma],[und],100,meruerunt,mereo,0,VERB,Ind,Act,Past,3,Plur,,
4,4578,Claudian,De bello Gildonico,28-127,[Roma],[Roma],[und],100,moenia,moenium,0,NOUN,,,,,Plur,Acc,Neut
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17554,4592,Claudian,Panegyricus Probino et Olybrio,236-262,[Tiberinus],[Tiberinus],"[que, lau, com]",27,dies,dies,26,NOUN,,,,,Plur,Acc,Fem
17555,4592,Claudian,Panegyricus Probino et Olybrio,236-262,[Tiberinus],[Tiberinus],"[que, lau, com]",27,semper,semper,26,ADV,,,,,,,
17556,4592,Claudian,Panegyricus Probino et Olybrio,236-262,[Tiberinus],[Tiberinus],"[que, lau, com]",27,dapibus,dapis,26,ADJ,,,,,Plur,Abl,Fem
17557,4592,Claudian,Panegyricus Probino et Olybrio,236-262,[Tiberinus],[Tiberinus],"[que, lau, com]",27,recoletur,recoletur,26,VERB,Sub,Pass,Pres,3,Plur,,


### Export to Excel

In [9]:
filename = 'claudian_tokens.csv'
tokens.to_csv(filename, index=False)