# Collect speeches

The cell below runs all the code in `1 - Collect Speeches.ipynb` and generates a DICES `SpeechGroup` called `test_speeches`, which the rest of this notebook uses. To see in detail how the speeches are downloaded and pre-processed, you can run the first notebook separately.

In [1]:
# Execute the first notebook from this kernel; the cells below rely on the
# `test_speeches` object it creates.
%run "1 - Collect Speeches.ipynb"

Checking for local text repositories...
 - data/canonical-greekLit exists!
 - data/canonical-latinLit exists!


2025-06-19 13:57:46,639 - [ERROR] data/canonical-greekLit/data/tlg0003/tlg001/tlg0003.tlg001.perseus-eng5.xml is not present
2025-06-19 13:57:46,719 - [ERROR] data/canonical-greekLit/data/tlg0060/tlg001/tlg0060.tlg001.perseus-grc3.xml does not accept parsing at some level (most probably citation) 
2025-06-19 13:57:46,775 - [ERROR] data/canonical-greekLit/data/tlg0719/tlg002/tlg0719.tlg002.perseus-eng2.xml is not present
2025-06-19 13:57:47,034 - [ERROR] data/canonical-greekLit/data/tlg0086/tlg035/tlg0086.tlg035.perseus-grc1.xml does not accept parsing at some level (most probably citation) 
2025-06-19 13:57:47,038 - [ERROR] data/canonical-greekLit/data/tlg0086/tlg035/tlg0086.tlg035.perseus-eng1.xml does not accept parsing at some level (most probably citation) 
2025-06-19 13:57:47,042 - [ERROR] data/canonical-greekLit/data/tlg0086/tlg029/tlg0086.tlg029.perseus-grc1.xml does not accept parsing at some level (most probably citation) 
2025-06-19 13:57:47,043 - [ERROR] data/canonical-greek

Corrected <Speech 1481: Argonautica 3.727-3.739> to <Speech 1481: Argonautica 3.727-3.738>
Corrected <Speech 931: Odyssey 10.456-10.465> to <Speech 931: Odyssey 10.457-10.465>
Corrected <Speech 4379: 4 To Hermes 526b-568> to <Speech 4379: 4 To Hermes 526-568>
Corrected <Speech 3584: Dionysiaca 40.542-40.569> to <Speech 3584: Dionysiaca 40.542-40.568>
Corrected <Speech 1884: Metamorphoses 1.545-1.546> to <Speech 1884: Metamorphoses 1.545-1.547>
Corrected <Speech 2025: Metamorphoses 4.793-4.803> to <Speech 2025: Metamorphoses 4.793-4.801>
Corrected <Speech 2341: Metamorphoses 14.383-14.385> to <Speech 2341: Metamorphoses 14.383-14.384>
Corrected <Speech 3085: Thebaid 4.832-4.850> to <Speech 3085: Thebaid 4.825-4.842>
100 % complete


# Run spaCy

In [2]:
# set up the spaCy models, one per language (Latin and Greek)
spacy_load(latin_model='la_core_web_lg', greek_model='grc_odycy_joint_trf')



In [3]:
# Run the spaCy pipeline on every speech and collect the speeches for which
# no spaCy doc is available afterwards.
failed = []

for i, s in enumerate(test_speeches):
    # redraw the progress indicator in place every 50 speeches and on the last one
    if (i % 50 == 0) or (i == len(test_speeches) - 1):
        print(f'\r{round(i * 100 /len(test_speeches))} % complete', end='')

    # passages that already carry a spaCy doc are not parsed again
    if s.passage.spacy_doc is None:
        s.passage.runSpacyPipeline()

    # still no doc after running the pipeline: record the speech as failed
    if s.passage.spacy_doc is None:
        failed.append(s)

# The progress indicator is printed with end='' and never terminated; emit the
# newline here so the failure report does not start on the progress line.
print()

if failed:
    print(f'SpaCy failed for {len(failed)} speeches:')
    for s in failed:
        print(f' - {s.work.urn}\t{s.work.title}\t{s.l_range}')

100 % complete

### Generate tabular data

In [43]:
# Flatten the spaCy parses into one record per token, then turn the records
# into a DataFrame. Speeches without a spaCy doc contribute no rows.
spacy_tokens = []

# output column -> name of the spaCy morphological feature it is read from
morph_features = dict(
    mood = 'Mood',
    tense = 'Tense',
    voice = 'Voice',
    person = 'Person',
    number = 'Number',
    case = 'Case',
    gender = 'Gender',
    verbform = 'VerbForm',
    degree = 'Degree',
    prontype = 'PronType',
)

# extract features
for i, s in enumerate(test_speeches):
    # progress
    if (i % 200 == 0) or (i == len(test_speeches) - 1):
        print(f'\r{round(i * 100 /len(test_speeches))} % complete', end='')

    # A passage whose spacy_doc is None cannot be iterated; skip the speech
    # instead of failing with TypeError in the token loop below.
    if s.passage.spacy_doc is None:
        continue

    # speech-level fields are the same for every token of the speech: build them once
    speech_fields = dict(
        speech_id = s.id,
        lang = s.lang,
        author = s.author.name,
        work = s.work.title,
        urn = s.work.urn,
        l_fi = s.l_fi,
        l_la = s.l_la,
        nlines = len(s.passage.line_array),
        spkr = ','.join([inst.name for inst in s.spkr]),
        addr = ','.join([inst.name for inst in s.addr]),
        part = s.part,
        level = s.level,
    )

    # FIXME: getTextPos() breaks if Latin marks dieresis with e.g. ë
    lastpos = 0

    # process all tokens in speech
    for tok in s.passage.spacy_doc:

        # FIXME: see above. When getTextPos() yields a falsy value, fall back
        # to the previous token's position plus one.
        # NOTE(review): `or` also replaces a genuine position of 0 by 1 —
        # confirm what getTextPos() returns for a token at the start of the text.
        textpos = s.passage.getTextPos(tok) or (lastpos + 1)
        lastpos = textpos

        # look the line index up once; None means no line could be assigned
        line_idx = s.passage.getLineIndex(tok)
        line_n = s.passage.line_array[line_idx]['N'] if line_idx is not None else None

        spacy_tokens.append(dict(
            **speech_fields,
            line_n = line_n,
            line_id = f'{s.work.urn}:{line_n}',
            token = tok.text,
            tok_id = f'{s.id}:{textpos}',
            lemma = tok.lemma_,
            pos = tok.pos_,
            # each morphological feature is stored as the list returned by morph.get()
            **{col: tok.morph.get(feat) for col, feat in morph_features.items()},
        ))

# convert to data frame
spacy_tokens = pd.DataFrame(spacy_tokens)

# simplify list cells: empty list -> None, otherwise comma-joined values
cols = list(morph_features)
spacy_tokens[cols] = spacy_tokens[cols].map(lambda x: None if len(x) == 0 else ','.join(x))

100 % complete

### Add Greek question marks

In [44]:
# Build one extra PUNCT row for every ";" found in the text of a non-Latin speech.
extra_rows = []

for s in test_speeches:
    # Latin speeches are left alone
    if s.lang != "latin":
        passage = s.passage

        for match in re.finditer("(;)", passage.text):
            char_pos = match.start()

            # Find the first entry of _line_index that lies beyond the match
            # offset; the entry just before it is the line of the match.
            # With no such entry the last line is used; index 0 is the floor.
            first_later = next(
                (k for k, start in enumerate(passage._line_index) if start > char_pos),
                len(passage._line_index),
            )
            l_idx = max(first_later - 1, 0)
            line_n = passage.line_array[l_idx]["N"]

            # same leading columns as the token records; no morphology columns
            extra_rows.append({
                "speech_id": s.id,
                "lang": s.lang,
                "author": s.author.name,
                "work": s.work.title,
                "urn": s.work.urn,
                "l_fi": s.l_fi,
                "l_la": s.l_la,
                "nlines": len(passage.line_array),
                "spkr": ','.join([inst.name for inst in s.spkr]),
                "addr": ','.join([inst.name for inst in s.addr]),
                "part": s.part,
                "level": s.level,
                "line_n": line_n,
                "line_id": f'{s.work.urn}:{line_n}',
                "token": ";",
                "tok_id": f'{s.id}:{char_pos}',
                "lemma": ";",
                "pos": "PUNCT",
            })

extra_rows = pd.DataFrame(extra_rows)

In [45]:
# Append the extra punctuation rows to the token table and order all rows by
# the two integer halves of tok_id (the part before ":" and the part after).
# NOTE: this cell reads and overwrites spacy_tokens, so running it a second
# time appends the extra rows again.
merged_tokens = pd.concat([spacy_tokens, extra_rows], ignore_index=True)

# split "left:right" and compare both halves numerically rather than as text
tok_id_parts = merged_tokens["tok_id"].str.split(":")
sort_keys = pd.DataFrame({
    "left": tok_id_parts.str[0].astype(int),
    "right": tok_id_parts.str[1].astype(int),
})
row_order = sort_keys.sort_values(by=["left", "right"]).index

# reorder the rows and renumber them from 0
spacy_tokens = merged_tokens.loc[row_order].reset_index(drop=True)

### Save and display

In [46]:
# save to temp file
spacy_tokens.to_csv("spacy_tokens.csv", index=False)

# display
display(spacy_tokens)

Unnamed: 0,speech_id,lang,author,work,urn,l_fi,l_la,nlines,spkr,addr,...,mood,tense,voice,person,number,case,gender,verbform,degree,prontype
0,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,Plur,Nom,Masc,,,
1,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
2,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
3,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,Plur,Nom,Masc,,,
4,1,greek,Homer,Iliad,urn:cts:greekLit:tlg0012.tlg001.perseus-grc2,1.17,1.21,5,Chryses,"Agamemnon,Greeks",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410805,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,,,,Plur,Abl,Neut,,,
410806,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,,,,Sing,Acc,Fem,,,
410807,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,Sub,Pres,Act,3,Sing,,,Fin,,
410808,4696,latin,Prudentius,Psychomachia,urn:cts:latinLit:stoa0238.stoa002.perseus-lat2,1.799,1.822,24,Fides,Virtues,...,,Past,Pass,,Sing,Nom,Fem,Part,,
