# Preliminaries

## File paths

In [None]:
# Relative directory that the token table is written into at the end of this notebook
data_dir = "data"
# File name (inside data_dir) of the CSV holding the CLTK token table
cltk_file = "cltk_tokens.csv"

# Collect speeches

This executes all code in `1 - Collect Speeches.ipynb` and generates a DICES `SpeechGroup` called `test_speeches`, which we use below. To see in detail how the speeches are downloaded and pre-processed, you can run the first notebook separately.

In [1]:
# Executes the first notebook so that `test_speeches` is available below.
# NOTE(review): later cells also use DEBUG, pd, os and display, none of which are
# defined in this notebook — presumably they come from the notebook run here; confirm.
%run "1 - Collect Speeches.ipynb"

Checking for local text repositories...
 - data/canonical-greekLit exists!
 - data/canonical-latinLit exists!


2025-06-19 14:03:07,710 - [ERROR] data/canonical-greekLit/data/tlg0003/tlg001/tlg0003.tlg001.perseus-eng5.xml is not present
2025-06-19 14:03:07,799 - [ERROR] data/canonical-greekLit/data/tlg0060/tlg001/tlg0060.tlg001.perseus-grc3.xml does not accept parsing at some level (most probably citation) 
2025-06-19 14:03:07,867 - [ERROR] data/canonical-greekLit/data/tlg0719/tlg002/tlg0719.tlg002.perseus-eng2.xml is not present
2025-06-19 14:03:08,170 - [ERROR] data/canonical-greekLit/data/tlg0086/tlg035/tlg0086.tlg035.perseus-grc1.xml does not accept parsing at some level (most probably citation) 
2025-06-19 14:03:08,175 - [ERROR] data/canonical-greekLit/data/tlg0086/tlg035/tlg0086.tlg035.perseus-eng1.xml does not accept parsing at some level (most probably citation) 
2025-06-19 14:03:08,180 - [ERROR] data/canonical-greekLit/data/tlg0086/tlg029/tlg0086.tlg029.perseus-grc1.xml does not accept parsing at some level (most probably citation) 
2025-06-19 14:03:08,181 - [ERROR] data/canonical-greek

Corrected <Speech 1481: Argonautica 3.727-3.739> to <Speech 1481: Argonautica 3.727-3.738>
Corrected <Speech 931: Odyssey 10.456-10.465> to <Speech 931: Odyssey 10.457-10.465>
Corrected <Speech 4379: 4 To Hermes 526b-568> to <Speech 4379: 4 To Hermes 526-568>
Corrected <Speech 3584: Dionysiaca 40.542-40.569> to <Speech 3584: Dionysiaca 40.542-40.568>
Corrected <Speech 1884: Metamorphoses 1.545-1.546> to <Speech 1884: Metamorphoses 1.545-1.547>
Corrected <Speech 2025: Metamorphoses 4.793-4.803> to <Speech 2025: Metamorphoses 4.793-4.801>
Corrected <Speech 2341: Metamorphoses 14.383-14.385> to <Speech 2341: Metamorphoses 14.383-14.384>
Corrected <Speech 3085: Thebaid 4.832-4.850> to <Speech 3085: Thebaid 4.825-4.842>
100 % complete


# Run CLTK

In [2]:
# Run the CLTK pipeline on every speech that has no parsed doc yet, and collect
# the speeches that still have none afterwards.
failed = []
n_speeches = len(test_speeches)

for i, s in enumerate(test_speeches):
    # progress: rewrite the same output line every 200 speeches and on the last one
    if (i % 200 == 0) or (i == n_speeches - 1):
        print(f'\r{round(i * 100 / n_speeches)} % complete', end='')

    if s.passage.cltk_doc is None:
        try:
            s.passage.runCltkPipeline()
        except Exception:
            # Show which speech broke the pipeline, then let the error propagate.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts
            # from being treated as pipeline failures.
            if DEBUG:
                print(s)
                print(s.passage.text)
            raise

    # covers a pipeline call that returned without raising but left no doc
    if s.passage.cltk_doc is None:
        failed.append(s)

if failed:
    # the progress line was printed with end='', so terminate it before the report
    print()
    print(f'CLTK failed for {len(failed)} speeches:')
    for s in failed:
        print(f' - {s.work.urn}\t{s.work.title}\t{s.l_range}')

0 % completeCLTK message: This part of the CLTK depends upon a spaCy NLP mode.
CLTK message: Allow download of spaCy model ``grc_odycy_joint_sm`` from ``https://huggingface.co/chcaa/grc_odycy_joint_sm/resolve/main/grc_odycy_joint_sm-any-py3-none-any.whl``? [Y/n] 


 


Collecting grc_odycy_joint_sm@ https://huggingface.co/chcaa/grc_odycy_joint_sm/resolve/main/grc_odycy_joint_sm-any-py3-none-any.whl
  Downloading https://huggingface.co/chcaa/grc_odycy_joint_sm/resolve/main/grc_odycy_joint_sm-any-py3-none-any.whl (19.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.0/19.0 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: grc_odycy_joint_sm
Successfully installed grc_odycy_joint_sm-0.7.0



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Unrecognized UD feature 'Variant' with value 'Greek'.
If you believe this is not an error in the dependency parser, please raise an issue at <https://github.com/cltk/cltk/issues> and include a short text to reproduce the error.

Unrecognized UD feature 'Compound' with value 'Yes'.
If you believe this is not an error in the dependency parser, please raise an issue at <https://github.com/cltk/cltk/issues> and include a short text to reproduce the error.

Unrecognized UD feature 'Compound' with value 'Yes'.
If you believe this is not an error in the dependency parser, please raise an issue at <https://github.com/cltk/cltk/issues> and include a short text to reproduce the error.

Unrecognized UD feature 'Compound' with value 'Yes'.
If you believe this is not an error in the dependency parser, please raise an issue at <https://github.com/cltk/cltk/issues> and include a short text to reproduce the error.

Unrecognized UD feature 'Compound' with value 'Yes'.
If you believe this is not an erro

In [3]:
# Helper function to extract CLTK features as strings

def getCltkFeature(token, feature, default=None):
    '''Return the values of one feature of `token`, looked up by feature name.

    The keys of `token.features` are compared through their `str()` form, so
    `feature` is given as a plain string such as 'Mood'.

    Returns a list with the `str()` of each value, or `default` when the
    feature is absent (or stored as None).
    '''
    bundle = token.features
    by_name = {str(key): vals for key, vals in zip(bundle.keys(), bundle.values())}
    values = by_name.get(feature)

    return default if values is None else [str(v) for v in values]

In [4]:
def _getLineN(passage, tok):
    '''Return the 'N' value of the line that `tok` belongs to, or None.

    Uses `passage.getLineIndex(tok)` when it yields an index. Otherwise the
    first / last word of the doc is assigned to the first / last line, and any
    other word takes its neighbours' line when both neighbours resolve to the
    same line index.
    '''
    line_idx = passage.getLineIndex(tok)
    if line_idx is not None:
        return passage.line_array[line_idx]['N']

    tok_idx = passage.getCltkWordIndex(tok)
    if tok_idx == 0:
        return passage.line_array[0]['N']
    if tok_idx == len(passage.cltk_doc.words) - 1:
        return passage.line_array[-1]['N']

    # interior token: trust the neighbours only if they agree on the line
    left_line_idx = passage.getLineIndex(passage.cltk_doc[tok_idx - 1])
    right_line_idx = passage.getLineIndex(passage.cltk_doc[tok_idx + 1])
    if (left_line_idx is not None) and (left_line_idx == right_line_idx):
        return passage.line_array[left_line_idx]['N']
    return None


# one record (dict) per token; turned into a DataFrame once at the end
token_records = []
n_speeches = len(test_speeches)

# extract features
for i, s in enumerate(test_speeches):
    # progress
    if (i % 200 == 0) or (i == n_speeches - 1):
        print(f'\r{round(i * 100 / n_speeches)} % complete', end='')

    # a speech without a CLTK doc has no tokens to extract; iterating None would crash
    if s.passage.cltk_doc is None:
        continue

    # speech-level columns are identical for every token of the speech: build them once
    speech_fields = dict(
        speech_id = s.id,
        lang = s.lang,
        author = s.author.name,
        work = s.work.title,
        urn = s.work.urn,
        l_fi = s.l_fi,
        l_la = s.l_la,
        nlines = len(s.passage.line_array),
        spkr = ','.join([inst.name for inst in s.spkr]),
        addr = ','.join([inst.name for inst in s.addr]),
        part = s.part,
        level = s.level,
    )

    # process all tokens in speech
    for tok in s.passage.cltk_doc:
        line_n = _getLineN(s.passage, tok)
        token_records.append(dict(
            **speech_fields,
            line_n = line_n,
            line_id = f'{s.work.urn}:{line_n}' if line_n is not None else None,
            token = tok.string,
            tok_id = f'{s.id}:{s.passage.getTextPos(tok)}',
            lemma = tok.lemma,
            pos = tok.upos,
            mood = getCltkFeature(tok, 'Mood'),
            tense = getCltkFeature(tok, 'Tense'),
            voice = getCltkFeature(tok, 'Voice'),
            aspect = getCltkFeature(tok, 'Aspect'),
            person = getCltkFeature(tok, 'Person'),
            number = getCltkFeature(tok, 'Number'),
            case = getCltkFeature(tok, 'Case'),
            gender = getCltkFeature(tok, 'Gender'),
            degree = getCltkFeature(tok, 'Degree'),
            verbform = getCltkFeature(tok, 'VerbForm'),
        ))

cltk_tokens = pd.DataFrame(token_records)

# simplify list cells: each feature is a list of strings (or None); store it as
# a single comma-separated string instead
cols = ['mood', 'tense', 'voice', 'aspect', 'person', 'number', 'case', 'gender', 'degree', 'verbform']
cltk_tokens[cols] = cltk_tokens[cols].map(lambda x: None if x is None else ','.join(x))

100 % complete

### Fix NA line ids

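Tokens whose line number could not be determined have no `line_id`. As a stopgap, the cell below gives them a placeholder id built from the work URN and the token text. **TODO:** consider dropping these tokens instead.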

In [None]:
# rows that were left without a line id
mask = cltk_tokens['line_id'].isna()

# placeholder id for those rows: "<urn>:<token>"
unplaced = cltk_tokens.loc[mask, ['urn', 'token']]
cltk_tokens.loc[mask, 'line_id'] = unplaced['urn'] + ':' + unplaced['token']

### Save and display

In [None]:
# write the token table to a temporary CSV file, without the DataFrame index
csv_path = os.path.join(data_dir, cltk_file)
cltk_tokens.to_csv(csv_path, index=False)

# show the resulting table in the notebook
display(cltk_tokens)