# Preliminaries

## Import statements and paths

In [None]:
import os

# Directory (relative path) that the token table is written into when it is
# saved at the end of this notebook.
data_dir = "data"
# File name of the CSV token table; joined with `data_dir` when saving.
spacy_file = "spacy_tokens.csv"

## Collect speeches

This executes all code in `1 - Collect Speeches.ipynb` and generates a DICES `SpeechGroup` called `test_speeches`, which we use below. To see in detail how the speeches are downloaded and pre-processed, you can run the first notebook separately.

In [None]:
# Execute the first notebook; the cells below rely on `test_speeches` being
# defined afterwards.
# NOTE(review): `pd`, `re` and `spacy_load` are used below without being
# imported in this notebook -- presumably they are also provided by the
# notebook run here; confirm.
%run "1 - Collect Speeches.ipynb"

# Run spaCy

In [None]:
# Set up the spaCy models: one model name for Latin, one for Greek.
spacy_load(latin_model="la_core_web_lg", greek_model="grc_odycy_joint_trf")

In [None]:
# Speeches that still have no spaCy doc after the pipeline was run on them.
failed = []
n_speeches = len(test_speeches)

for i, s in enumerate(test_speeches):
    # progress: rewrite the same output line every 50 speeches
    if i % 50 == 0:
        print(f'\r{round(i * 100 / n_speeches)} % complete', end='')

    # only run the pipeline for speeches that have no doc yet
    if s.passage.spacy_doc is None:
        s.passage.runSpacyPipeline()
    # still no doc after the attempt: record the speech as failed
    if s.passage.spacy_doc is None:
        failed.append(s)

# Finish the '\r' progress line with an explicit 100 % and a newline, so that
# the failure report below starts on a line of its own.
if n_speeches:
    print('\r100 % complete')

if failed:
    print(f'SpaCy failed for {len(failed)} speeches:')
    for s in failed:
        print(f' - {s.work.urn}\t{s.work.title}\t{s.l_range}')

### Generate tabular data

In [None]:
# One dict per token; converted to a data frame once all speeches are done.
spacy_tokens = []
n_speeches = len(test_speeches)

# output column -> name of the spaCy morphological feature it is read from
morph_features = {
    'mood': 'Mood',
    'tense': 'Tense',
    'voice': 'Voice',
    'person': 'Person',
    'number': 'Number',
    'case': 'Case',
    'gender': 'Gender',
    'verbform': 'VerbForm',
    'degree': 'Degree',
    'prontype': 'PronType',
}

# extract features
for i, s in enumerate(test_speeches):
    # progress
    if i % 200 == 0:
        print(f'\r{round(i * 100 / n_speeches)} % complete', end='')

    passage = s.passage

    # A speech without a spaCy doc has no tokens to tabulate; iterating over
    # None would raise a TypeError, so skip it.
    if passage.spacy_doc is None:
        continue

    # speech-level values are the same for every token of the speech,
    # so build them once per speech instead of once per token
    speech_fields = dict(
        speech_id = s.id,
        lang = s.lang,
        author = s.author.name,
        work = s.work.title,
        urn = s.work.urn,
        l_fi = s.l_fi,
        l_la = s.l_la,
        nlines = len(passage.line_array),
        spkr = ','.join([inst.name for inst in s.spkr]),
        addr = ','.join([inst.name for inst in s.addr]),
        part = s.part,
        level = s.level,
    )

    # FIXME: getTextPos() breaks if Latin marks dieresis with e.g. ë
    lastpos = 0

    # process all tokens in speech
    for tok in passage.spacy_doc:

        # FIXME: see above. Fall back to "one past the previous token" only
        # when no position was found; a plain `or` would also discard a
        # valid position of 0.
        # NOTE(review): assumes getTextPos() signals failure with None -- confirm.
        textpos = passage.getTextPos(tok)
        if textpos is None:
            textpos = lastpos + 1
        lastpos = textpos

        # look the line index up once; None means no line could be assigned
        line_idx = passage.getLineIndex(tok)
        line_n = passage.line_array[line_idx]['N'] if line_idx is not None else None

        spacy_tokens.append(dict(
            **speech_fields,
            line_n = line_n,
            line_id = f'{s.work.urn}:{line_n}',
            token = tok.text,
            tok_id = f'{s.id}:{textpos}',
            lemma = tok.lemma_,
            pos = tok.pos_,
            **{col: tok.morph.get(feat) for col, feat in morph_features.items()},
        ))

# Finish the '\r' progress line so later output starts on a fresh line.
if n_speeches:
    print('\r100 % complete')

# convert to data frame
spacy_tokens = pd.DataFrame(spacy_tokens)

# simplify list cells: empty list -> None, otherwise comma-joined values
cols = list(morph_features)
spacy_tokens[cols] = spacy_tokens[cols].map(lambda x: None if len(x) == 0 else ','.join(x))

### Add Greek question marks

In [None]:
# One PUNCT row for every ";" found in the text of a non-Latin speech.
extra_rows = []

for speech in test_speeches:
    if speech.lang != "latin":
        for hit in re.finditer("(;)", speech.passage.text):
            # Find the last line whose start offset is not beyond the match;
            # stays at the first line if even that one starts after it.
            line_pos = 0
            for candidate, line_start in enumerate(speech.passage._line_index):
                if line_start > hit.start():
                    break
                line_pos = candidate
            line_no = speech.passage.line_array[line_pos]["N"]

            extra_rows.append({
                "speech_id": speech.id,
                "lang": speech.lang,
                "author": speech.author.name,
                "work": speech.work.title,
                "urn": speech.work.urn,
                "l_fi": speech.l_fi,
                "l_la": speech.l_la,
                "nlines": len(speech.passage.line_array),
                "spkr": ','.join([inst.name for inst in speech.spkr]),
                "addr": ','.join([inst.name for inst in speech.addr]),
                "part": speech.part,
                "level": speech.level,
                "line_n": line_no,
                "line_id": f'{speech.work.urn}:{line_no}',
                "token": ";",
                "tok_id": f'{speech.id}:{hit.start()}',
                "lemma": ";",
                "pos": "PUNCT",
            })

extra_rows = pd.DataFrame(extra_rows)

In [None]:
# Append the extra punctuation rows to the token table, then order all rows
# numerically by the two ":"-separated parts of `tok_id`.
combined = pd.concat([spacy_tokens, extra_rows], ignore_index=True)

# helper sort keys: integer value of each half of the id
id_parts = combined["tok_id"].str.split(":")
combined["left"] = id_parts.str[0].astype(int)
combined["right"] = id_parts.str[1].astype(int)

combined = combined.sort_values(by=["left", "right"])
# the helper columns are only needed for sorting
combined = combined.drop(columns=["left", "right"])
spacy_tokens = combined.reset_index(drop=True)

### Save and display

In [None]:
# save to temp file; create the output directory first so that to_csv()
# does not fail when it does not exist yet
os.makedirs(data_dir, exist_ok=True)
spacy_tokens.to_csv(os.path.join(data_dir, spacy_file), index=False)

# display
display(spacy_tokens)