# Preliminaries

## File paths

In [None]:
# Output location: the directory for data files and the name of the CSV
# that the final cell writes the CLTK token table to.
data_dir, cltk_file = "data", "cltk_tokens.csv"

# Collect speeches

This executes all code in `1 - Collect Speeches.ipynb` and generates a DICES `SpeechGroup` called `test_speeches`, which we use below. To see in detail how the speeches are downloaded and pre-processed, you can run the first notebook separately.

In [None]:
# IPython magic: run the first notebook from inside this one. Per the note
# above, this is what provides `test_speeches`; presumably it also supplies
# the imports (`pd`, `os`) and the `DEBUG` flag used by later cells — TODO confirm.
%run "1 - Collect Speeches.ipynb"

# Run CLTK

In [None]:
# Parse every speech with CLTK, reusing any parse already attached to the
# passage, and collect the speeches for which no parse is available afterwards.
failed = []
n_speeches = len(test_speeches)

for i, s in enumerate(test_speeches):
    # progress: share of speeches finished before this one
    if i % 200 == 0:
        print(f'\r{round(i * 100 / n_speeches)} % complete', end='')

    if s.passage.cltk_doc is None:
        try:
            s.passage.runCltkPipeline()
        except Exception:
            # show which speech broke the pipeline, then let the error propagate
            if DEBUG:
                print(s)
                print(s.passage.text)
            raise
    # covers the case where the pipeline returned without raising
    # but still left no document on the passage
    if s.passage.cltk_doc is None:
        failed.append(s)

# Final progress report. Printed after the loop so it always reads 100 %
# (the in-loop percentage for the last index falls short on small inputs),
# and terminated with a newline so later output starts on its own line.
print('\r100 % complete')

if failed:
    print(f'CLTK failed for {len(failed)} speeches:')
    for s in failed:
        print(f' - {s.work.urn}\t{s.work.title}\t{s.l_range}')

In [None]:
# Helper function to extract CLTK features as strings

def getCltkFeature(token, feature, default=None):
    """Look up one morphological feature of a token by its string name.

    The keys of ``token.features`` are converted with ``str`` so that the
    bundle can be queried with a plain string such as ``'Mood'``.

    Args:
        token: object whose ``features`` attribute offers ``keys()`` and
            ``values()``.
        feature: name of the feature, compared against ``str(key)``.
        default: returned as-is when the feature is absent or its value
            is ``None``.

    Returns:
        A list with the ``str`` form of each value of the feature, or
        ``default``.
    """
    bundle = token.features
    by_name = {str(key): value for key, value in zip(bundle.keys(), bundle.values())}

    values = by_name.get(feature)
    if values is not None:
        return [str(value) for value in values]

    return default

In [None]:
def _resolve_line_n(passage, tok):
    """Return the line number ('N') of the line containing tok, or None.

    If the passage cannot place the token directly, its position in the
    CLTK word list is used instead: the first token takes the first line,
    the last token the last line, and any other token takes the line of its
    neighbours when both neighbours sit on the same line.
    """
    line_idx = passage.getLineIndex(tok)
    if line_idx is not None:
        return passage.line_array[line_idx]['N']

    tok_idx = passage.getCltkWordIndex(tok)
    if tok_idx == 0:
        return passage.line_array[0]['N']
    if tok_idx == len(passage.cltk_doc.words) - 1:
        return passage.line_array[-1]['N']

    left_line_idx = passage.getLineIndex(passage.cltk_doc[tok_idx - 1])
    right_line_idx = passage.getLineIndex(passage.cltk_doc[tok_idx + 1])
    # equal and non-None on the left implies non-None on the right
    if left_line_idx is not None and left_line_idx == right_line_idx:
        return passage.line_array[left_line_idx]['N']
    return None


# Build one record per CLTK token, carrying speech-level metadata alongside
# the token's lemma, part of speech and morphological features.
rows = []
skipped = 0
n_speeches = len(test_speeches)

# extract features
for i, s in enumerate(test_speeches):
    # progress: share of speeches finished before this one
    if i % 200 == 0:
        print(f'\r{round(i * 100 / n_speeches)} % complete', end='')

    passage = s.passage

    # speeches without a parse cannot be iterated; count them and move on
    # instead of failing with a TypeError on None
    if passage.cltk_doc is None:
        skipped += 1
        continue

    # process all tokens in speech
    for tok in passage.cltk_doc:
        line_n = _resolve_line_n(passage, tok)
        rows.append(dict(
            speech_id = s.id,
            lang = s.lang,
            author = s.author.name,
            work = s.work.title,
            urn = s.work.urn,
            l_fi = s.l_fi,
            l_la = s.l_la,
            nlines = len(passage.line_array),
            spkr = ','.join([inst.name for inst in s.spkr]),
            addr = ','.join([inst.name for inst in s.addr]),
            part = s.part,
            level = s.level,
            line_n = line_n,
            line_id = f'{s.work.urn}:{line_n}' if line_n is not None else None,
            token = tok.string,
            tok_id = f'{s.id}:{passage.getTextPos(tok)}',
            lemma = tok.lemma,
            pos = tok.upos,
            mood = getCltkFeature(tok, 'Mood'),
            tense = getCltkFeature(tok, 'Tense'),
            voice = getCltkFeature(tok, 'Voice'),
            aspect = getCltkFeature(tok, 'Aspect'),
            person = getCltkFeature(tok, 'Person'),
            number = getCltkFeature(tok, 'Number'),
            case = getCltkFeature(tok, 'Case'),
            gender = getCltkFeature(tok, 'Gender'),
            degree = getCltkFeature(tok, 'Degree'),
            verbform = getCltkFeature(tok, 'VerbForm'),
        ))

# Final progress report, printed after the loop so it always reads 100 %
# and ends the progress line with a newline.
print('\r100 % complete')
if skipped:
    print(f'Skipped {skipped} speeches with no CLTK parse')

cltk_tokens = pd.DataFrame(rows)

# simplify list cells: join each feature's list of values into one
# comma-separated string, leaving missing features as None
cols = ['mood', 'tense', 'voice', 'aspect', 'person', 'number', 'case', 'gender', 'degree', 'verbform']
cltk_tokens[cols] = cltk_tokens[cols].map(lambda x: None if x is None else ','.join(x))

### Fix NA line ids

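Tokens that could not be assigned to a line have no `line_id`. As a stopgap, the next cell gives them an ID built from the work's URN and the token itself. Consider dropping these rows instead?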

In [None]:
# Rows with no line id get a substitute id of the form "<urn>:<token>".
needs_id = cltk_tokens['line_id'].isna()
orphans = cltk_tokens.loc[needs_id, ['urn', 'token']]
cltk_tokens.loc[needs_id, 'line_id'] = orphans['urn'] + ':' + orphans['token']

### Save and display

In [None]:
# make sure the output directory exists: to_csv does not create
# missing parent directories and would raise otherwise
os.makedirs(data_dir, exist_ok=True)

# save the token table to disk, leaving out the row index
cltk_tokens.to_csv(os.path.join(data_dir, cltk_file), index=False)

# display results
display(cltk_tokens)