# Preliminaries

### Import statements

In [None]:
# utils
import os
import re
import json

# DICES packages
from dicesapi import DicesAPI, SpeechGroup
from dicesapi.text import CtsAPI, spacy_load
import dicesapi.text

# for working with local CTS repositories
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver
from MyCapytain.resources.prototypes.metadata import UnknownCollection

# for analysis
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

### Function definitions

In [None]:
def highlightKeywordsFullCltk(speech, token=None, lemma=None):
    '''Render every CLTK token of a speech, highlighting the matching ones.

    A token matches when it satisfies each criterion that was supplied:
    its string equals `token` (if given) and its lemma equals `lemma`
    (if given). With neither supplied, every token matches.

    Returns the token strings joined by single spaces, with each match
    wrapped in a red, bold <span>.
    '''

    def is_hit(word):
        # a criterion left as None places no constraint on the token
        token_ok = token is None or word.string == token
        lemma_ok = lemma is None or word.lemma == lemma
        return token_ok and lemma_ok

    return ' '.join(
        f'<span style="color:red;font-weight:bold">{word.string}</span>'
        if is_hit(word) else word.string
        for word in speech.passage.cltk_doc
    )

def highlightKeywordsCltk(speech, token=None, lemma=None):
    '''Generate an HTML table of the lines of `speech` that contain matches.

    A CLTK token matches when it satisfies each criterion that was supplied:
    its string equals `token` (if given) and its lemma equals `lemma`
    (if given). Every line holding at least one match becomes a table row
    of (line number, line text), with the matches wrapped in red, bold spans.

    If the passage has no line array, if a matching token cannot be mapped
    to a line, or if the in-line substitution fails, a message is printed
    and the result of highlightKeywordsFullCltk() is returned instead.
    '''

    passage = speech.passage

    if passage.line_array is None:
        print(f'no line array: {speech}')
        return highlightKeywordsFullCltk(speech, token, lemma)

    # matching tokens grouped by line index, in order of first appearance
    hl_by_line = {}

    for tok in passage.cltk_doc:
        if (token is not None) and (tok.string != token):
            continue
        if (lemma is not None) and (tok.lemma != lemma):
            continue
        try:
            l_idx = passage.getLineIndex(tok)
        except Exception:
            l_idx = None
        if l_idx is None:
            print(f'cannot get line index: {speech}')
            return highlightKeywordsFullCltk(speech, token, lemma)
        hl_by_line.setdefault(l_idx, []).append(tok)

    rows = []

    for l_idx, line_toks in hl_by_line.items():
        l_loc = passage.line_array[l_idx]['n']
        l_string = passage.line_array[l_idx]['text']
        try:
            # substitute from the last collected token to the first, so that
            # inserting markup does not shift the offsets still to be used
            # (assumes tokens were collected in ascending position order)
            for tok in reversed(line_toks):
                l_pos = passage.getLinePos(tok)
                head = l_string[:l_pos]
                tail = l_string[l_pos+len(tok.string):]
                tok_string = f'<span style="color:red;font-weight:bold">{tok.string}</span>'
                l_string = head + tok_string + tail
        except Exception:
            print(f'highlighting failed: {speech}')
            return highlightKeywordsFullCltk(speech, token, lemma)

        rows.append(f'<tr><td>{l_loc}</td><td>{l_string}</td></tr>')

    html = '<table>' + ''.join(rows) + '</table>'
    return html

### Set up local text repositories

Here we clone Christopher's fork of the Perseus Greek and Latin texts, so that we can use a local CTS resolver instead of querying the Perseus server.

In [None]:
# GitPython: provides git.Repo.clone_from, used below. Imported here because
# the notebook's import cell does not bring `git` into scope.
import git

# names of the text repositories to mirror under ./data
repo_names = ['canonical-greekLit', 'canonical-latinLit']

print('Checking for local text repositories...')

for repo in repo_names:
    local_dir = os.path.join('data', repo)
    remote_url = f'https://github.com/cwf2/{repo}.git'

    # clone only when there is no local copy yet; an existing directory is
    # used as-is (it is not updated)
    if os.path.exists(local_dir):
        print(f' - {local_dir} exists!')
    else:
        print(f' - retrieving {remote_url}')
        git.Repo.clone_from(remote_url, local_dir)

### Connection to DICES

In [None]:
# Create the DICES API client, pointed at the endpoint below. The speech
# records used throughout the notebook are downloaded through this object.
# NOTE(review): `logdetail = 0` presumably selects the least verbose logging — confirm against dicesapi.
api = DicesAPI(
    dices_api = 'https://fierce-ravine-99183-425639eee484.herokuapp.com/api/',
    logdetail = 0)

### Set up local CTS connection

This is the CTS API, allowing us to retrieve texts by URN. In this example, we not only instantiate a default CTS API, but also create a local resolver that can serve texts from the local repositories we cloned in the section above.

We have to do a little surgery to overwrite the default CTS API object's resolver with the local one.

<div class="alert alert-warning" style="margin:1em 2em">
    <p><strong>Note:</strong> The resolver will generate a lot of errors; these can be ignored unless they pertain to a text you want to retrieve.</p>
</div>

In [None]:
# paths to the local repos: one directory under ./data per entry in repo_names
repo_paths = [os.path.join('data', repo) for repo in repo_names]

# create a local resolver that reads texts from those directories
local_resolver = CtsCapitainsLocalResolver(repo_paths)

# initialize the CTS API, linked to the DICES connection
cts = CtsAPI(dices_api = api)

# overwrite the default resolver: replace the whole resolver table with a
# single entry, keyed by None, that points at the local resolver
# NOTE(review): `_resolvers` is a private attribute of CtsAPI; this may break if dicesapi changes its internals
cts._resolvers = {None: local_resolver}

# Data

### Download the entire DICES dataset

We'll start by downloading records for all the speeches in DICES. Then we can select the mother speeches locally.

In [None]:
all_speeches = api.getSpeeches()

#### ⚠️ Workaround for certain Perseus texts

These texts have an extra hierarchical level inserted into their loci on Perseus' CTS server. This is a temporary workaround to convert our loci to a form that the server understands.

Because `all_speeches` and `mother_speeches` just contain pointers to the same object pool, we can do this modification once on `all_speeches` and the mother speeches will also be affected.

In [None]:
# works whose "book.line" loci are rewritten as "book.1.line"
adj_book_line = [
    'De Raptu Proserpinae',
    'In Rufinum',
]
# works whose bare "line" loci are rewritten as "1.line"
adj_line = [
    'Panegyricus de consulatu Manlii Theodori',
    'Panegyricus de Tertio Consulatu Honorii Augusti',
    'Panegyricus de Sexto Consulatu Honorii Augusti',
    'Epithalamium de Nuptiis Honorii Augusti',
    'De Bello Gothico',
    'Psychomachia',
]

for s in all_speeches:
    # choose the locus pattern and its rewrite template from the work title
    if s.work.title in adj_book_line:
        pattern, template = r'(\d+)\.(\d+)', r'\1.1.\2'
    elif s.work.title in adj_line:
        pattern, template = r'(\d+)', r'1.\1'
    else:
        continue

    # the first and last locus get the same treatment; a locus that does
    # not match the expected shape exactly is left untouched
    for attr in ('l_fi', 'l_la'):
        m = re.fullmatch(pattern, getattr(s, attr))
        if m:
            setattr(s, attr, m.expand(template))

In [None]:
# Replace the loci of individual speeches with the values listed in the
# errata file, so that they agree with the Perseus editions.

errata_file = os.path.join('data', 'changed_loci.txt')
errata_table = pd.read_csv(errata_file, sep='\t', dtype=str)

# lookup: "author work old_first-old_last" -> (new_first, new_last)
errata = {
    f'{row.author} {row.work} {row.l_fi_old}-{row.l_la_old}': (row.l_fi_new, row.l_la_new)
    for row in errata_table.itertuples()
}

for s in all_speeches:
    key = f'{s.author.name} {s.work.title} {s.l_range}'
    if key not in errata:
        continue
    # print the speech before and after the change on a single line
    print(f'Corrected {s}', end=' ')
    s.l_fi, s.l_la = errata[key]
    print(f'to {s}')
        

## Get the text

Because we're retrieving the texts from a local repository, I've turned off caching to save memory.

In [None]:
# Attach a passage (the text) to every speech, collecting the speeches for
# which no passage could be retrieved.
failed = []
n_speeches = len(all_speeches)

for i, s in enumerate(all_speeches):
    # refresh the progress line every 200 speeches and on the last one
    if (i % 200 == 0) or (i == n_speeches - 1):
        print(f'\r{round(i * 100 /n_speeches)} % complete', end='')
    # retrieve only when the speech has no passage yet, so the cell can be
    # re-run without repeating work
    if getattr(s, 'passage', None) is None:
        try:
            s.passage = cts.getPassage(s, cache=False)
        except Exception:
            # best effort: a speech whose text cannot be retrieved is
            # recorded as failed rather than stopping the loop. Catching
            # Exception (not a bare except) lets a kernel interrupt through.
            s.passage = None
    if s.passage is None:
        failed.append(s)

print()
print (f'{len(failed)} failed:')
for s in failed:
    print(f'\t{s.author.name} {s.work.title} {s.l_range}')

### Add supplementary text for speeches not in Perseus

In [None]:
# Build passages by hand for speeches whose text is supplied in a local
# JSON file. Each record carries a speech `id` and a `line_array`.
path = os.path.join('data', 'supp_mother_speeches.txt')

# explicit encoding: the platform default is not guaranteed to be UTF-8
with open(path, encoding='utf-8') as f:
    supplement = json.load(f)

# index the speeches by id once; setdefault keeps the first speech seen for
# an id, matching a front-to-back search of all_speeches
speech_by_id = {}
for s in all_speeches:
    speech_by_id.setdefault(s.id, s)

for rec in supplement:
    s = speech_by_id.get(rec['id'])
    if s is None:
        # no speech with this id: nothing to attach the text to
        continue
    print(s)

    passage = dicesapi.text.Passage()
    s.passage = passage
    passage.line_array = rec['line_array']
    line_texts = [line['text'] for line in passage.line_array]

    # _line_index[k] is the offset of line k within the joined text; the +1
    # accounts for the single space inserted between lines
    passage._line_index = []
    cumsum = 0
    for text in line_texts:
        passage._line_index.append(cumsum)
        cumsum += len(text) + 1

    passage.text = ' '.join(line_texts)
    passage.speech = s

### Remove speeches with no text available

In [None]:
test_speeches = all_speeches.advancedFilter(lambda s: s.passage is not None).sorted()

### Read the list of mother-child pairs

In [None]:
mothers_file = os.path.join('data', 'mother-child.csv')
mothers = pd.read_csv(mothers_file, sep='\t')
display(mothers)

### Identify mother speeches

In [None]:
def motherValidation(speech):
    '''Return True if any speaker/addressee pair of `speech` is a listed mother-child pair.

    Pairs are compared as "speaker:addressee" name strings against the rows
    of the module-level `mothers` table (columns `spkr` and `addr`).
    '''
    valid_keys = set(mothers.spkr + ':' + mothers.addr)

    return any(
        f'{spkr.name}:{addr.name}' in valid_keys
        for spkr in speech.spkr
        for addr in speech.addr
    )

In [None]:
mother_speeches = all_speeches.advancedFilter(motherValidation)

### Class as mother/non-mother

In [None]:
# tag every speech with a boolean: is it one of the selected mother speeches?
for s in all_speeches:
    s.is_mother = s in mother_speeches

In [None]:
# how many speeches of each class survived the "has text" filter
mother_flags = [s.is_mother for s in test_speeches]
kept_mother = sum(1 for flag in mother_flags if flag)
kept_non_mother = sum(1 for flag in mother_flags if not flag)

total_mother = len(mother_speeches)
total_non_mother = len(all_speeches) - total_mother

print(f'{kept_mother}/{total_mother} mother speeches selected')
print(f'{kept_non_mother}/{total_non_mother} non-mother speeches selected')

# Run NLP
## Spacy

In [None]:
# initialize spacy models
spacy_load(
    latin_model = 'la_core_web_lg',
    greek_model = 'grc_odycy_joint_trf',
)

In [None]:
# Run the spaCy pipeline on every speech that has not been parsed yet,
# collecting the speeches that still have no spaCy document afterwards.
failed = []
n_speeches = len(test_speeches)

for i, s in enumerate(test_speeches):
    # refresh the progress line every 200 speeches and on the last one
    if (i % 200 == 0) or (i == n_speeches - 1):
        print(f'\r{round(i * 100 /n_speeches)} % complete', end='')
    if s.passage.spacy_doc is None:
        s.passage.runSpacyPipeline()
    if s.passage.spacy_doc is None:
        failed.append(s)

# terminate the progress line so the report below starts on a fresh line
print()

if failed:
    print(f'SpaCy failed for {len(failed)} speeches:')
    for s in failed:
        print(f' - {s.work.urn}\t{s.work.title}\t{s.l_range}')

### Generate tabular data

In [None]:
# Build a one-row-per-token table from the spaCy parses.
spacy_tokens = []

# output column -> spaCy morphological feature name, in output column order
morph_features = dict(
    mood = 'Mood',
    tense = 'Tense',
    voice = 'Voice',
    person = 'Person',
    number = 'Number',
    case = 'Case',
    gender = 'Gender',
    verbform = 'VerbForm',
    degree = 'Degree',
    prontype = 'PronType',
)

# extract features
for i, s in enumerate(test_speeches):
    if (i % 200 == 0) or (i == len(test_speeches) - 1):
        print(f'\r{round(i * 100 /len(test_speeches))} % complete', end='')

    # speech-level values are the same for every token: compute them once
    line_array = s.passage.line_array
    speech_fields = dict(
        speech_id = s.id,
        lang = s.lang,
        author = s.author.name,
        work = s.work.title,
        l_fi = s.l_fi,
        l_la = s.l_la,
        nlines = len(line_array),
        spkr = ','.join([inst.name for inst in s.spkr]),
        addr = ','.join([inst.name for inst in s.addr]),
        part = s.part,
        mother = s.is_mother,
    )

    for tok in s.passage.spacy_doc:
        # a token that cannot be mapped to a line gets line=None, as in the
        # CLTK table, instead of raising on line_array[None]
        l_idx = s.passage.getLineIndex(tok)

        row = dict(speech_fields)
        row['line'] = line_array[l_idx]['n'] if l_idx is not None else None
        row['token'] = tok.text
        row['lemma'] = tok.lemma_
        row['pos'] = tok.pos_
        for col, feature_name in morph_features.items():
            # morph.get returns a list of values (empty when unset)
            row[col] = tok.morph.get(feature_name)
        spacy_tokens.append(row)

# convert to data frame
spacy_tokens = pd.DataFrame(spacy_tokens)

# simplify list cells: empty list -> None, otherwise comma-joined string
cols = list(morph_features)
spacy_tokens[cols] = spacy_tokens[cols].map(lambda x: None if len(x) == 0 else ','.join(x))

# display
display(spacy_tokens)

## CLTK

In [None]:
# Run the CLTK pipeline on every speech that has not been parsed yet,
# collecting the speeches that still have no CLTK document afterwards.
failed = []
n_speeches = len(test_speeches)

for i, s in enumerate(test_speeches):
    # refresh the progress line every 200 speeches and on the last one
    if (i % 200 == 0) or (i == n_speeches - 1):
        print(f'\r{round(i * 100 /n_speeches)} % complete', end='')

    if s.passage.cltk_doc is None:
        try:
            s.passage.runCltkPipeline()
        except Exception as err:
            # best effort: show which speech and text broke the pipeline,
            # and why, then carry on with the next speech
            print(s)
            print(s.passage.text)
            print(f'{type(err).__name__}: {err}')
    if s.passage.cltk_doc is None:
        failed.append(s)

# terminate the progress line so the report below starts on a fresh line
print()

if failed:
    print(f'CLTK failed for {len(failed)} speeches:')
    for s in failed:
        print(f' - {s.work.urn}\t{s.work.title}\t{s.l_range}')

In [None]:
# Helper function to extract CLTK features as strings

def getCltkFeature(token, feature, default=None):
    '''Look up one feature of a token by the string form of its key.

    The keys of `token.features` are converted with str() before the lookup.
    Returns the feature's values as a list of strings, or `default` when
    the feature is absent (or its stored value is None).
    '''
    by_name = {
        str(key): values
        for key, values in zip(token.features.keys(), token.features.values())
    }
    found = by_name.get(feature)

    return default if found is None else list(map(str, found))

In [None]:
# Build a one-row-per-token table from the CLTK parses.
cltk_tokens = []

# output column -> CLTK feature name, in output column order
cltk_features = dict(
    mood = 'Mood',
    tense = 'Tense',
    voice = 'Voice',
    # NOTE(review): spelling 'Aspct' kept as found; confirm it is the key CLTK uses
    aspect = 'Aspct',
    person = 'Person',
    number = 'Number',
    case = 'Case',
    gender = 'Gender',
    degree = 'Degree',
    verbform = 'VerbForm',
)

# extract features
for i, s in enumerate(test_speeches):
    # progress
    if (i % 200 == 0) or (i == len(test_speeches) - 1):
        print(f'\r{round(i * 100 /len(test_speeches))} % complete', end='')

    # speech-level values are the same for every token: compute them once
    line_array = s.passage.line_array
    speech_fields = dict(
        speech_id = s.id,
        lang = s.lang,
        author = s.author.name,
        work = s.work.title,
        l_fi = s.l_fi,
        l_la = s.l_la,
        nlines = len(line_array),
        spkr = ','.join([inst.name for inst in s.spkr]),
        addr = ','.join([inst.name for inst in s.addr]),
        part = s.part,
        mother = s.is_mother,
    )

    # process all tokens in speech
    for tok in s.passage.cltk_doc:
        # look the line index up once; None means the token could not be
        # mapped to a line
        l_idx = s.passage.getLineIndex(tok)

        row = dict(speech_fields)
        row['line'] = line_array[l_idx]['n'] if l_idx is not None else None
        row['token'] = tok.string
        row['lemma'] = tok.lemma
        row['pos'] = tok.upos
        for col, feature_name in cltk_features.items():
            row[col] = getCltkFeature(tok, feature_name)
        cltk_tokens.append(row)

cltk_tokens = pd.DataFrame(cltk_tokens)

# simplify list cells: missing feature stays None, otherwise comma-joined string
cols = list(cltk_features)
cltk_tokens[cols] = cltk_tokens[cols].map(lambda x: None if x is None else ','.join(x))

# display results
display(cltk_tokens)

## Deduplicate embedded lines

Keep only the most embedded instance of each line.

In [None]:
x = cltk_tokens.loc[(cltk_tokens['work']=='Odyssey') & cltk_tokens['l_fi'].str.startswith('9.')]

## Load hand-selected feature sets

In [None]:
# Load the hand-made lemma classification from the sheet named 'classification'.
# NOTE(review): this is an absolute path on one user's machine, unlike the
# other inputs, which are read from ./data — the cell fails elsewhere.
excel_file = '/Users/chris/Dropbox/Epic Speeches/Listen to mummy/classification.xlsx'
lemma_class = pd.read_excel(excel_file, 'classification')

In [None]:
# map each non-missing label to the array of lemmata carrying that label
lem_dict = {
    label: lemma_class.loc[lemma_class.label == label, 'lemma'].values
    for label in lemma_class.label.unique()
    if not pd.isna(label)
}

In [None]:
lem_dict

### Classify lemmata

In [None]:
# Add one boolean column per hand-selected lemma set to both token tables:
# True where the token's lemma is in the set with that label.
for tag in ['family', 'pers_poss_s', 'pers_poss_p']:
    spacy_tokens[tag] = spacy_tokens['lemma'].isin(lem_dict[tag])
    cltk_tokens[tag] = cltk_tokens['lemma'].isin(lem_dict[tag])
# Flag optative/subjunctive tokens; the two parsers spell the mood values differently.
cltk_tokens['marked_verb'] = cltk_tokens['mood'].isin(['optative', 'subjunctive'])
spacy_tokens['marked_verb'] = spacy_tokens['mood'].isin(['Opt', 'Sub'])
# Disabled alternative that also counts imperatives and futures.
# NOTE(review): if re-enabled, wrap the tense test in parentheses — `|` binds
# tighter than `==`, so as written the comparison applies to the whole `|` expression.
# cltk_tokens['marked_verb'] = (cltk_tokens['mood'].isin(['optative', 'subjunctive', 'imperative']) | 
#                               cltk_tokens['tense'] == 'future')
# spacy_tokens['marked_verb'] = (spacy_tokens['mood'].isin(['Opt', 'Sub', 'Imp',]) | 
#                               spacy_tokens['tense'] == 'Fut')

## Export CSV

In [None]:
spacy_tokens.to_csv(os.path.join('data', 'spacy_token_table.csv'), index=False)
cltk_tokens.to_csv(os.path.join('data', 'cltk_token_table.csv'), index=False)

# Data Analysis

### Speech labels

In [None]:
# The analysis cells below work on a table called `tokens`, which no cell in
# the notebook defines. They use the `prontype` column and the mood values
# 'Imp'/'Ind'/'Opt'/'Sub', which only the spaCy table has, so bind the name
# to that table. This is an alias, not a copy: columns added to `tokens`
# also appear on `spacy_tokens`.
tokens = spacy_tokens

# One row per speech: the speech-level metadata (identical on every token
# row, so the first value is taken) plus the number of tokens.
labels = tokens.groupby('speech_id').agg(
    lang = ('lang', 'first'),
    author = ('author', 'first'),
    work = ('work', 'first'),
    l_fi = ('l_fi', 'first'),
    l_la = ('l_la', 'first'),
    spkr = ('spkr', 'first'),
    addr = ('addr', 'first'),
    part = ('part', 'first'),
    nlines = ('nlines', 'first'),
    mother = ('mother', 'first'),
    ntokens = ('token', 'count'),
)
display(labels)

### Mother/non-mother by author

In [None]:
labels.groupby(['lang', 'mother', 'author']).agg(
    speeches = ('l_fi', 'count'),
    lines = ('nlines', 'sum'),
)

### By language

In [None]:
tokens.groupby(['mother', 'lang']).agg(
    speeches = ('speech_id', 'nunique'),
    tokens = ('token', 'count'),
)

## Distribution of morphological features

### Moods

In [None]:
mask = tokens.mood.isin(['Imp', 'Ind', 'Opt', 'Sub'])
grouped = tokens[mask].groupby('lang')

for name, group in grouped:
    df = pd.crosstab(group.mother, group.mood, normalize='index')
    display(name, df)

### Person and number (finite verbs)

In [None]:
mask = tokens.mood.isin(['Imp', 'Ind', 'Opt', 'Sub'])
grouped = tokens[mask].groupby('lang')

for name, group in grouped:
    df = pd.crosstab(group.mother, group.person + group.number, normalize='index')
    display(name, df)

### Pronouns

In [None]:
for name, group in tokens.groupby('lang'):
    df = pd.crosstab(group.mother, group.prontype, normalize='index')
    display(name, df)

In [None]:
x = tokens.loc[tokens.prontype=='Prs']
for name, group in x.groupby(x.mother):
    print(name)
    df = pd.crosstab(group.number, group.person)
    display(df)

In [None]:
lem_dict['giving']

In [None]:
lem_dict

In [None]:
tokens['family'] = tokens['lemma'].isin(lem_dict['family'])

In [None]:
x = tokens.lang=='greek'
pd.crosstab(tokens.loc[x]['mother'], tokens.loc[x]['family'])

In [None]:
x = tokens.lang=='greek'
pd.crosstab(tokens.loc[x]['mother'], tokens.loc[x]['family'], normalize='index')

## Full feature set

In [None]:
# Build a per-speech feature table: token counts for several categorical
# columns, joined side by side on speech_id, then divided by a per-speech
# value from `labels.ntokens`.
# NOTE(review): `cltk_token_table` is not defined anywhere in the visible
# notebook, and the columns 'theme', 'pers', 'pron' and 'poss' are not among
# those created for `cltk_tokens` — this cell looks like it targets an
# earlier version of the token table. Confirm before running.
feature_table = (
    # counts per value of 'theme'; the '' column is dropped
    cltk_token_table.pivot_table(
        index = 'speech_id',
        columns = 'theme',
        values = 'token',
        aggfunc = 'count',
        fill_value = 0,
    )
    .drop('', axis=1)
.join(
    # counts per mood; subjunctive and optative are merged into 'subopt',
    # and the listed columns are dropped
    cltk_token_table.pivot_table(
        index = 'speech_id',
        columns = 'mood',
        values = 'token',
        aggfunc = 'count',
        fill_value = 0,
    )
    .assign(
        subopt = lambda x: x['subjunctive'] + x['optative'])
    
    .drop(['', 'subjunctive', 'optative', 'gerund', 'gerundive'], axis=1)
)
.join(
    # counts per value of 'pers', minus the listed columns
    cltk_token_table.pivot_table(
        index = 'speech_id',
        columns = 'pers',
        values = 'token',
        aggfunc = 'count',
        fill_value = 0,
    )
    .drop(['', 'first', 'plural', 'second', 'singular'], axis=1)
)
.join(
    # counts per value of 'pron', keeping only the four listed columns
    cltk_token_table.pivot_table(
        index = 'speech_id',
        columns = 'pron',
        values = 'token',
        aggfunc = 'count',
        fill_value = 0,
    ).loc[:,['interrogative', 'personal', 'reciprocal', 'relative']]
)
.join( 
    # count of 'poss' == 'pos' per speech, exposed as column 'possessive'
    cltk_token_table.groupby('speech_id')['poss']
    .value_counts()
    .unstack()
    .fillna(0)
    .rename(columns={'pos':'possessive'})['possessive']
)
# row-wise division by labels.ntokens, aligned on the index
.div(labels.ntokens, axis=0)
)

feature_table

### Log frequencies

In [None]:
# Log-transform the feature frequencies. Zero frequencies are masked to NaN
# before taking the log, so that log(0) is never evaluated (it would yield
# -inf and a divide-by-zero RuntimeWarning); they stay NaN in the result.
x = np.log(feature_table.where(feature_table != 0))
x

### Principal Components

In [None]:
pca_model = PCA(n_components=3)

pca = pd.DataFrame(
    index = feature_table.index,
    data = pca_model.fit_transform(feature_table), 
    columns=['PC1', 'PC2', 'PC3'])

## Plots

In [None]:
# Box plot of one log-frequency feature, one box per value of a speech label.
feat = 'family'
label = 'mother'

groups = x.groupby(labels[label])
group_names = [name for name, _ in groups]
# NaN entries are left out of each group's sample
samples = [members[feat].dropna() for _, members in groups]

fig, ax = plt.subplots()
ax.boxplot(samples)
# boxes sit at positions 1..n, so the tick positions are 1-based
ax.set_xticks(list(range(1, len(groups) + 1)), labels = group_names)
ax.set_xlabel(label)
ax.set_ylabel('log frequency')
ax.set_title(f'{feat} vocabulary')
plt.savefig(f'{feat}_box.png')
plt.show()

In [None]:
# Violin plot of one log-frequency feature, one violin per value of a speech label.
feat = 'family'
label = 'mother'

groups = x.groupby(labels[label])

fig, ax = plt.subplots()
# NaN entries are left out of each group's sample
ax.violinplot(
    [group[feat].dropna() for name, group in groups], showmeans=True)
# violins sit at positions 1..n, so the tick positions are 1-based
ax.set_xticks([j + 1 for j in range(len(groups))],
    labels = [name for name, group in groups])
ax.set_xlabel(label)
ax.set_ylabel('log frequency')
ax.set_title(f'{feat} vocabulary')
# saved under its own name: '{feat}_box.png' is the box plot's file, and
# writing there would overwrite it
plt.savefig(f'{feat}_violin.png')
plt.show()

In [None]:
x_feat = 'family'
y_feat = 'giving'
label = 'mother'

fig, ax = plt.subplots()
for label_value in labels[label].unique():
    mask = labels[label]==label_value
    ax.loglog(feature_table.loc[mask, x_feat], feature_table.loc[mask, y_feat], marker='o', linestyle='', label=label_value)
ax.set_xlabel(x_feat)
ax.set_ylabel(y_feat)
ax.legend(title=label)
plt.show()

In [None]:
x_feat = 'personal'
y_feat = 'possessive'
label = 'lang'

fig, ax = plt.subplots()
for label_value in labels[label].unique():
    mask = labels[label]==label_value
    ax.loglog(feature_table.loc[mask, x_feat], feature_table.loc[mask, y_feat], marker='o', linestyle='', label=label_value)
ax.set_xlabel(x_feat)
ax.set_ylabel(y_feat)
ax.legend(title=label)
ax.set_title('pronouns in mother-child speeches')
plt.savefig('pron.png')
plt.show()

In [None]:
x_feat = 'PC1'
y_feat = 'PC2'
label = 'lang'

fig, ax = plt.subplots()
for label_value in labels[label].unique():
    mask = labels[label]==label_value
    ax.plot(pca.loc[mask, x_feat], pca.loc[mask, y_feat], marker='o', linestyle='', label=label_value)
ax.set_xlabel(x_feat)
ax.set_ylabel(y_feat)
ax.legend(title=label)
ax.set_title(f'Principal Components from {len(feature_table.columns)} parameters')
plt.savefig('foo.png')

In [None]:
# Scatter of the first two principal components for the mother speeches of
# four selected authors, one marker series per author.
x_feat = 'PC1'
y_feat = 'PC2'
# the speech-label table names this column 'author' ('auth' raises KeyError)
label = 'author'

fig, ax = plt.subplots()
for label_value in ['Homer', 'Apollonius', 'Virgil', 'Nonnus']:
    # mother speeches by this author only
    mask = (labels[label]==label_value) & labels.mother
    ax.plot(pca.loc[mask, x_feat], pca.loc[mask, y_feat], marker='o', ls='', label=label_value)
ax.set_xlabel(x_feat)
ax.set_ylabel(y_feat)
ax.legend(title=label)
ax.set_title(f'Principal Components from {len(feature_table.columns)} parameters')
# NOTE(review): 'foo.png' is also the output file of the by-language PCA plot; this overwrites it
plt.savefig('foo.png')

# Scratch

In [None]:
intj_list = cltk_tokens.loc[cltk_tokens['pos']=='INTJ']['lemma'].unique()
cltk_tokens.loc[cltk_tokens['lemma'].isin(intj_list)].groupby('token').agg(
    lemma = ('lemma', set),
    count = ('l_fi', 'count'),
).sort_values('count', ascending=False).to_clipboard()

In [None]:
# Build an HTML report of every speech containing the token 'ἦ' lemmatized
# as 'ἤ': one <div> per speech, holding a heading and the highlighted text.
speech_list = cltk_tokens.loc[(cltk_tokens['token']=='ἦ') & (cltk_tokens['lemma']=='ἤ')]['speech_id'].unique()
sections = []

for s in test_speeches.filterIDs(speech_list):
    # the highlighting itself is done by highlightKeywordsCltk
    sections.append(
        '<div>'
        + f'<h3>{s.author.name} {s.work.title} {s.l_range}: {s.getSpkrString()} to {s.getAddrString()}</h3>'
        + highlightKeywordsCltk(s, token='ἦ', lemma='ἤ')
        + '</div>\n'
    )

html = ''.join(sections)

In [None]:
toks = [s.passage.cltk_doc[17],s.passage.cltk_doc[14]]
l_idx = s.passage.getLineIndex(toks[0])
line_string = s.passage.line_array[l_idx]['text']
for tok in toks:
    l_pos = s.passage.getLinePos(tok)
    line_string = line_string[:l_pos] + '[' + tok.string + ']' + line_string[l_pos+len(tok.string):]
print(line_string)

In [None]:
toks

In [None]:
is_lem = spacy_tokens['lemma'].isin(lem_dict['family'])
is_tok = spacy_tokens['token'].isin(lem_dict['family'])
lem_or_tok = is_lem | is_tok
spacy_fam = spacy_tokens.loc[lem_or_tok].groupby(['lemma', 'token']).agg(
    count = ('token', 'count')
)

In [None]:
is_lem = cltk_tokens['lemma'].isin(lem_dict['family'])
is_tok = cltk_tokens['token'].isin(lem_dict['family'])
lem_or_tok = is_lem | is_tok
cltk_fam = cltk_tokens.loc[lem_or_tok].groupby(['lemma', 'token']).agg(
    count = ('token', 'count')
)

In [None]:
(spacy_fam.join(cltk_fam, how='outer', lsuffix='_spacy', rsuffix='_cltk')
         .fillna(0)
         .astype(int)
).to_clipboard()

In [None]:
cltk_tokens.loc[cltk_tokens['token']=='τοκῆος'].groupby('lemma').agg(count=('token','count')).sort_values('count', ascending=False)

In [None]:
spacy_tokens.loc[spacy_tokens['token']=='γενέτη']

In [None]:
pd.crosstab(cltk_tokens.mother, cltk_tokens['mood']=='imperative', normalize='index')

In [None]:
pd.crosstab(spacy_tokens.mother, spacy_tokens['mood']=='Imp', normalize='index')

In [None]:
x = cltk_tokens.groupby('speech_id').agg(
    lang = ('lang', 'first'),
    author = ('author', 'first'),
    work = ('work', 'first'),
    l_fi = ('l_fi', 'first'),
    l_la = ('l_la', 'first'),
    spkr = ('spkr', 'first'),
    addr = ('addr', 'first'),    
    mother = ('mother', 'first'),
    family = ('family', 'sum'),
    pers_poss_s = ('pers_poss_s', 'sum'),
    marked_verb = ('marked_verb', 'sum'),
    tokens = ('token', 'count'),
)
x['family'] = x['family'].div(x['tokens'])
x['pers_poss_s'] = x['pers_poss_s'].div(x['tokens'])
x['marked_verb'] = x['marked_verb'].div(x['tokens'])

x[['mother', 'family', 'pers_poss_s', 'marked_verb']].plot.box('mother')

In [None]:
cltk_tokens.marked_verb.unique()

In [None]:
# One log-log scatter plot per language: family-vocabulary rate (x) against
# the combined marked-verb and singular personal/possessive rate (y), with
# mother and non-mother speeches as separate series.
speech_stats = cltk_tokens.groupby('speech_id').agg(
    lang = ('lang', 'first'),
    mother = ('mother', 'first'),
    family = ('family', 'sum'),
    pers_poss_s = ('pers_poss_s', 'sum'),
    marked_verb = ('marked_verb', 'sum'),
    tokens = ('token', 'count'),
)

for lang, group in speech_stats.groupby('lang'):
    xs = group['family'].div(group['tokens'])
    ys = (group['marked_verb'] + group['pers_poss_s']).div(group['tokens'])
    # take the mother flag from this group's own rows, so the mask always
    # has the same index as xs/ys and does not depend on the global `x`
    is_mother = group['mother'].astype(bool)

    fig, ax = plt.subplots()
    ax.plot(xs[~is_mother], ys[~is_mother],
        marker='o', ls='', label='non-mother')
    ax.plot(xs[is_mother], ys[is_mother],
        marker='o', ls='', label='mother')
    ax.loglog()
    ax.set_title(lang)
    # show the series labels given above
    ax.legend()
plt.show()

In [None]:
x.to_clipboard()

In [None]:
pd.crosstab(x['mother'], x['family'] == 0)

In [None]:
x.loc[x['mother'] & (x['family'] + x['pers_poss_s'] == 0)]

In [None]:
zeros = x.loc[x['mother'] & (x['family'] + x['pers_poss_s'] == 0)].index.values
cltk_tokens.loc[cltk_tokens['speech_id'].isin(zeros)].groupby('speech_id').agg(
    author = ('author', 'first'),
    work = ('work', 'first'),
    l_fi = ('l_fi', 'first'),
    l_la = ('l_la', 'first'),
    spkr = ('spkr', 'first'),
    addr = ('addr', 'first'),
)

In [None]:
spacy_tokens['tense'].unique()

In [None]:
cltk_tokens['tense'].unique()

In [None]:
cltk_tokens['marked_verb'] = cltk_tokens['mood'].isin(['subjunctive', 'optative', 'imperative'])

In [None]:
x.loc[x['spkr']=='Maria'].sort_values('family', ascending=False)[:50]

In [None]:
x.loc[x['mother']].sort_values('family', ascending=False)[:25]

In [None]:
spacy_tokens.loc[spacy_tokens['mother']].groupby('speech_id').agg(
    lang = ('lang', 'first'),
    author = ('author', 'first'),
    work = ('work', 'first'),
    l_fi = ('l_fi', 'first'),
    l_la = ('l_la', 'first'),
    spkr = ('spkr', 'first'),
    addr = ('addr', 'first'),    
    mother = ('mother', 'first'),
    family = ('family', 'sum'),
    pers_poss_s = ('pers_poss_s', 'sum'),
    marked_verb = ('marked_verb', 'sum'),
    tokens = ('token', 'count'),
    lines = ('line', 'nunique'),
).sort_values('lines', ascending=False)[:25]

In [None]:
spacy_tokens.loc[spacy_tokens['mother']]['speech_id'].unique()

In [None]:
# For every mother speech with at least five lines, save a bar chart of
# per-line feature counts (punctuation excluded) to fig/fig_<speech_id>.png.
# NOTE(review): no cell creates a 'pers_poss' column on spacy_tokens (only
# 'pers_poss_s' and 'pers_poss_p'), so the aggregation below raises KeyError
# as written — decide which column, or combination, is intended.
# NOTE(review): both features are drawn at the same x positions, so the
# second set of bars is painted over the first.
feats = ['pers_poss', 'family']

# savefig does not create missing directories
os.makedirs('fig', exist_ok=True)

for speech_id in spacy_tokens.loc[spacy_tokens['mother']]['speech_id'].unique():
    # per-line counts for this speech
    x = (spacy_tokens
        .loc[(spacy_tokens['speech_id']==speech_id) & (spacy_tokens['pos']!='PUNCT')]
        .groupby('line')
        .agg(
            family = ('family', 'sum'),
            pers_poss = ('pers_poss', 'sum'),
            marked_verb = ('marked_verb', 'sum'),
            tokens = ('token', 'count'),
        )
    )

    # single-row table of the speech's metadata, used for the titles
    labels = (spacy_tokens
        .loc[spacy_tokens['speech_id']==speech_id]
        .groupby('speech_id')
        .agg(
            author = ('author', 'first'),
            work = ('work', 'first'),
            l_fi = ('l_fi', 'first'),
            l_la = ('l_la', 'first'),
            spkr = ('spkr', 'first'),
            addr = ('addr', 'first'),
        )
    )

    title = f"{labels.iloc[0]['spkr']} to {labels.iloc[0]['addr']}"
    subtitle = f"{labels.iloc[0]['author']} {labels.iloc[0]['work']} {labels.iloc[0]['l_fi']}-{labels.iloc[0]['l_la']}"

    # use the line labels as x positions when they are all integers;
    # otherwise fall back to 1..n
    try:
        xs = x.index.values.astype('int')
    except (ValueError, TypeError):
        xs = range(1, len(x.index.values)+1)

    # skip very short speeches
    if len(xs) < 5:
        continue

    fig, ax = plt.subplots(figsize=(8,4))
    for feat in feats:
        ax.bar(xs, x[feat], label=feat)
    # label every line only when there are few enough to stay legible
    if len(xs) < 15:
        ax.set_xticks(xs)
    ax.set_xlabel('line')
    ax.set_ylabel('tokens')
    fig.suptitle(title)
    ax.set_title(subtitle)
    ax.legend()

    plt.savefig(os.path.join('fig', f'fig_{speech_id}.png'))

In [None]:
labels.iloc[0]['author']