# Preliminaries

### Import statements

In [1]:
import os
import pickle
import re
import numpy as np
import math
from collections import Counter
from cltk import NLP
from dicesapi import DicesAPI
from dicesapi.text import CtsAPI, Passage
from dicesapi.jupyter import NotebookPBar
import pandas as pd
from sklearn.decomposition import PCA
from scipy.stats import f_oneway, tukey_hsd
from matplotlib import pyplot as plt

# custom code for this notebook
#   - see seneca_experiment.py
from seneca_experiment import SenecaSpeech, getTags, tagtype

### Connections to remote databases, local files

In [2]:
# Seneca text
seneca_text_file = os.path.join('data', 'seneca_speeches.txt')

# remote endpoints
api = DicesAPI(logfile='dices.log')
cts = CtsAPI(dices_api=api)

In addition to the hand-curated input from Bernhardt for Seneca's speeches, I'm using local csv files to store the NLP annotations for the tokenized speeches, since parsing all that text takes some time. 

**NB**: Delete the csv files to rerun everything from scratch. This might run prohibitively slowly on binder, but it only takes half an hour or so on my old laptop. You could try just deleting the Seneca cache to see the parsing in operation on a smaller speech set.

In [3]:
# cache files
cache_seneca = os.path.join('data', 'seneca_tokens.csv')
cache_flavians = os.path.join('data', 'flavian_tokens.csv')

# Part 1: Seneca

### Use cache if it's present

If the CSV files with all the parsed tokens is here, we just use that and skip the next several steps.

In [4]:
if os.path.exists(cache_seneca):
    sen_tokens = pd.read_csv(cache_seneca)
    print(f'Loaded {len(sen_tokens)} records from {cache_seneca}')
    SKIP_SENECA = True
else:
    SKIP_SENECA = False

Loaded 5599 records from data/seneca_tokens.csv


### Read Bernhardt's text and parse into speech-like objects

In [5]:
if not SKIP_SENECA:
    with open(seneca_text_file) as f:
        text = f.read()
        if text:
            sen_texts = re.split('\n\n+', text)

    # how many speeches did we get?
    print(f'Loaded {len(sen_texts)} speeches from {seneca_text_file}')

In [6]:
if not SKIP_SENECA:
    sen_speeches = [SenecaSpeech(id=i, text=s) for i, s in enumerate(sen_texts)]

### Run CLTK's NLP pipeline

In [7]:
if not SKIP_SENECA:
    for s in sen_speeches:
        print (s, '...', end='')

        if (not hasattr(s.passage, 'cltk')) or s.passage.cltk is None:
            s.passage.runCltkPipeline(remove_punct=True)
        if s.passage.cltk is not None:
            print('OK')
        else:
            print('fail')

### Convert to tabular form

`sen_tokens` is a table with one row per token. It includes basic details on the passage as well as universal part of speech label and lemma.

In [8]:
if not SKIP_SENECA:
    sen_tokens = pd.DataFrame(dict(
        id = s.id,
        auth = 'Seneca',
        tags = 'trag',
        l_fi = s.l_fi,
        l_la = s.l_la,
        spkr = s.spkr,
        lem = w.lemma,
        pos = w.upos,
    ) for s in sen_speeches for w in s.passage.cltk)
    
sen_tokens.to_csv(cache_seneca)
print(f'Writing {len(sen_tokens)} records to {cache_seneca}')

Writing 5599 records to data/seneca_tokens.csv


### Example of tabular data

`sen_tokens` is a table with one row per token. It includes basic details on the passage as well as universal part of speech label and lemma. This is what is stored in the cache file.

In [9]:
sen_tokens

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,auth,tags,l_fi,l_la,spkr,lem,pos
0,0,0,0,Seneca,trag,1,278,Iuno,Soror,NOUN
1,1,1,0,Seneca,trag,1,278,Iuno,Tonantis,VERB
2,2,2,0,Seneca,trag,1,278,Iuno,(,PUNCT
3,3,3,0,Seneca,trag,1,278,Iuno,hic,PRON
4,4,4,0,Seneca,trag,1,278,Iuno,enim,ADV
...,...,...,...,...,...,...,...,...,...,...
5594,5594,5594,52,Seneca,trag,1341,1344,Theseus,innocens,ADJ
5595,5595,5595,52,Seneca,trag,1341,1344,Theseus,terra,NOUN
5596,5596,5596,52,Seneca,trag,1341,1344,Theseus,qui,PRON
5597,5597,5597,52,Seneca,trag,1341,1344,Theseus,super,NOUN


### Shape of the data

#### Basics

In [10]:
print(f'There are {len(sen_speeches)} speeches, totalling {len(sen_tokens)} tokens')

NameError: name 'sen_speeches' is not defined

#### Speech length distribution

How many long speeches? How many short speeches?

In [None]:
sen_tokens.groupby('id').size().hist()
plt.show()

One outlier is making it hard to see fine details---let's zoom in on the left part of the graph:

In [None]:
sen_tokens.groupby('id').size().hist(range=[0,400])
plt.show()

Most of these speeches are pretty short... about 100 words or fewer.

### Part of speech counts

How much does Seneca use each of the parts of speech (according to CLTK's classification)?

In [None]:
ax = sen_tokens.groupby('pos').size().plot.bar()
ax.set_xlabel('part of speech')
ax.set_ylabel('count')
plt.show()

**Two important notes:**

1. CLTK's part of speech tags don't include interjections here! I don't know why not... I'm fairly confident that I've seen results on Greek texts that do include interjections. But for whatever reason, any words that we consider interjections in this text are being labelled as other parts of speech.

2. Even though I tried to filter out punctuation before parsing (it generally improves the lemmatization), CLTK is labelling some tokens as `PUNCT`. These seem mostly to be actual punctuation marks that my initial efforts missed.

### Tabulate POS counts per speech

This table tallies part of speech tags by speech. Each row is one speech. Speeches with more words will have higher counts.

In [None]:
sen_pos = pd.crosstab(sen_tokens.id, sen_tokens.pos)
sen_pos

### Same, but normalized for speech length

This is the same table, but we divide each row by the total number of tokens, making long speeches and short speeches more comparable.

In [None]:
sen_norm = pd.crosstab(sen_tokens.id, sen_tokens.pos, normalize='index')
sen_norm

### Comparing speeches

Does part of speech use tell us anything interesting about the text? Let's try a simple comparison: how often does each character use adjectives?

In [None]:
sen_labels = sen_tokens.groupby('id').agg({'auth':'first', 'spkr':'first', 'tags':'first'})
pd.concat([sen_norm, sen_labels], axis=1).boxplot(column='ADJ', by='spkr')
plt.show()

There's a lot of overlap, but it looks like Theseus uses adjectives more than Hercules, for example. Let's look at the two distributions in more detail:

In [None]:
fig, ax = plt.subplots()
ax.hist(sen_norm.loc[sen_labels.spkr=='Hercules']['ADJ'], alpha=0.5, label='Hercules')
ax.hist(sen_norm.loc[sen_labels.spkr=='Theseus']['ADJ'], alpha=0.5, label='Theseus')
ax.legend()
ax.set_title('Character use of adjectives')
ax.set_xlabel('ADJ / all tokens')
ax.set_ylabel('speeches')
plt.show()

The two peaks are definitely different, but we also see how patchy the data is, particularly for Theseus.
At the end of the day, there aren't a lot of speeches here, but this is at least a sign that we might look profitably look more closely at Theseus' use of adjectives.

**But on the other hand...**

Hercules uses more subordinating conjunctions than Theseus. So it's not just that he's a less-sophisticated speaker...

In [None]:
pd.concat([sen_norm, sen_labels], axis=1).boxplot(column='SCONJ', by='spkr')
plt.show()

# Part 2: Flavians

### Use cache if it's present

If the CSV files with all the parsed tokens is here, we just use that and skip the next several steps.

In [None]:
if os.path.exists(cache_flavians):
    flav_tokens = pd.read_csv(cache_flavians)
    print(f'Loaded {len(flav_tokens)} records from {cache_flavians}')
    SKIP_FLAV = True
else:
    SKIP_FLAV = False

### Retrieve the speeches from DICES

In [None]:
if not SKIP_FLAV:
    flav_speeches = sorted(
                    api.getSpeeches(author_name='Statius') + \
                    api.getSpeeches(author_name='Silius') + \
                    api.getSpeeches(author_name='Valerius Flaccus'))
    print(f'Retrieved {len(flav_speeches)} speeches')

### Retrieve text from Perseus

In [None]:
if not SKIP_FLAV:
    pbar = NotebookPBar(max=len(flav_speeches))

    for i, s in enumerate(flav_speeches):
        if (not hasattr(s, 'passage')) or s.passage == None:
            s.passage = cts.getPassage(s)
            if s.passage is None:
                print(f'failed: {s}')
        pbar.update(i)

### Parse with CLTK

In [None]:
if not SKIP_FLAV:
    pbar = NotebookPBar(max=len(flav_speeches))

    for i, s in enumerate(flav_speeches):
        if s.passage is not None:
            if (not hasattr(s.passage, 'cltk')) or s.passage.cltk == None:
                s.passage.runCltkPipeline(remove_punct=True)
        pbar.update(i)

### Generate a bit table of tokens

As with `sen_tokens` above, this has one row per token.

**Note:** Both the `spkr` and `tags` columns can theoretically contain multiple values, since it's pretty common for speeches to be tagged with multiple speech types, and on rare occasions a single speech has two speakers.

In [None]:
if not SKIP_FLAV:
    flav_tokens = []
    for s in flav_speeches:
        if s.passage is not None and s.passage.cltk is not None:
            for w in s.passage.cltk:
                flav_tokens.append(dict(
                    id = s.id,
                    auth = s.author.name,
                    work = s.work.title,
                    l_fi = s.l_fi,
                    l_la = s.l_la,
                    tags = getTags(s),
                    spkr = [spkr.name for spkr in s.spkr],
                    lem = w.lemma,
                    pos = w.upos,
                ))
    flav_tokens = pd.DataFrame(flav_tokens)
    
flav_tokens.to_csv(cache_flavians)
print(f'Writing {len(flav_tokens)} records to {cache_flavians}')

In [None]:
flav_tokens

### Tag counts

#### How many tokens are there for each tag type?

Note that any time we're comparing between speech types, we need to break down the multiple values in the `tags` column. That means creating multiple copies of these rows, one for each tag. This is done with the pandas `explode` method.

Note that in tables comparing types like this, the column totals aren't reflective of true number of speeches/tokens in the corpus, since some speeches are considered more than once.

In [None]:
tag_count = flav_tokens[['id','tags']].explode('tags').groupby('tags').agg(
    tokens = pd.NamedAgg(column='id', aggfunc='count'),
    speeches = pd.NamedAgg(column='id', aggfunc='nunique'),
).sort_values(by='speeches', ascending=False)
tag_count['label'] = [tagtype[t] for t in tag_count.index]
tag_count

**Distribution of speech lengths across types**

I know that some speeches are much longer than others. Is there a significant difference in speech length across the speech type tags?

In [None]:
x = []

for s in flav_speeches:
    if s.passage is not None and s.passage.cltk is not None:
        x.append(dict(
            length = len([w for w in s.passage.cltk]),
            tags = getTags(s),
        ))
x = pd.DataFrame([row for row in x if row['length'] < 500])
x = x.explode('tags')

x.boxplot(by='tags', figsize=(20,10))
plt.show()

It's not a surprise that, on average, narrative speeches are the longest.


### Calculate POS feature vectors for speeches

As we did above for Seneca, here we tally part of speech counts for all the Flavians' speeches.

#### Raw counts

In [None]:
flav_pos = pd.crosstab(flav_tokens.id, flav_tokens.pos)
flav_pos

#### Normalized by speech



In [None]:
flav_norm = pd.crosstab(flav_tokens.id, flav_tokens.pos, normalize='index')
flav_norm

### Distribution of POS tags by speech type

Here we examine visually whether the proportion of different parts of speech varies much between speech types.

First, collect the speech metadata we might use to compare speeches.

In [None]:
flav_labels = flav_tokens.groupby('id').agg({'auth':'first', 'spkr':'first', 'tags':'first'})

#### Build a table with one row per tag

We start with the normalized POS features above, but speeches with multiple types are broken out into multiple (duplicate) rows.

In [None]:
x = flav_norm.copy()
x['tags'] = flav_labels.tags
x = x.explode('tags')

#### Draw some box plots

In [None]:
for feat in ['VERB', 'NOUN', 'ADJ']:
    x[[feat, 'tags']].boxplot(by='tags', figsize=(20,10))
plt.show()

# Part 3: Comparison

First, join the Seneca and Flavians tables.

In [None]:
all_tokens = pd.concat([sen_tokens, flav_tokens], ignore_index=True)
all_labels = pd.concat([sen_labels, flav_labels])
all_pos = pd.concat([sen_norm, flav_norm], ignore_index=True).set_index(all_labels.index)
all_pos = all_pos.fillna(0)

### Calculate PCA features

Instead of using individual POS tags, we can create a more holistic featureset using principal components analysis. Each of the resulting features incorporates elements of all the POS parameters.

In [None]:
pca_model = PCA(n_components=3)
pca_features = pca_model.fit_transform(all_pos)
all_pca = pd.DataFrame(
    index=all_pos.index,
    data=pca_features, 
    columns=['PC1', 'PC2', 'PC3'])
all_pca

### Visualize

Let's plot all the speeches according to the first two principal components, grouped by author. We might expect that if the authors have very different syntactic styles, we could see a noticeable difference between the clouds of coloured dots.

In [None]:
groups = all_pca.groupby(all_labels.auth.values)

feat_x = 'PC1'
feat_y = 'PC2'

fig, ax = plt.subplots(figsize=(10,5))
for name, group in groups:
    ax.plot(group[[feat_x]], group[[feat_y]], marker='o', linestyle='', ms=3, label=name)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
ax.legend()

plt.show()


For my part,  I don't think I see any interesting separation between the groups, which suggests that the primary stylistic difference between these speeches is elsewhere.

### PCA features by type tag

This series of plots shows one speech type at a time, according to the first two principal components:

In [None]:
x = pd.concat([all_pca, all_labels.tags], axis=1).explode('tags')

groups = x.groupby('tags')

ncols = 3
nrows = math.ceil(len(groups)/ncols)

fig, axs = plt.subplots(nrows, ncols, figsize=(8,20), layout='constrained')

i = 0
j = 0
for name, group in groups:
    ax = axs[i, j]
    ax.plot(group.PC1, group.PC2, marker='o', linestyle='', ms=2, label=name)
    ax.set_xlim((-0.2, 0.2))
    ax.set_ylim((-0.2, 0.2))
    ax.set_title(tagtype[name][:18])
    i = i + 1
    if i >= nrows:
        i = 0
        j = j + 1
plt.show()

It would have been cool if some types of speeches were more like Seneca than others, but I don't think I see a big difference here.

The one thing that stands out visually is a separation between instructions and greetings. Let's take a closer look at that, just for fun:

In [None]:
x_feat = 'PC1'
y_feat = 'PC2'
names = ['pra', 'gre']

fig, ax = plt.subplots(figsize=(8,4))
for name in names:
    ax.plot(x.loc[x.tags==name][x_feat], x.loc[x.tags==name][y_feat], marker='o', linestyle='', label=tagtype[name])
    ax.set_xlabel(x_feat)
    ax.set_ylabel(y_feat)
ax.legend()
plt.show()

## Lemma-based features

Putting aside POS tags, let's look at specific lemmata for our features.

**Feature selection**

We have the option of hand-selecting a feature set---that is, a bundle of lemmata that we care about. One trick is that word frequency declines exponentially, so the number of samples containing a given word declines very rapidly as we look at less frequent words. If the words we choose aren't in the samples, then they're not useful in measuring how the samples compare to one another.

### Lemma counts

Let's begin by creating a tally of how often each lemma occurs. 

In [None]:
all_tokens

In [None]:
# omit punctuation
mask = all_tokens.pos != 'PUNCT'

lem_count = all_tokens[mask].groupby('lem').agg(
    tokens = pd.NamedAgg(column='id', aggfunc='count'),
    speeches = pd.NamedAgg(column='id', aggfunc='nunique'),
).sort_values(by='tokens', ascending=False)
lem_count[:50]

### An example featureset 

I've hand-selected some of the most frequent words here. You can replace these with anything you want and try it out.

In [None]:
keywords = [
    'et', 'qui', 'hic', 'tu', 'ego', 'sum', 'in', 'non', 'nec', 'atque', 
    'do', 'ille', 'noster', 'si', 'iam', 'ad', 'quis', 'nunc', 'tuus', 
    'ipse', 'sed', 'meus', 'fero', 'per', 'magnus', 'bellum', 'deus', 
    'cum', 'aut', 'manus', 'pater', 'o', 'nos', 'omnis', 'arma', 'sic',
    'ab', 'ut', 'ago', 'nascor', 'dexter', 'sanguis', 'labor', 'terra', 
    'facio', 'eo', 'primus', 'aio', 'gens',
]

### Feature extraction

Let's extract the lemma frequencies for each speech, considering only these features.

#### A table of feature vectors

The resulting table has one row per speech, and one column for each of the lemmata in our featureset. The speech is  represented by *n* feature frequencies, so we can think of it as a point, or vector, in an *n*-dimensional space.

In [None]:
all_vec = pd.crosstab(all_tokens.id, all_tokens.lem, normalize='index')

In [None]:
x = all_vec.loc[:, keywords]

**Now add the tag data**

In order to compare between speech types, we're going to add in a tag column. Then as we did above, we have to break out any rows with multiple tags. 

In [None]:
x['tags'] = all_labels.tags
x = x.explode('tags')
x

**Example: how does use of *sum* vary across speech types?**

Answer: not much. But compare, for example, consolation (`con`) and challenge (`cha`).

In [None]:
x.boxplot(column='sum', by='tags', figsize=(20,10))
plt.show()

### Log frequencies

The distributions of lemmata are not normal across these samples, because of the exponential rate of decrease mentioned above. That's one reason why so many of the boxes in the plot above have a really low mean but then a bunch of outliers at the top. 

If we consider not the frequencies but the log of the frequencies, it's a little easier to see the variation. Any samples where a given word does not occur will have a frequency of `0`; in the log version we'll replace that with the placeholder value `NaN`, since the log of 0 can't be calculated.

In [None]:
x_log = x.copy()
for col in x_log.columns:
    if col != 'tags':
        x_log[col] = x[col].apply(np.log).values
x_log[x==0] = np.nan
x_log

### Comparing log lemma frequencies between speech types

We already know that our main interest in terms of speech types is oracular speech. Let's see how it compares with other speech types using the new featureset.

#### Visualizing with boxplots

Here we compare a couple of our features across oracular speech, taunts and challenges.

In [None]:
features = ['et', 'qui', 'sum']
row_select = ['ora', 'tau', 'cha']

for feat in features:
    mask = x_log.tags.isin(row_select) 
    x_log[mask].boxplot(column=feat, by='tags', figsize=(4,2))
    plt.suptitle('')
    plt.xlabel('')
    plt.show()

#### Visualizing with histograms

Here's a comparison of the distribution of *et* in oracular speech versus in the "taunt" and "challenge" categories combined.

The distributions overlap, but the taunt/challenge group tends to use *et* more frequently.

In [None]:
feat = 'et'
label = 'oracular speech'
comp_mask = (x_log.tags=='tau')|(x_log.tags=='cha')
comp_label = 'taunt, challenge'

fig, ax = plt.subplots()
ax.hist(x_log[x_log.tags=='ora'][feat], bins=15, alpha=0.5, label=label)
ax.hist(x_log[comp_mask][feat], bins=15, alpha=0.5, label=comp_label)
ax.legend()
ax.set_title(feat)
ax.set_xlabel('log term frequency')
ax.set_ylabel('speeches')
plt.show()

**Visualizing as a 2d feature-space**

We can also plot each speech as a point in a cartesian space defined by two features. For example, here we look at the log frequencies of 'et' and 'hic'.

Now we're starting to see some separation...

In [None]:
feat_x = 'et'
feat_y = 'qui'

comp_label = 'taunt, challenge'
comp_mask = x_log.tags.isin(['cha', 'tau'])

targ_label = 'oracular speech'
targ_mask = x_log.tags == 'ora'

fig, ax = plt.subplots()
ax.plot(x_log[targ_mask][feat_x], x_log[targ_mask][feat_y], marker='o', linestyle='', label=targ_label)
ax.plot(x_log[comp_mask][feat_x], x_log[comp_mask][feat_y], marker='o', linestyle='', label=comp_label)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
ax.legend()
plt.show()

## What makes oracular speech different?

Let's choose a new features set -- this time we'll use all the words that occur in oracular speech.

In [None]:
ora_lems = all_tokens.explode('tags').groupby('tags').get_group('ora').lem.unique()

Now we redo the feature vectors using these features.

In [None]:
# subset the complete vector space
x = all_vec.loc[:, ora_lems]

# add tags and explode
x['tags'] = all_labels.tags
x = x.explode('tags')

# take the log of all frequencies
x_log = x.copy()
for col in x_log.columns:
    if col != 'tags':
        x_log[col] = x[col].apply(np.log).values
x_log[x==0] = np.nan

In [None]:
feat = 'fero'
label = 'oracular speech'
targ_mask = x_log.tags=='ora'
comp_mask = x_log.tags=='del'
comp_label = 'deliberation'


fig, ax = plt.subplots()
ax.hist(x_log[targ_mask.values][feat], bins=15, alpha=0.5, label=label)
ax.hist(x_log[comp_mask.values][feat], bins=15, alpha=0.5, label=comp_label)
ax.legend()
ax.set_title(feat)
ax.set_xlabel('log term frequency')
ax.set_ylabel('speeches')
plt.show()

In [None]:
min_samples = 10
a = 0.05

sig_feats={}
pbar = NotebookPBar(max=len(ora_lems))

for feat in ora_lems:
    pbar.update()
    groups = []
    labels = []

    for name, df in x_log.groupby('tags'):
        vals = df[feat]
        vals = vals[vals.notna()]
        if len(vals) > min_samples:
            groups.append(vals)
            labels.append(name)
    
    if len(groups) < 2:
        continue
        
    # perform omnibus anova first
    stat, pval = f_oneway(*groups)
    if pval > a:
        continue
    
    # perform pairwise tests
    tukey = tukey_hsd(*groups)
    for i in range(len(groups)):
        for j in range(i):
            if tukey.pvalue[i,j] < a:
                key = tuple(sorted([labels[i], labels[j]]))
                sig_feats.setdefault(key, [])
                sig_feats[key].append((feat, round(tukey.pvalue[i,j], 4)))

In [None]:
with pd.option_context('display.max_rows', None):
    display(pd.DataFrame(dict(
        tag1 = tag1,
        tag2 = tag2,
        features = [feat for feat, pval in sig_feats[(tag1, tag2)]],
    ) for tag1, tag2 in sig_feats))

In [None]:
sig_feats[('del', 'exh')]

In [None]:
from sklearn.feature_selection import chi2

In [None]:
foo = all_tokens.explode('tags')
foo = foo.loc[foo.pos != 'PUNCT']
tag_norm = pd.crosstab(foo.tags, foo.lem, normalize='index')

In [None]:
feat_x = 'nefas'
feat_y = 'deus'

fig, ax = plt.subplots()
ax.plot(tag_norm[feat_x], tag_norm[feat_y], marker='o', linestyle='', ms=2)
for x, y, s in zip(tag_norm[feat_x], tag_norm[feat_y], tag_norm.index):
    ax.text(x,y,s)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
plt.show()

In [None]:
all_labels

In [None]:
foo = pd.concat([all_vec, all_labels.tags], axis=1).explode('tags')
labels = foo.tags
foo = foo.drop('tags', axis=1)

In [None]:
keyness, _ = chi2(foo, labels=='trag')
keyness = pd.Series(keyness, index=foo.columns).sort_values(ascending=False)
keyness[100:150]

In [None]:
keyness, _ = chi2(tag_norm, tag_norm.index=='')
keyness = pd.Series(keyness, index=tag_norm.columns).sort_values(ascending=False)
keyness[50:100]

In [None]:
foo = tag_norm.loc[tag_norm.index != 'inv']
pca_model = PCA(n_components=3)
tag_pca = pca_model.fit_transform(foo)
tag_pca = pd.DataFrame(
    data=tag_pca,
    columns=['PC1', 'PC2', 'PC3'],
    index=tag_norm.index[tag_norm.index!='inv'])
tag_pca

In [None]:
feat_x = 'PC1'
feat_y = 'PC2'

fig, ax = plt.subplots()
ax.plot(tag_pca[feat_x], tag_pca[feat_y], marker='o', linestyle='', ms=2)
for x, y, s in zip(tag_pca[feat_x], tag_pca[feat_y], tag_pca.index):
    ax.text(x,y,s)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
plt.show()

### lemma-based PCA features

In [None]:
pca_model = PCA(n_components=3)
lem_pca = pca_model.fit_transform(x)
lem_pca = pd.DataFrame(
    data=lem_pca,
    columns=['PC1', 'PC2', 'PC3'])
lem_pca['tags'] = [getTags(s) for s in flav_speeches]
lem_pca = lem_pca.explode('tags')

In [None]:
feat_x = 'PC1'
feat_y = 'PC3'

names = ['ora', 'del', 'res']


fig, ax = plt.subplots(figsize=(8,4))
for name in names:
    selecter = lem_pca.tags==name
    ax.plot(lem_pca[selecter][feat_x], lem_pca[selecter][feat_y], marker='o', linestyle='', ms=4, label=tagtype[name][:18])
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
#ax.set_xlim((-0.05, 0.05))
ax.legend()

plt.show()

### Same thing but with Seneca

In [None]:
tagtype['sen'] = 'Seneca'

In [None]:
sen_rows = pd.DataFrame(buildFeatures(s, target_lems) for s in sen_speeches)
sen_rows['tags'] = 'sen'
flav_rows = x.copy()
flav_rows.index = [s.id for s in flav_speeches]
flav_rows['tags'] = [getTags(s) for s in flav_speeches]
all_rows = pd.concat([sen_rows, flav_rows])

In [None]:
all_rows

In [None]:
pca_model = PCA(n_components=3)
lem_pca = pca_model.fit_transform(all_rows.drop('tags', axis=1))
lem_pca = pd.DataFrame(
    data=lem_pca,
    columns=['PC1', 'PC2', 'PC3'])
lem_pca.index = all_rows.index
lem_pca['tags'] = all_rows['tags']
lem_pca = lem_pca.explode('tags')

In [None]:
lem_pca

In [None]:
feat_x = 'PC1'
feat_y = 'PC2'

names = ['ora', 'nar', 'sen']


fig, ax = plt.subplots(figsize=(8,4))
for name in names:
    selecter = lem_pca.tags==name
    ax.plot(lem_pca[selecter][feat_x], lem_pca[selecter][feat_y], marker='o', linestyle='', ms=4, label=tagtype[name][:18])
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
#ax.set_xlim((-0.05, 0.05))
ax.legend()

plt.show()

In [None]:
lem_pca[lem_pca.tags=='del'].PC3.hist(alpha=0.5)
lem_pca[lem_pca.tags=='sen'].PC3.hist(alpha=0.5)