# Preliminaries

## Import statements

This gives us access to code that isn't part of base Python.

In [1]:
import os
import pandas as pd
from matplotlib import pyplot as plt

## Load the data

Once again, we're starting from the token table created in notebook 3.

In [3]:
# load the token table
csv_file = os.path.join('data', 'tokens.csv')
token_table = pd.read_csv(csv_file, dtype=str)

# drop punctuation tokens
no_punct = token_table.loc[token_table.upos!='PUNCT'].reset_index(drop=True)

display(no_punct)

Unnamed: 0,urn,author,title,line,token,lemma,upos,mood,tense,voice,person,number,case,gender
0,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,Opaca,Opaca,PROPN,,,,,Sing,Nom,Masc
1,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,linquens,linquens,VERB,,,Act,,Sing,Nom,Masc
2,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,Ditis,Dis,PROPN,,,,,Sing,Gen,Masc
3,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,inferni,infernus,ADJ,,,,,Sing,Gen,Masc
4,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,loca,locus,NOUN,,,,,Plur,Acc,Neut
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100104,urn:cts:latinLit:phi1035.phi001:8,Val_Flac,Thebaid_08,467,meruisse,mereo,VERB,Inf,Pres,Act,,,,
100105,urn:cts:latinLit:phi1035.phi001:8,Val_Flac,Thebaid_08,467,putas,puto,VERB,Ind,Pres,Act,2,Sing,,
100106,urn:cts:latinLit:phi1035.phi001:8,Val_Flac,Thebaid_08,467,me,ego,PRON,,,,,Sing,Acc,
100107,urn:cts:latinLit:phi1035.phi001:8,Val_Flac,Thebaid_08,467,talia,talis,DET,,,,,Plur,Acc,Neut


## Exclude very uncommon words

Let's keep only words that occur at least 10 times.

In [4]:
# calculate corpus-wide counts for all lemmata
lemma_count = no_punct.lemma.value_counts()

# create stoplist
lemma_kept = lemma_count.index.values[lemma_count>=10]

# Measuring internal variations of style

For this experiment, we are going to calculate a **rolling** style signal—something that changes as we move through each document. In previous examples, each text was represented by a single sample. Here, we're going to create a **sliding window** that moves through the text, sampling as it moves. The samples will have a fixed size (the size of the window) but will overlap at the edges.

That lets us measure internal variability within a document, while hopefully keeping the samples large enough for the signal to be robust. We expect the signal to change relatively smoothly, and for the changes to correspond to meaningful divisions in the text. If we see something weird, we might need to change the size of the window or the features we're looking at.

## Sampling

How do we create multiple, overlapping samples from our token table?

### Create unique ids for lines

In [5]:
line_urns = []

for i, row in no_punct[['urn', 'author', 'line']].iterrows():
    if 'author' == 'Seneca':
        this_id = row.urn + ':' + row.line
    else:
        this_id = row.urn + '.' + row.line
    line_urns.append(this_id)

In [6]:
no_punct['line_id'] = pd.Categorical(line_urns, categories=pd.unique(line_urns), ordered=True)

In [7]:
display(no_punct)

Unnamed: 0,urn,author,title,line,token,lemma,upos,mood,tense,voice,person,number,case,gender,line_id
0,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,Opaca,Opaca,PROPN,,,,,Sing,Nom,Masc,urn:cts:latinLit:phi1017.phi007.1
1,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,linquens,linquens,VERB,,,Act,,Sing,Nom,Masc,urn:cts:latinLit:phi1017.phi007.1
2,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,Ditis,Dis,PROPN,,,,,Sing,Gen,Masc,urn:cts:latinLit:phi1017.phi007.1
3,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,inferni,infernus,ADJ,,,,,Sing,Gen,Masc,urn:cts:latinLit:phi1017.phi007.1
4,urn:cts:latinLit:phi1017.phi007,Seneca,Agamemnon,1,loca,locus,NOUN,,,,,Plur,Acc,Neut,urn:cts:latinLit:phi1017.phi007.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100104,urn:cts:latinLit:phi1035.phi001:8,Val_Flac,Thebaid_08,467,meruisse,mereo,VERB,Inf,Pres,Act,,,,,urn:cts:latinLit:phi1035.phi001:8.467
100105,urn:cts:latinLit:phi1035.phi001:8,Val_Flac,Thebaid_08,467,putas,puto,VERB,Ind,Pres,Act,2,Sing,,,urn:cts:latinLit:phi1035.phi001:8.467
100106,urn:cts:latinLit:phi1035.phi001:8,Val_Flac,Thebaid_08,467,me,ego,PRON,,,,,Sing,Acc,,urn:cts:latinLit:phi1035.phi001:8.467
100107,urn:cts:latinLit:phi1035.phi001:8,Val_Flac,Thebaid_08,467,talia,talis,DET,,,,,Plur,Acc,Neut,urn:cts:latinLit:phi1035.phi001:8.467


In [8]:
len(no_punct.line_id.unique())

16574

### Calculate line-based lemma counts

The lemma-based cross-tabulation takes a long time because there are 14000 unique lemmata. But most of these are going to be thrown out immediately because they're not in the `lemma_kept` list. Here we make a **mask** based on which rows fit a criterion (their lemma is in the kept list). Then we use that mask to filter just the rows we want before doing the `crosstab()`. That saves computing a lot of data we don't need.

In [9]:
# identify rows that meet criterion
mask = no_punct.lemma.isin(lemma_kept)

# do cross-tabulation on masked table
lemma_count_line = pd.crosstab(no_punct.line_id, no_punct.lemma.loc[mask])

# reorder columns by frequency
lemma_count_line = lemma_count_line[lemma_kept]

In [10]:
display(lemma_count_line)

lemma,que,et,qui,sum,hic,in,tu,non,ego,iam,...,offero,auus,Iove,quodque,redux,aequoreus,advolo,alumnus,magnanimus,patruus
line_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
urn:cts:latinLit:phi1017.phi007.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1017.phi007.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1017.phi007.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1017.phi007.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1017.phi007.5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
urn:cts:latinLit:phi1035.phi001:8.463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1035.phi001:8.464,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1035.phi001:8.465,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1035.phi001:8.466,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculate line-based counts for part-of-speech tags

In [11]:
# calculate pos counts
pos_count_line = pd.crosstab(no_punct.line_id, no_punct.upos)

# rename columns with a prefix
pos_count_line = pos_count_line.rename(columns = lambda name: 'pos_' + name)

display(pos_count_line)

upos,pos_ADJ,pos_ADP,pos_ADV,pos_AUX,pos_CCONJ,pos_DET,pos_INTJ,pos_NOUN,pos_NUM,pos_PART,pos_PRON,pos_PROPN,pos_SCONJ,pos_VERB,pos_X
line_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
urn:cts:latinLit:phi1017.phi007.1,1,0,0,0,0,0,0,1,0,0,0,2,0,1,0
urn:cts:latinLit:phi1017.phi007.2,1,0,0,0,0,0,0,1,0,0,0,1,0,2,0
urn:cts:latinLit:phi1017.phi007.3,1,0,1,0,0,0,0,2,0,0,0,0,0,1,0
urn:cts:latinLit:phi1017.phi007.4,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0
urn:cts:latinLit:phi1017.phi007.5,0,0,0,0,1,0,1,3,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
urn:cts:latinLit:phi1035.phi001:8.463,1,0,0,0,0,0,0,3,0,0,0,1,0,1,0
urn:cts:latinLit:phi1035.phi001:8.464,2,0,2,0,1,0,0,2,0,0,0,0,0,1,0
urn:cts:latinLit:phi1035.phi001:8.465,1,0,2,0,0,0,0,1,0,0,0,0,0,2,0
urn:cts:latinLit:phi1035.phi001:8.466,0,0,0,0,2,1,0,0,0,0,1,0,0,4,0


### Calculate line-based counts for morphological features

In [14]:
# a list of columns to process
feature_names = ['mood', 'voice', 'tense', 'person', 'number', 'gender', 'case']

# an empty list to gather the resulting tables
morph_counts = []

# iterate over the columns, using `feat` as a stand-in for the current feature
for feat in feature_names:
    
    # tally feature counts and normalize
    this_count = pd.crosstab(no_punct.line_id, no_punct[feat], dropna=False)

    # rename columns with a prefix
    this_count = this_count.rename(columns = lambda name: feat + '_' + name.upper())
    
    # add table to the list
    morph_counts.append(this_count)

### Join all the tables together

In [18]:
# join all the tables together
feat_count_line = pos_count_line.join(morph_counts).join(lemma_count_line).fillna(0).astype(int)
    
# show results
display(feat_count_line)

Unnamed: 0_level_0,pos_ADJ,pos_ADP,pos_ADV,pos_AUX,pos_CCONJ,pos_DET,pos_INTJ,pos_NOUN,pos_NUM,pos_PART,...,offero,auus,Iove,quodque,redux,aequoreus,advolo,alumnus,magnanimus,patruus
line_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
urn:cts:latinLit:phi1017.phi007.1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1017.phi007.2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1017.phi007.3,1,0,1,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1017.phi007.4,0,0,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1017.phi007.5,0,0,0,0,1,0,1,3,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
urn:cts:latinLit:phi1035.phi001:8.463,1,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1035.phi001:8.464,2,0,2,0,1,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1035.phi001:8.465,1,0,2,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
urn:cts:latinLit:phi1035.phi001:8.466,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## The sliding window

In [25]:
size = 5

pd.DataFrame(
    row = feat_count_line.index.codes,
    feat_count_line.index.codes // size * size)[:30]

array([ 0,  0,  0,  0,  0,  5,  5,  5,  5,  5, 10, 10, 10, 10, 10, 15, 15,
       15, 15, 15, 20, 20, 20, 20, 20, 25, 25, 25, 25, 25], dtype=int16)