In [3]:
import pandas as pd
import spacy
import os

In [4]:
sen_tokens = pd.read_csv(os.path.join('data', 'seneca_tokens.csv'))
vf_tokens = pd.read_csv(os.path.join('data', 'vf_tokens.csv'))
token_table = pd.concat([sen_tokens, vf_tokens])
token_table['line_id'] = pd.Categorical(token_table['line_id'], categories=pd.unique(token_table['line_id']), ordered=True)

In [5]:
token_table

Unnamed: 0,perseus_n,speaker,elision,speech,token,file,line_id,elided,lemma,reps,upos,mood,tense,voice,person,number,case,gender,comp,pc_n
0,1,thyestis,0,100001,Opaca,Sen_Agamemnon,Sen_Agamemnon:1,False,opacus,1,VERB,Imp,Pres,Act,2.0,Sing,,,,
1,1,thyestis,0,100001,linquens,Sen_Agamemnon,Sen_Agamemnon:1,False,linquo,1,NOUN,,,,,Sing,Nom,Neut,,
2,1,thyestis,0,100001,Ditis,Sen_Agamemnon,Sen_Agamemnon:1,False,Dis,1,VERB,,,Pass,,Plur,Abl,Neut,,
3,1,thyestis,0,100001,inferni,Sen_Agamemnon,Sen_Agamemnon:1,False,infernus,1,ADJ,,,,,Sing,Gen,Masc,,
4,1,thyestis,0,100001,loca,Sen_Agamemnon,Sen_Agamemnon:1,False,locus,1,NOUN,,,,,Plur,Acc,Neut,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14026,467,Jason,1,7195,meruisse,VF_Argonautica_8,VF_Argonautica_8:467,False,mereo,1,VERB,,Perf,Act,,,,,0.923077,467.0
14027,467,Jason,1,7195,putas,VF_Argonautica_8,VF_Argonautica_8:467,False,puto,1,VERB,Ind,Pres,Act,2.0,Sing,,,0.923077,467.0
14028,467,Jason,1,7195,me,VF_Argonautica_8,VF_Argonautica_8:467,False,ego,1,PRON,,,,1.0,Sing,Acc,,0.923077,467.0
14029,467,Jason,1,7195,talia,VF_Argonautica_8,VF_Argonautica_8:467,False,talis,1,DET,,,,,Plur,Acc,Neut,0.923077,467.0


## feature extraction

### question words

In [6]:
keywords = ['cur', 'ne', 'qua', 'qualis', 'quam', 'quando', 'quantus', 'quare', 'quis', 'quo', 'quomodo', 'quot', 'quotiens', 'ubi', 'unde',]
token_table['interrog'] = token_table.lemma.isin(keywords)

feat_interrog_speech = pd.crosstab(token_table['speech'], token_table.interrog).drop(columns=False).rename(columns={True:'oth_INTERROG'})
display(feat_interrog_speech)

interrog,oth_INTERROG
speech,Unnamed: 1_level_1
7001,0
7002,1
7003,0
7004,0
7005,2
...,...
180047,1
180048,3
180049,0
180050,0


#### repetitions

In [7]:
feat_reps_speech = pd.crosstab(token_table['speech'], token_table.reps>1).drop(columns=False).rename(columns={True:'oth_REPS'})
display(feat_reps_speech)

reps,oth_REPS
speech,Unnamed: 1_level_1
7001,12
7002,11
7003,12
7004,6
7005,6
...,...
180047,25
180048,2
180049,5
180050,22


### elisions

In [8]:
feat_elis_speech = pd.crosstab(token_table['speech'], token_table['elided']).drop(columns=False).rename(columns={True: 'oth_ELIS'})
# feat_elis_line.oth_ELIS = feat_elis_line.oth_ELIS.apply(float).apply(int)
display(feat_elis_speech)

elided,oth_ELIS
speech,Unnamed: 1_level_1
7001,10
7002,3
7003,7
7004,4
7005,1
...,...
180047,12
180048,0
180049,3
180050,16


### POS features

In [9]:
# calculate pos counts
pos_count_speech = pd.crosstab(token_table['speech'], token_table.upos)

# rename columns with a prefix
pos_count_speech = pos_count_speech.rename(columns = lambda name: 'pos_' + name)
#pos_count_line = pos_count_line.drop(columns = 'pos_')

display(pos_count_speech)

upos,pos_ADJ,pos_ADP,pos_ADV,pos_AUX,pos_CCONJ,pos_DET,pos_INTJ,pos_NOUN,pos_NUM,pos_PART,pos_PRON,pos_PROPN,pos_SCONJ,pos_VERB,pos_X
speech,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
7001,13,1,8,0,13,11,1,38,1,2,9,8,3,19,0
7002,11,4,0,0,8,4,0,18,0,0,6,4,2,14,0
7003,3,1,7,2,7,4,0,15,0,2,2,2,0,9,0
7004,8,4,1,0,5,3,1,9,0,1,1,2,0,9,0
7005,4,3,5,2,6,3,1,15,0,3,6,7,3,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180047,34,11,12,1,20,18,0,70,0,1,4,8,2,38,0
180048,6,0,0,0,3,3,0,11,0,2,5,5,0,8,0
180049,6,1,3,1,3,1,0,16,0,0,0,0,0,10,0
180050,38,8,20,3,19,24,0,86,0,2,2,10,7,67,0


### morphological features

In [10]:
# a list of columns to process
feature_names = ['mood', 'voice', 'tense', 'person', 'number', 'gender', 'case']

# an empty list to gather the resulting tables
morph_counts = []

# iterate over the columns, using `feat` as a stand-in for the current feature
for feat in feature_names:
    
    # tally feature counts and normalize
    this_count = pd.crosstab(token_table['speech'], token_table[feat], dropna=False)

    # rename columns with a prefix
    this_count = this_count.rename(columns = lambda name: feat + '_' + str(name).upper())
    
    # add table to the list
    morph_counts.append(this_count)

### The full feature set

In [11]:
# join all the tables together
feat_count_speech = pos_count_speech.join(morph_counts).join([feat_interrog_speech, feat_reps_speech, feat_elis_speech]).fillna(0).astype(int)
feat_count_speech['lemma_ALL'] = token_table.groupby('speech').agg(lemma_ALL=('lemma', 'count'))
display(feat_count_speech)

Unnamed: 0_level_0,pos_ADJ,pos_ADP,pos_ADV,pos_AUX,pos_CCONJ,pos_DET,pos_INTJ,pos_NOUN,pos_NUM,pos_PART,...,case_DAT,case_GEN,case_LOC,case_NOM,case_VOC,case_NAN,oth_INTERROG,oth_REPS,oth_ELIS,lemma_ALL
speech,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7001,13,1,8,0,13,11,1,38,1,2,...,8,12,0,22,0,46,0,12,10,127
7002,11,4,0,0,8,4,0,18,0,0,...,2,1,0,13,0,26,1,11,3,71
7003,3,1,7,2,7,4,0,15,0,2,...,2,8,0,4,1,27,0,12,7,53
7004,8,4,1,0,5,3,1,9,0,1,...,1,1,0,5,0,19,0,6,4,42
7005,4,3,5,2,6,3,1,15,0,3,...,3,2,0,14,1,36,2,6,1,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180047,34,11,12,1,20,18,0,70,0,1,...,6,17,0,50,0,77,1,25,12,219
180048,6,0,0,0,3,3,0,11,0,2,...,1,6,0,12,0,10,3,2,0,43
180049,6,1,3,1,3,1,0,16,0,0,...,1,3,0,8,0,11,0,5,3,40
180050,38,8,20,3,19,24,0,86,0,2,...,6,22,0,70,0,112,0,22,16,283


### labels for the speeches

In [12]:
labels = token_table.groupby('speech').agg(
    speech_id = ('speech', 'first'),
    file = ('file', 'first'),
    speaker = ('speaker', 'first'),
    first_line = ('line_id', 'first'),
    last_line = ('line_id', 'last'),
    lines = ('line_id', 'nunique'),
    tokens = ('token', 'count'),
)
display(labels)

Unnamed: 0_level_0,speech_id,file,speaker,first_line,last_line,lines,tokens
speech,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7001,7001,VF_Argonautica_1,Pelias,VF_Argonautica_1:40,VF_Argonautica_1:57,17,127
7002,7002,VF_Argonautica_1,Jason,VF_Argonautica_1:81,VF_Argonautica_1:90,10,71
7003,7003,VF_Argonautica_1,Juno,VF_Argonautica_1:113,VF_Argonautica_1:119,7,54
7004,7004,VF_Argonautica_1,Jason,VF_Argonautica_1:150,VF_Argonautica_1:155,6,44
7005,7005,VF_Argonautica_1,Jason,VF_Argonautica_1:164,VF_Argonautica_1:173,10,75
...,...,...,...,...,...,...,...
180047,180047,Sen_Troades,nuntius,Sen_Troades:1068,Sen_Troades:1102,35,219
180048,180048,Sen_Troades,andromacha,Sen_Troades:1104,"Sen_Troades:1110, 1110b",7,43
180049,180049,Sen_Troades,nuntius,Sen_Troades:1111,"Sen_Troades:1117, 1117b",7,41
180050,180050,Sen_Troades,nuntius,Sen_Troades:1118,Sen_Troades:1164,47,286


### Small composite feature set

In [13]:
sm_feat_count_speech = pd.DataFrame(dict(
    interjections = feat_count_speech.pos_INTJ,
    repetitions = feat_count_speech.oth_REPS,
    elisions = feat_count_speech.oth_ELIS,
    questions = feat_count_speech.oth_INTERROG,
    vocatives = feat_count_speech.case_VOC,
    imperatives = feat_count_speech.mood_IMP,
))
sm_feat_count_speech['composite'] = sm_feat_count_speech.apply(sum, axis=1)
labels.join(sm_feat_count_speech).to_csv('concise_count.csv', index=False)

sm_feat_freq_speech = sm_feat_count_speech.div(labels['tokens'], axis=0)
labels.join(sm_feat_freq_speech).to_csv('concise_freq.csv', index=False)

display(sm_feat_count_speech.sort_values('composite', ascending=False).iloc[:25])

Unnamed: 0_level_0,interjections,repetitions,elisions,questions,vocatives,imperatives,composite
speech,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
160003,0,105,70,21,0,15,211
160018,0,106,50,11,0,13,180
100021,0,111,57,5,2,4,179
110048,1,105,32,19,3,4,164
130023,0,87,41,10,2,12,152
120001,0,66,54,9,2,11,142
110001,0,83,27,12,1,5,128
140015,0,70,43,2,6,5,126
110035,1,65,32,14,0,5,117
110036,0,70,28,7,0,9,114


### Experimental

In [14]:
def formatText(table):
    spans = []
    for row in table.itertuples():
        span = row.token
        if row.upos == 'INTJ':
            span = f'<span style="color:gold;font-weight:700">[</span>{span}<span style="color:gold;font-weight:700">]</span>'
        if row.reps > 1:
            span = f'<span style="color:cyan;font-weight:700">[</span>{span}<span style="color:cyan;font-weight:700">]</span>'
        if row.elided:
            span = f'<span style="color:brown;font-weight:700">[</span>{span}<span style="color:brown;font-weight:700">]</span>'
        if row.lemma in keywords:
            span = f'<span style="color:lime;font-weight:700">[</span>{span}<span style="color:lime;font-weight:700">]</span>'
        if row.case == 'VOC':
            span = f'<span style="color:green;font-weight:700">[</span>{span}<span style="color:green;font-weight:700">]</span>'
        if row.mood == 'IMP':
            span = f'<span style="color:magenta;font-weight:700">[</span>{span}<span style="color:magenta;font-weight:700">]</span>'
        spans.append(span)

    return ' '.join(spans) + '<br>'

In [16]:
labels['text'] = token_table.groupby('speech').apply(formatText)
labels.join(sm_feat_count_speech).to_csv(os.path.join('data', 'concise_count.csv'), index=False)
labels.join(sm_feat_freq_speech).to_csv(os.path.join('data', 'concise_freq.csv'), index=False)

  labels['text'] = token_table.groupby('speech').apply(formatText)
