## Preliminaries
### Import statements

In [1]:
import pandas as pd
import spacy
import os
import re

### Files to parse

In [2]:
seneca_files = [
    'Sen_Agamemnon.csv',	
    'Sen_Hercules_Oetaeus.csv',
    'Sen_Hercules_furens.csv',	
    'Sen_Medea.csv',		
    'Sen_Oedipus.csv',		
    'Sen_Phaedra.csv',		
    'Sen_Phoenissae.csv',	
    'Sen_Thyestes.csv',	
    'Sen_Troades.csv',
]

### Load spaCy model

In [3]:
nlp = spacy.load('la_core_web_lg')

## Parse the texts

### Load and parse the texts

The CSV files in `data/` contain line-by-line alignments of Perseus' text and ground-truth scansion from [Fedchin, Burns, Chaudhuri, and Dexter (2022) "Senecan Trimeter and Humanist Tragedy"](https://github.com/QuantitativeCriticismLab/AJP-2022-Senecan-Trimeter). Here, we load the files, parse the text with spaCy, and create a Pandas DataFrame with one row per token.

In [4]:
tables = []
offset = 100000

for file in seneca_files:
    path = os.path.join('data', 'seneca', file)
    print(path)
    df = pd.read_csv(path)
    df = df.loc[~df.elision.isna()]
    df = df.loc[~df.speech.isna()]
    df.perseus_n = df.perseus_n.str.replace(r'[\[\]\']', '', regex=True)
    df.speaker = df.speaker.str.replace(r'[\[\]\'#]', '', regex=True)
    df.elision = df.elision.astype(int)
    df.speech = df.speech.astype(int) + offset
    df.anceps_n = df.anceps_n.astype(int)
    df['tokens'] = df.perseus_text.apply(lambda s: [tok for tok in nlp(s)])
    df['file'] = file[:-4]
    df['line_id'] = df['file'] + ':' + df['perseus_n']
    tables.append(df)
    offset += 10000

df = pd.concat(tables)
df['line_id'] = pd.Categorical(df['line_id'], categories=pd.unique(df['line_id']), ordered=True)

data/seneca/Sen_Agamemnon.csv
data/seneca/Sen_Hercules_Oetaeus.csv
data/seneca/Sen_Hercules_furens.csv
data/seneca/Sen_Medea.csv
data/seneca/Sen_Oedipus.csv
data/seneca/Sen_Phaedra.csv
data/seneca/Sen_Phoenissae.csv
data/seneca/Sen_Thyestes.csv
data/seneca/Sen_Troades.csv


### Helper function to extract elisions from the scansion

In [5]:
def normalize(s):
    return re.sub(r'[^a-z]', '', s.lower()).replace('jv', 'iu')

def getElided(df):
    '''extract elided tokens from a line-array table'''
    rows = []
    row_count = 0

    for row in df.itertuples():
        elided = [False] * len(row.tokens)
        row_count += 1
        
        if '(' in row.anceps_text:
            anceps = row.anceps_text.split()
            no_punct = [tok for tok in row.tokens if tok.pos_ != 'PUNCT']
            
            if len(no_punct) == len(anceps):
                for anc, tok in zip(anceps, no_punct):
                    if '(' in anc:
                        idx = row.tokens.index(tok)
                        elided[idx] = True
            else:
                for anc in anceps:
                    if '(' in anc:
                        normalized_anc = normalize(anc)
                        normalized_toks = [normalize(tok.text) for tok in row.tokens]
                        count = normalized_toks.count(normalized_anc)
                        if count == 1:
                            idx = normalized_toks.index(normalized_anc)
                        elif anc.endswith('qv(e)') and normalized_toks.count('que') == 1:
                            idx = normalized_toks.index('que')
                        else:
                            print(f'[{row_count}]\t' + ' '.join([f'{i}.{tok.text}' for i, tok in enumerate(row.tokens)]))
                            idx = int(input(f'Which is {anc}? '))
                        elided[idx] = True
        rows.append(elided)
    return rows

### Calculate elisions

In [7]:
df['elided'] = getElided(df)

[65]	0.te 1.cum 2.ipsa 3.nunc 4.evolve 5.femineos 6.dolos 7.,


Which is te*c(u)m?  1


[170]	0.iacens 1.que 2.vultu 3.languido 4.optutus 5.stupet 6.?


Which is la_ngvi^d(o)?  3


[225]	0.Exilia 1.mihi 2.sunt 3.haud 4.nova 5., 6.assuevi 7.malis 8..


Which is no^u(a)?  4


[256]	0.iam 1.que 2.ense 3.fessum 4.miles 5.exonerat 6.latus 7.,


Which is ia*mqu(e)?  1


[531]	0.textos 1.amictus 2.— 3.horreo 4.atque 5.animo 6.tremo 7.:


Which is a_tqv(e)?  4


[696]	0.haud 1.dum 2.astra 3.merui 4., 5.Siculus 6.Hesperium 7.latus


Which is h[au]dd(u)m?  1


[803]	0.Graiorum 1.in 2.istos 3.terra 4.consurget 5.lares


Which is to_t(a)?  0


[909]	0.intendat 1.arcus 2.me 3.que 4.natum 5.que 6.opprimat 7.?


Which is na_tu*mqv(e)?  5


[1016]	0.cape 1.hunc 2.triumphum 3.solus 4.et 5.vince 6.Herculem 7..


Which is e_vi*nc(e)?  5


[1049]	0.flammis 1.que 2.Nessus 3.sanguinem 4.ostendi 5.arcuit 6.:


Which is sa_ngvi^n(e)m?  3


[1386]	0.totum 1.que 2.lentus 3.sanguinem 4.avexit 5.vapor 6.,


Which is sa_ngvi^n(e)m?  3


[1474]	0.te 1.deprecantem 2.. 3.nunc 4.milli 5.irata 6.† 7.pater


Which is mi^h(i)?  4


[1604]	0.Habet 1., 2.peractum 3.est 4., 5.fata 6.se 7.nostra 8.explicant 9.;


Which is be*n(e)?  2


[1684]	0.† 1.victrice 2.felix 3., 4.iuvenis 5., 6.has 7.numquam 8.irritas


Which is nu*mqv(a)m?  7


[1787]	0.crinem 1.que 2.iactans 3.squalidum 4.Alcmene 5.gemit 6..


Which is sqva_li^d(u)m?  3


[1796]	0.rediturus 1.ibas 2.— 3.quando 4.ab 5.inferna 6.Styge


Which is qva_nd(o)?  3


[1955]	0.minor 1.que 2.labor 3.est 4.Herculi 5.iussa 6.exequi 7.,


Which is ju*ss(a)?  5


[2011]	0.suum 1.que 2.lambens 3.sanguinem 4.Impietas 5.ferox


Which is sa_ngvi^n(e)m?  3


[2124]	0.erumpe 1.rerum 2.terminos 3.te 4.cum 5.efferens 6.,


Which is te*c(u)m?  4


[2383]	0.regno 1.capaci 2.te 3.que 4.quam 5.amotam 6.inrita


Which is to*t(o)?  5


[2562]	0.Arabes 1.que 2.odoris 3.quicquid 4.arboribus 5.legunt


Which is a^ra^be_squ(e)?  1


[2664]	0.vel 1.stipitem 2.istuc 3.caede 4.nostrorum 5.inlitum


Which is i_st(u)m?  1


[2664]	0.vel 1.stipitem 2.istuc 3.caede 4.nostrorum 5.inlitum


Which is mo_nstro*r(u)m?  4


[2799]	0.morer 1.que 2.nihil 3.est 4.: 5.cuncta 6.iam 7.amisi 8.bona 9.,


Which is j(a)m?  6


[2896]	0.similes 1.que 2.matri 3., 4.parta 5.iam 6., 7.parta 8.ultio 9.est 10.:


Which is pa*rt(a)?  4


[2978]	0.monstrum 1.que 2.saevum 3.horribile 4.iamdudum 5.avehe 6..


Which is ja_mdu*d(u)m?  4


[3050]	0.te 1.cum 2.aufer 3.herbas 4., 5.libera 6.cives 7.metu 8.,


Which is te*c(u)m?  1


[3096]	0.Siculum 1.que 2.sorbens 3.quaeve 4.anhelantem 5.premens


Which is qv[ae]u(e)?  3


[3106]	0.sermone 1.potuit 2.— 3.hoc 4.quoque 5.extimuit 6.ferox 7.;


Which is qvo^qv(e)?  4


[3173]	0.is 1.fecit 2.— 3.omnes 4.coniugem 5.infamem 6.arguant 7.,


Which is co_nju^g(e)m?  4


[3241]	0.trifidam 1.que 2.linguam 3.exertat 4.et 5.quaerit 6.quibus


Which is li_ngv(a)m?  2


[3336]	0.incumbe 1.in 2.iras 3.te 4.que 5.languentem 6.excita


Which is la_ngve*nt(e)m?  5


[3362]	0.mater 1.que 2.tota 3.coniuge 4.expulsa 5.redit 6.,


Which is co_nju^g(e)?  3


[3409]	0.tuum 1.quoque 2.ipsa 3.corpus 4.hinc 5.me 6.cum 7.aveham 8..


Which is qvo^qv(e)?  1


[3409]	0.tuum 1.quoque 2.ipsa 3.corpus 4.hinc 5.me 6.cum 7.aveham 8..


Which is me*c(u)m?  6


[3529]	0.tabifica 1.caeli 2.vitia 3.quae 4.te 5.cum 6.invehis


Which is te*c(u)m?  5


[3530]	0.infaustus 1.hospes 2., 3.profuge 4.iamdudum 5.ocius 6.â


Which is ja_mdu*d(u)m?  4


[3826]	0.et 1.me 2.cum 3.Erinyn 4.pronubam 5.thalami 6.traham 7.,


Which is me_c(u)m?  2


[4034]	0.me 1.cum 2.ite 3., 4.me 5.cum 6., 7.ducibus 8.his 9.uti 10.libet 11..


Which is me*c(u)m?  1


[4461]	0.libet 1.loqui 2.piget 3.que 4.. 5.Quodnam 6.istud 7.malum 8.est 9.?


Which is qvo_dn(a)m?  7


[4962]	0.sed 1.flecte 2.mentem 3.pectus 4.antiquum 5.ad 6.voca


Which is a_nti*qv(u)m?  4


[5377]	0.reclinis 1.hasta 2.est 3., 4.arma 5.defixa 6.incubant 7..


Which is de_fi*x(ae)?  5


[5686]	0.regnum 1.que 2.furto 3.: 4.specimen 5.antiquum 6.imperi


Which is a_nti*qv(u)m?  5


[5777]	0.miseris 1.que 2.summum 3.ac 4.maxime 5.exulibus 6.bonum 7.,


Which is ma_xi^m(u)m?  4


[6017]	0.spirant 1.que 2.venae 3.cor 4.que 5.adhuc 6.pavidum 7.salit 8.;


Which is co_rqv(e)?  4


[6035]	0.non 1.rectus 2.exit 3.se 4.que 5.in 6.excelsum 7.levat 8.:


Which is se_qu(e)?  4


[6459]	0.adhuc 1.rogare 2.— 3.tempus 4.aeramnae 5.addidit 6.,


Which is [ae]ru*mn(ae)?  4


[6468]	0.certe 1.aequa 2.mors 3.est 4.— 5.turbat 6.atque 7.agitat 8.Phrygas


Which is a_tqv(e)?  6


[6483]	0.similis 1.que 2.maesto 3., 4.squalida 5.obtectus 6.coma 7..


Which is sqva_li^d(a)?  4


[6565]	0.rerum 1.aestimator 2.: 3.si 4.tamen 5.te 6.cum 7.exigas 8.,


Which is te*c(u)m?  6


### Write output

In [8]:
df.to_csv(os.path.join('data','seneca_scanned.csv'), index=False)
df

Unnamed: 0,perseus_n,perseus_text,comp,anceps_n,anceps_text,speaker,elision,speech,tokens,file,line_id,elided
0,1,Opaca linquens Ditis inferni loca,0.900000,0,o*pa_ca^ li_nqve*ns di_ti^s i_nfe*rni_ lo^ca*,thyestis,0,100001,"[Opaca, linquens, Ditis, inferni, loca]",Sen_Agamemnon,Sen_Agamemnon:1,"[False, False, False, False, False]"
1,2,"adsum profundo Tartari emissus specu,",1.000000,1,a*dsu_m pro^fu_ndo* ta_rta^r(i) e_mi*ssu_s sp...,thyestis,1,100001,"[adsum, profundo, Tartari, emissus, specu, ,]",Sen_Agamemnon,Sen_Agamemnon:2,"[False, False, True, False, False, False]"
2,3,incertus utras oderim sedes magis:,1.000000,2,i*nce_rtu^s u_tra*s o_de^ri_m se*de_s ma^gi*s,thyestis,0,100001,"[incertus, utras, oderim, sedes, magis, :]",Sen_Agamemnon,Sen_Agamemnon:3,"[False, False, False, False, False, False]"
3,4,"fugio Thyestes inferos, superos fugo.",1.000000,3,fu^gi^o_ thy^e_ste*s i_nfe^ro_s su^pe^ro_s fu...,thyestis,0,100001,"[fugio, Thyestes, inferos, ,, superos, fugo, .]",Sen_Agamemnon,Sen_Agamemnon:4,"[False, False, False, False, False, False, False]"
4,5,en horret animus et pavor membra excutit:,1.000000,4,e*n ho_rre^t a^ni^mu*s e_t pa^vo_r me*mbr(a) ...,thyestis,1,100001,"[en, horret, animus, et, pavor, membra, excuti...",Sen_Agamemnon,Sen_Agamemnon:5,"[False, False, False, False, False, True, Fals..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1172,1173,"ubique properas, saeva: me solam times",0.909091,912,u*bi_qve^ pro^pe^ra*s s[ae]va^ me_ so*la_m ti...,hecuba,0,180051,"[ubique, properas, ,, saeva, :, me, solam, times]",Sen_Troades,Sen_Troades:1173,"[False, False, False, False, False, False, Fal..."
1173,1174,"vitasque, gladios inter ac tela et faces",0.916667,913,vi*ta_sqve^ gla^di^o*s i_nte^r a_c te*l(a) e_...,hecuba,1,180051,"[vitas, que, ,, gladios, inter, ac, tela, et, ...",Sen_Troades,Sen_Troades:1174,"[False, False, False, False, False, False, Tru..."
1174,1175,"quaesita tota nocte, cupientem fugis. ,",0.937500,914,qv[ae]si_ta^ to_ta* no_cte^ cu^pi^e*nte_m fu^...,hecuba,0,180051,"[quaesita, tota, nocte, ,, cupientem, fugis, ....",Sen_Troades,Sen_Troades:1175,"[False, False, False, False, False, False, Fal..."
1175,1176,"non hostis aut ruina, non ignis meos",1.000000,915,no*n ho_sti^s [au]t ru*i_na^ no_n i*gni_s me^o*s,hecuba,0,180051,"[non, hostis, aut, ruina, ,, non, ignis, meos]",Sen_Troades,Sen_Troades:1176,"[False, False, False, False, False, False, Fal..."


### Custom function to define local window for repetitions

We treat the words before and after as a "context"; if the present word is also found among them, then it's counted as a repetition.

In [9]:
def getContext(token_table):
    df = (token_table
        .assign(lemma = token_table.lemma.str.lower())
        .groupby('line_id', as_index = False)
        .agg(line_id = ('line_id', 'first'), lemmas = ('lemma', list))
    )

    r_context = (
        pd.concat(
            pd.DataFrame(dict(
                line_id = df.iloc[:-i].line_id.values,
                lemmas = df.iloc[i:].lemmas.values,
            )) for i in range(1, 3))
        .groupby('line_id', as_index=False)
        .agg(r_context=('lemmas', lambda lems: sum(lems, [])))
    )

    l_context = (
        pd.concat(
            pd.DataFrame(dict(
                line_id = df.iloc[i:].line_id.values,
                lemmas = df.iloc[:-i].lemmas.values,
            )) for i in range(1, 3))
        .groupby('line_id', as_index=False)
        .agg(l_context=('lemmas', lambda lems: sum(lems, [])))
    )

    context = pd.merge(l_context, r_context, how='outer', on='line_id')
    context = pd.merge(df, context, on='line_id')
    context['context'] = context['l_context'] + context['lemmas'] + context['r_context']
    context = context.drop(columns=['l_context', 'lemmas', 'r_context'])

    return context

In [12]:
tables = []
for label, group in df.groupby('speech', observed=False):
    print(label)
    token_table = group.explode(['tokens', 'elided']).rename(columns={'tokens':'token'})
    fulltext = ' '.join(group.perseus_text)
    token_table['token'] = [tok for tok in nlp(fulltext)]
    token_table['lemma'] = [tok.lemma_ for tok in token_table.token]
    context = getContext(token_table)
    reps = pd.merge(token_table, context, how='left', on='line_id')[['lemma', 'context']]
    reps['lemma'] = reps['lemma'].str.lower()
    token_table['reps'] = reps.apply(lambda row: row['context'].count(row['lemma']), axis=1).values

    tables.append(token_table)

100001


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


100002
100003


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


100004
100005


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


100006
100007


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


100008
100009


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


100010
100011


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


100012
100013


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


100014
100015


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


100016
100017


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


100018
100019


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


100020
100021


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


100022


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


100023
100024


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


100025
100026


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


100027
100028


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


100029
100030


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


100031
100032


  df = (token_table
  pd.concat(
  pd.concat(


100033
100034


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


100035
100036


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


100037
100038


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110001


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


110002
110003


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


110004
110005


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110006


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


110007
110008


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110009
110010


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


110011
110012


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110013
110014


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


110015


  pd.concat(
  pd.concat(
  df = (token_table


110016


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110017
110018


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


110019
110020


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110021
110022


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110023
110024


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


110025


  pd.concat(
  pd.concat(


110026


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


110027
110028


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


110029
110030


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


110031


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


110032
110033


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110034


  df = (token_table
  pd.concat(
  pd.concat(


110035


  df = (token_table
  pd.concat(
  pd.concat(


110036


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


110037
110038


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110039
110040


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110041
110042


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


110043
110044


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110045


  df = (token_table
  pd.concat(
  pd.concat(


110046


  df = (token_table
  pd.concat(
  pd.concat(


110047


  df = (token_table
  pd.concat(
  pd.concat(


110048


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


110049
110050


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


110051
110052


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


110053
120001


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120002


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120003
120004


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120005
120006


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


120007
120008


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120009
120010


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120011
120012


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


120013
120014


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120015
120016


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120017
120018


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120019
120020


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120021
120022


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120023
120024


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120025
120026


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120027
120028


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120029
120030


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120031
120032


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


120033
120034


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120035
120036


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120037
120038


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120039
120040


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120041
120042


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120043
120044


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120045
120046


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120047
120048


  df = (token_table
  pd.concat(
  pd.concat(


120049


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120050
120051


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120052


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


120053
120054


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120055
120056


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120057
120058


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


120059
120060


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


120061
120062


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


130001


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


130002
130003


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


130004
130005


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


130006
130007


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


130008
130009


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


130010


  pd.concat(
  pd.concat(
  df = (token_table


130011
130012


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


130013
130014


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


130015
130016


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


130017
130018


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


130019
130020


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


130021


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


130022
130023


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


130024
130025


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


130026
130027


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


130028
130029


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


140001


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


140002
140003


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


140004
140005


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


140006


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


140007
140008


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


140009
140010


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


140011
140012


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


140013


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


140014
140015


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


140016
140017


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


140018
140019


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


140020
140021


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


140022
140023


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


140024


  pd.concat(
  df = (token_table


140025


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


140026
140027


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


140028
140029


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


140030
140031


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150001


  pd.concat(
  pd.concat(
  df = (token_table


150002


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150003
150004


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150005
150006


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150007
150008


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150009
150010


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


150011


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150012
150013


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150014
150015


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150016


  df = (token_table
  pd.concat(
  pd.concat(


150017


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


150018
150019


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150020
150021


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150022
150023


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150024
150025


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150026
150027


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


150028
150029


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150030
150031


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150032
150033


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150034


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


150035
150036


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150037
150038


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150039
150040


  df = (token_table
  pd.concat(
  pd.concat(


150041


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


150042
150043


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


150044
150045


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


150046
150047


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


160001


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


160002


  pd.concat(


160003


  df = (token_table
  pd.concat(
  pd.concat(


160004
160005


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


160006
160007


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


160008
160009


  df = (token_table
  pd.concat(
  pd.concat(


160010
160011


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


160012
160013


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


160014
160015


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


160016
160017


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


160018


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


160019


  pd.concat(
  df = (token_table


160020


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


160021
160022


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


170001


  pd.concat(
  pd.concat(


170002


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


170003
170004


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170005
170006


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


170007
170008


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170009
170010


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


170011
170012


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


170013
170014


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170015
170016


  df = (token_table
  pd.concat(
  pd.concat(


170017
170018


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170019
170020


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


170021
170022


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170023
170024


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170025
170026


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


170027
170028


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170029
170030


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


170031
170032


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170033
170035


  df = (token_table
  pd.concat(
  pd.concat(


170036
170037


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170038
170039


  df = (token_table
  pd.concat(
  pd.concat(


170040
170041


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


170042
170043


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


170044
180001


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180002
180003


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180004


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180005
180006


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180007
180008


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180009
180010


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180011
180012


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


180013
180014


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180015
180016


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180017
180018


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180019
180020


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180021
180022


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180023
180024


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180025
180026


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


180027
180028


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180029
180030


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180031
180032


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


180033
180034


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180035
180036


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180037
180038


  df = (token_table
  pd.concat(
  pd.concat(


180039
180040


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180041
180042


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180043
180044


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180045
180046


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180047


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


180048
180049


  df = (token_table
  pd.concat(
  pd.concat(


180050


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


180051


  pd.concat(
  pd.concat(


In [13]:
token_table = pd.concat(tables)

token_table['upos'] = [tok.pos_ for tok in token_table.token]
token_table['morph'] = [tok.morph.to_dict() for tok in token_table.token]
token_table['mood'] = [morph.get('Mood') for morph in token_table.morph]
token_table['tense'] = [morph.get('Tense') for morph in token_table.morph]
token_table['voice'] = [morph.get('Voice') for morph in token_table.morph]
token_table['person'] = [morph.get('Person') for morph in token_table.morph]
token_table['number'] = [morph.get('Number') for morph in token_table.morph]
token_table['case'] = [morph.get('Case') for morph in token_table.morph]
token_table['gender'] = [morph.get('Gender') for morph in token_table.morph]
token_table = token_table.drop(columns=['morph', 'perseus_text', 'anceps_text', 'anceps_n', 'comp'])
token_table = token_table.loc[token_table.upos != 'PUNCT']
token_table.to_csv(os.path.join('data', 'seneca_tokens.csv'), index=False)