In [1]:
# imports

import pandas as pd

from cltk.corpus.readers import get_corpus_reader
from cltk.tokenize.line import LineTokenizer

import pickle

# corpus / tools

# Reader for the 'latin_text_tesserae' corpus and the file ids it exposes
corpus = get_corpus_reader(corpus_name = 'latin_text_tesserae', language = 'latin')
files = corpus.fileids()

# Load the pickled `modes` object; a later cell indexes it by file id.
# The context manager closes the file handle once loading finishes
# (a bare open() call would leave it open until garbage collection).
# NOTE(review): pickle.load can execute arbitrary code -- only load a
# data/modes.p that comes from a trusted source.
with open('data/modes.p', 'rb') as modes_file:
    modes = pickle.load(modes_file)

line_tokenizer = LineTokenizer('latin')

In [2]:
# Count the tokenized lines of every Tesserae file

data = []

docs = corpus.docs(files)
for idx, text in enumerate(docs):
    n_lines = len(line_tokenizer.tokenize(text))
    fileid = files[idx]
    # Strip the 'texts/' prefix and '.tess' extension, then split on dots
    fields = fileid.replace('texts/', '').replace('.tess', '').split('.')
    # First two dot-separated fields are taken as author and work
    author, work = fields[:2]
    # Last field is kept as-is here; non-numeric values are blanked
    # when the dataframe is built
    part = fields[-1]
    data.append((fileid, author, work, part, n_lines))

In [3]:
# Build the line-count dataframe from the collected tuples

df = pd.DataFrame(
    data,
    columns=['file', 'author', 'work', 'part', 'line count'],
).assign(
    # Keep 'part' only when it is numeric; otherwise blank it out
    part=lambda frame: frame['part'].apply(lambda p: p if p.isnumeric() else ''),
    # Look up each file id in `modes` (raises KeyError if a file id is absent)
    mode=lambda frame: frame['file'].apply(lambda fileid: modes[fileid]),
)
df.head()

Unnamed: 0,file,author,work,part,line count,mode
0,texts/ammianus.rerum_gestarum.part.14.tess,ammianus,rerum_gestarum,14,181,prose
1,texts/ammianus.rerum_gestarum.part.15.tess,ammianus,rerum_gestarum,15,172,prose
2,texts/ammianus.rerum_gestarum.part.16.tess,ammianus,rerum_gestarum,16,191,prose
3,texts/ammianus.rerum_gestarum.part.17.tess,ammianus,rerum_gestarum,17,176,prose
4,texts/ammianus.rerum_gestarum.part.18.tess,ammianus,rerum_gestarum,18,113,prose


In [4]:
# Keep only the rows whose mode is 'verse'

is_verse = df['mode'] == 'verse'
df_verse = df.loc[is_verse]
df_verse.head()

Unnamed: 0,file,author,work,part,line count,mode
18,texts/anonymous.laudes_domini.tess,anonymous,laudes_domini,,149,verse
44,texts/ausonius.cupido_cruciatus.tess,ausonius,cupido_cruciatus,,103,verse
46,texts/ausonius.de_herediolo.tess,ausonius,de_herediolo,,33,verse
47,texts/ausonius.de_xii_caesaribus.tess,ausonius,de_xii_caesaribus,,29,verse
48,texts/ausonius.eclogarum_liber.tess,ausonius,eclogarum_liber,,445,verse


In [5]:
# Total verse line count per (author, work)

work_groups = df_verse.groupby(['author', 'work'])
df_verse_work = work_groups[['line count']].sum()
df_verse_work

Unnamed: 0_level_0,Unnamed: 1_level_0,line count
author,work,Unnamed: 2_level_1
anonymous,laudes_domini,149
ausonius,cupido_cruciatus,103
ausonius,de_herediolo,33
ausonius,de_xii_caesaribus,29
ausonius,eclogarum_liber,445
...,...,...
tibullus,elegies,1929
valerius_flaccus,argonautica,5592
vergil,aeneid,9896
vergil,eclogues,828


In [6]:
# Total verse line count per author

author_groups = df_verse.groupby(['author'])
df_verse_author = author_groups[['line count']].sum()
df_verse_author

Unnamed: 0_level_0,line count
author,Unnamed: 1_level_1
anonymous,149
ausonius,4471
catullus,2286
claudian,10141
corippus,4667
dracontius,5973
ennius,680
ennodius,1955
horace,7816
italicus,1071


In [7]:
# Write both verse line-count summaries out as CSV reports

for report_frame, report_path in (
    (df_verse_work, 'reports/verse_work_line-counts.csv'),
    (df_verse_author, 'reports/verse_author_line-counts.csv'),
):
    report_frame.to_csv(report_path)