In [1]:
# imports

import html
import re
from pathlib import Path

import pandas as pd
from cltk.corpus.readers import get_corpus_reader
from nltk import word_tokenize

# corpus / tools

# Latin Tesserae corpus: reader plus the ids of the files it exposes
latin_tess = get_corpus_reader(
    corpus_name='latin_text_tesserae',
    language='latin',
)
latin_files = latin_tess.fileids()

# Greek Tesserae corpus: same pair of objects
greek_tess = get_corpus_reader(
    corpus_name='greek_text_tesserae',
    language='greek',
)
greek_files = greek_tess.fileids()

In [2]:
# Define preprocessing function; e.g. strip punctuation, lowercase, etc.

def preprocess(text, lower=True, remove_list=None):
    """Normalize raw corpus text ahead of tokenization.

    Steps, in order: delete every regex in ``remove_list``; decode HTML
    entities; blank out stray control characters and leftover ``&nbsp``
    entities; optionally lowercase; replace punctuation and ASCII digits with
    spaces; collapse runs of spaces; collapse blank lines and trim whitespace
    around line breaks.

    Args:
        text (str): Raw document text.
        lower (bool): Lowercase the text when true (the default); leave the
            case untouched when false.
        remove_list (iterable of str, optional): Regular-expression patterns
            deleted from ``text`` before any other step. ``None`` (the
            default) removes nothing.

    Returns:
        str: The cleaned text, without leading or trailing whitespace.
    """
    # Catchall find/replace, applied before any other normalization
    for pattern in remove_list or ():
        text = re.sub(pattern, '', text)

    text = html.unescape(text)  # Handle html entities
    text = re.sub(r'\x1a', ' ', text)  # ASCII SUB control character
    # Entities that are still present after unescaping (e.g. a double-escaped '&amp;nbsp;')
    text = re.sub(r'&nbsp;?', ' ', text)
    text = re.sub(r'\x00', ' ', text)  # NUL bytes

    if lower:
        text = text.lower()

    # Punctuation (including a literal backslash) and digits all become spaces
    punctuation = "\"#$%&'()*+,/:;<=>@[\\]^_`{|}~.?!«»—“-”"
    translator = str.maketrans(dict.fromkeys(punctuation + '0123456789', ' '))
    text = text.translate(translator)

    text = re.sub(r'[ ]+', ' ', text)  # Collapse runs of spaces
    # Exactly one newline per line break: drops blank lines and any
    # whitespace hugging either side of the break
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text.strip()  # Strip removes leading and trailing whitespace

In [3]:
# Get word counts for all Latin Tesserae files

latin_data = []

docs = latin_tess.docs(latin_files)
for i, doc in enumerate(docs):
    file = latin_files[i]  # docs are paired with file ids by position
    # e.g. 'texts/ammianus.rerum_gestarum.part.14.tess'
    #   -> ['ammianus', 'rerum_gestarum', 'part', '14']
    fields = file.replace('texts/', '').replace('.tess', '').split('.')
    author, work = fields[:2]
    part = fields[-1]
    count = len(word_tokenize(preprocess(doc)))
    latin_data.append(('latin', file, author, work, part, count))

In [4]:
# Get word counts for all Greek Tesserae files

greek_data = []

docs = greek_tess.docs(greek_files)
for i, doc in enumerate(docs):
    file = greek_files[i]  # docs are paired with file ids by position
    # File id minus the 'texts/' prefix and '.tess' suffix, split on dots:
    # the first two fields are author and work, the last one the part label
    fields = file.replace('texts/', '').replace('.tess', '').split('.')
    author, work = fields[:2]
    part = fields[-1]
    count = len(word_tokenize(preprocess(doc)))
    greek_data.append(('greek', file, author, work, part, count))

In [5]:
# Single list of per-file records: Latin rows first, then Greek
data = [*latin_data, *greek_data]

In [6]:
# Make dataframe of word counts

df = pd.DataFrame(
    data,
    columns=['lang', 'file', 'author', 'work', 'part', 'word count'],
)
# Keep a part label only when it is numeric; anything else becomes ''
df['part'] = df['part'].where(df['part'].str.isnumeric(), '')
df.head()

Unnamed: 0,lang,file,author,work,part,word count
0,latin,texts/ammianus.rerum_gestarum.part.14.tess,ammianus,rerum_gestarum,14,8386
1,latin,texts/ammianus.rerum_gestarum.part.15.tess,ammianus,rerum_gestarum,15,7024
2,latin,texts/ammianus.rerum_gestarum.part.16.tess,ammianus,rerum_gestarum,16,7834
3,latin,texts/ammianus.rerum_gestarum.part.17.tess,ammianus,rerum_gestarum,17,7559
4,latin,texts/ammianus.rerum_gestarum.part.18.tess,ammianus,rerum_gestarum,18,5147


In [7]:
# Group data by work

# Total word count per (author, work) pair, summed over that pair's rows
df_work = df.groupby(['author', 'work'])[['word count']].sum()
df_work

Unnamed: 0_level_0,Unnamed: 1_level_0,word count
author,work,Unnamed: 2_level_1
achilles_tatius,leucippe_et_clitophon,43748
aelian,de_natura_animalium,104930
aelian,epistulae_rusticae,2221
aelian,varia_historia,39657
aelius_aristides,ars_rhetorica,17844
...,...,...
vergil,aeneid,83511
vergil,eclogues,7271
vergil,georgics,18520
vitruvius,de_architectura,58659


In [8]:
# Group data by author

# Total word count per author, summed over all of that author's rows
df_author = df.groupby('author')[['word count']].sum()
df_author

Unnamed: 0_level_0,word count
author,Unnamed: 1_level_1
achilles_tatius,43748
aelian,146808
aelius_aristides,319763
aeschines,71261
aeschylus,56498
...,...
valerius_flaccus,48442
valerius_maximus,82112
vergil,109302
vitruvius,58659


In [9]:
# Export reports

# to_csv does not create missing directories, so make sure the output
# folder exists before writing (keeps the notebook runnable on a fresh checkout)
Path('reports').mkdir(parents=True, exist_ok=True)

df.to_csv('reports/tess_word_counts.csv')  # one row per corpus file
df_work.to_csv('reports/tess_work_word_counts.csv')  # totals per (author, work)
df_author.to_csv('reports/tess_author_word_counts.csv')  # totals per author