In [1]:
import random

import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

from cltk.corpus.latin import latinlibrary

import pickle

In [2]:
# Prose authors ('business' prose, as Lyne says; though I added in some Cicero too)

caesar = [file for file in latinlibrary.fileids() if 'caesar/' in file]
cicero = [file for file in latinlibrary.fileids() if 'cicero/or' in file or 'brut.txt' in file]
varro = [file for file in latinlibrary.fileids() if 'varro.rr' in file]
vitruvius = [file for file in latinlibrary.fileids() if 'vitru' in file]

In [3]:
print('The prose selection is {} characters long.'.format(len(latinlibrary.raw(caesar)) + len(latinlibrary.raw(varro)) + len(latinlibrary.raw(vitruvius)) + len(latinlibrary.raw(cicero))))

The prose selection is 2259049 characters long.


In [4]:
# Verse authors

catullus = [file for file in latinlibrary.fileids() if 'catullus' in file]
lucretius = [file for file in latinlibrary.fileids() if 'lucr' in file]
vergil = [file for file in latinlibrary.fileids() if 'vergil/' in file]
ovid = [file for file in latinlibrary.fileids() if 'ovid' in file]

In [5]:
print('The verse selection is {} characters long.'.format(len(latinlibrary.raw(catullus)) + len(latinlibrary.raw(lucretius)) + len(latinlibrary.raw(vergil)) + len(latinlibrary.raw(ovid))))

The verse selection is 2634760 characters long.


In [6]:
def remove_ll_content(fileid):
    raw = latinlibrary.raw(fileid)
    trim = raw[1000:-1000]
    trim_start = trim.find(' ')+1
    trim_end = trim.rfind(' ')
    return trim[trim_start:trim_end]

In [7]:
caesar_raw = [remove_ll_content(file) for file in caesar]
vitruvius_raw = [remove_ll_content(file) for file in vitruvius]
cicero_raw = [remove_ll_content(file) for file in cicero]
varro_raw = [remove_ll_content(file) for file in varro]

catullus_raw = [remove_ll_content(file) for file in catullus]
lucretius_raw = [remove_ll_content(file) for file in lucretius]
vergil_raw = [remove_ll_content(file) for file in vergil]
ovid_raw = [remove_ll_content(file) for file in ovid]

In [8]:
# Script for preprocessing texts

import html
import re
import string
from cltk.stem.latin.j_v import JVReplacer

def preprocess(text):
    
    replacer = JVReplacer()
    
    text = html.unescape(text) # Handle html entities
    text = re.sub(r'&nbsp;?', ' ',text) #&nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00',' ',text) #Another space problem?
        
    text = text.lower()
    text = replacer.replace(text) #Normalize u/v & i/j    
    
    punctuation ="\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)
    
    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)
    
    text = re.sub('[ ]+',' ', text) # Remove double spaces
    text = re.sub('\s+\n+\s+','\n', text) # Remove double lines and trim spaces around new lines
    
    return text.strip()

In [9]:
caesar_pp = [preprocess(raw) for raw in caesar_raw]
cicero_pp = [preprocess(raw) for raw in cicero_raw]
varro_pp = [preprocess(raw) for raw in varro_raw]
vitruvius_pp = [preprocess(raw) for raw in vitruvius_raw]

catullus_pp = [preprocess(raw) for raw in catullus_raw]
lucretius_pp = [preprocess(raw) for raw in lucretius_raw]
vergil_pp = [preprocess(raw) for raw in vergil_raw]
ovid_pp = [preprocess(raw) for raw in ovid_raw]

In [10]:
prose_tokens = caesar_tokens + cicero_tokens + varro_tokens + vitruvius_tokens
print(f'The number of prose words is {sum([len(token) for token in prose_tokens])}.')

NameError: name 'caesar_tokens' is not defined

In [None]:
caesar_tokens = [text.split() for text in caesar_pp]
cicero_tokens = [text.split() for text in cicero_pp]
varro_tokens = [text.split() for text in varro_pp]
vitruvius_tokens = [text.split() for text in vitruvius_pp]

catullus_tokens = [text.split() for text in catullus_pp]
lucretius_tokens = [text.split() for text in lucretius_pp]
vergil_tokens = [text.split() for text in vergil_pp]
ovid_tokens = [text.split() for text in ovid_pp]

In [None]:
verse_tokens = catullus_tokens + lucretius_tokens + vergil_tokens + ovid_tokens
print(f'The number of prose words is {sum([len(token) for token in verse_tokens])}.')

In [None]:
TextArray = []

for i, text in enumerate(caesar_pp):
    TextArray.append(('prose', caesar[i], text))
for i, text in enumerate(cicero_pp):
    TextArray.append(('prose', cicero[i], text))
for i, text in enumerate(varro_pp):
    TextArray.append(('prose', varro[i], text))
for i, text in enumerate(vitruvius_pp):
    TextArray.append(('prose', vitruvius[i], text))
for i, text in enumerate(catullus_pp):
    TextArray.append(('verse', catullus[i], text))
for i, text in enumerate(lucretius_pp):
    TextArray.append(('verse', lucretius[i], text))
for i, text in enumerate(vergil_pp):
    TextArray.append(('verse', vergil[i], text))
for i, text in enumerate(ovid_pp):
    TextArray.append(('verse', ovid[i], text))

In [None]:
# Save Text Array to pickle

pickle.dump(TextArray, open("./data/text_array.p", "wb" ))

In [None]:
# Script for chunking text

def make_text_chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [None]:
# Build chunked TextArray

TextArrayChunks = []

for item in TextArray:
    mode, work, text = item
    tokens = text.split() 
    chunk_text = make_text_chunks(tokens, 250)
    chunk_text = [" ".join(chunk) for chunk in chunk_text]
    for i, chunk in enumerate(list(chunk_text)):
        chunk_name = '{}_{}'.format(work, i)
        TextArrayChunks.append((mode, work, chunk_name, chunk))

In [None]:
TextArrayChunks[1]

In [None]:
# Save Chunked Text Array to pickle

pickle.dump(TextArrayChunks, open("./data/text_array_chunks.p", "wb" ))