In [60]:
# Imports

import os
import string
import re
from collections import Counter
import html  

from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.line import LineTokenizer
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

In [None]:
# Instantiate the CLTK helpers used by the cells below

LATIN = 'latin'  # language code passed to both CLTK tokenizers

replacer = JVReplacer()
word_tokenizer = WordTokenizer(LATIN)
line_tokenizer = LineTokenizer(LATIN)

## Get the text of Pliny, Epistles 10

In [None]:
# Get the Pliny Ep. 10 text

files = latinlibrary.fileids()
pliny_ep10_raw = latinlibrary.raw('pliny.ep10.txt')

# Show the size and a short preview rather than dumping the whole book
# into the notebook output.
print(len(pliny_ep10_raw), 'characters')
print(pliny_ep10_raw[:1000])

In [None]:
# Preprocess texts

def preprocess(text, jv_replacer=None):
    """Clean the raw text of Pliny, Epistles 10, leaving one letter per line.

    The text is stripped of the site header/footer strings, HTML entities are
    decoded, everything is lower-cased and u/v, i/j are normalized, and
    punctuation and digits are blanked. All whitespace is then collapsed so the
    book is a single line, Trajan's replies are cut out, and the remaining text
    is split on Pliny's salutation.

    Parameters
    ----------
    text : str
        Raw text of the book.
    jv_replacer : object, optional
        Any object with a ``replace(str) -> str`` method used for the u/v and
        i/j normalization. Defaults to the notebook-level ``replacer``.

    Returns
    -------
    str
        Pliny's letters separated by newline characters. Each letter may keep
        a single trailing space left over from the split.
    """
    if jv_replacer is None:
        # Fall back to the replacer created in the notebook's setup cell.
        jv_replacer = replacer

    remove_list = [r'\bPliny the Younger\b',
                   r'\bThe Latin Library\b',
                   r'\bThe Classics Page',
                   r'C. PLINII CAECILII SECVNDI EPISTVLARVM LIBER DECIMVS AD TRAIANVM IMPERATOREM CVM EIVSDEM RESPONSIS',
                  ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    # REMOVE LETTER HEADINGS?

    text = html.unescape(text) # Handle html entities
    text = re.sub(r'&nbsp;?', ' ', text) # Entities still present after decoding (e.g. double-escaped)
    # html.unescape turns &nbsp; into U+00A0; none of the later steps treat
    # that as an ordinary space, so map it to one explicitly.
    text = text.replace('\xa0', ' ')
    text = re.sub(r'\x00', ' ', text) #Another space problem?

    # Remove roman numeral headings; must be before lower & replacer
    #text = re.sub(r'\b(M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,4})|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3}))\b[\.]',' ',text)

    text = text.lower()
    text = jv_replacer.replace(text) #Normalize u/v & i/j

    # Herdan also normalizes 'qu' to 'c'
    #text = re.sub('qu', 'k', text)
    #text = re.sub('cui', 'ku', text)

    # Blank out punctuation and digits in a single pass. The backslash is
    # written as an explicit '\\' so the literal has no invalid escape sequence.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
    translator = str.maketrans({key: " " for key in punctuation + '0123456789'})
    text = text.translate(translator)

    # Remove letters standing alone on a line as leftover indicators
    text = re.sub(r'^\s*?[a-z]\s*?$', '', text, flags=re.MULTILINE)

    # Trim spaces around lines
    text = re.sub(r'^\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s*$', '', text, flags=re.MULTILINE)

    text = text.replace('\n', ' ') # Join all lines into one; letters are re-split below
    text = re.sub('[ ]+', ' ', text) # Remove double spaces

    # Remove the text of Trajan's responses to Pliny. A response runs up to the
    # next Pliny salutation (which is kept) or, for a response that closes the
    # book, up to the end of the text.
    text = re.sub(r'traianus plinio.*?(c plinius traiano imperatori|\Z)', r'\g<1>', text, flags=re.DOTALL)

    # Split text on Pliny's salutation to Trajan, leaving each letter in one paragraph
    text = re.sub(r'c plinius traiano imperatori ', r'\n', text)

    # Remove leading blank line
    text = re.sub(r'^\n', '', text, flags=re.MULTILINE)
    return text

pliny_ep10_clean = preprocess(pliny_ep10_raw)

# preprocess() returns one letter per line; report the count and preview the
# first few letters instead of printing the whole cleaned book.
preview_lines = pliny_ep10_clean.split('\n')
print(len(preview_lines), 'letters')
for line in preview_lines[:3]:
    print(line[:200])

In [None]:
# Split the cleaned text into letters and write each one to its own file
# NB We've lost the numbers for the letters, so either
#     1. Don't delete the letters to Trajan, but just don't write them out to file
#     2. Keep that number until the final step of writing or saving
#     3. Do a lookup table to provide informative crosswalk
letters = pliny_ep10_clean.split('\n')

out_dir = './pliny_ep10'
os.makedirs(out_dir, exist_ok=True)  # create the target directory on a fresh checkout

# Skip empty fragments so the file numbering has no gaps
nonempty_letters = [letter for letter in letters if len(letter) > 1]
for i, letter in enumerate(nonempty_letters, start=1):
    fname = os.path.join(out_dir, str(i) + '.txt')
    # The with-block closes the file even if the write fails; the encoding is
    # explicit so the output does not depend on the platform default.
    with open(fname, 'w', encoding='utf-8') as file:
        file.write(letter)

In [None]:
# PLACEHOLDER FOR Preprocess the Pliny Ep. 10 texts
# print(preprocess(pliny_ep10_raw))