In [None]:
# Imports

import os
import string
import re
from collections import Counter

from pprint import pprint

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

In [None]:
# Setup CLTK tools
#
# word_tokenizer: CLTK WordTokenizer created for 'latin'; used below to
#   tokenize the preprocessed Latin Library text.
# replacer: CLTK JVReplacer; preprocess() passes every text through
#   replacer.replace() (presumably normalizing j/v to i/u -- confirm
#   against the CLTK documentation).

word_tokenizer = WordTokenizer('latin')
replacer = JVReplacer()

## Get list of words
We can use the Latin Library to generate a list of possible Latin words to match acrostics against by:
- Getting the raw text of the Latin Library
- Preprocessing the text to remove numbers, punctuation, English words, etc.
- Tokenizing the text
- Making a set of the tokens. For this experiment, I am going to limit the tokens to those that appear more than 25 times in the Latin Library; this should account for relatively rare words as well as things like typos.

In [None]:
# Get raw text of the Latin Library
#
# Note that the CLTK Latin Library was updated on 3/25/17
# to fix line breaks in some of the hexameter poems included
# in this experiment. Please delete and reimport the
# CLTK Latin Library corpus to follow along.
#
# raw() is called without file ids here; the result is handed to
# preprocess() as a single string (presumably the whole corpus -- confirm).

ll_raw = latinlibrary.raw()

In [None]:
# Preprocess texts
def preprocess(text):
    """Normalize raw Latin Library text for tokenizing and acrostic matching.

    Steps, in order:
      1. expand the ae/oe ligature HTML entities,
      2. lowercase the text,
      3. run it through the module-level ``replacer`` (CLTK ``JVReplacer``),
      4. turn ``&lt;``/``&gt;`` into ``<``/``>`` so they are blanked with the
         rest of the punctuation instead of leaving stray ``lt``/``gt`` words,
      5. replace every ASCII punctuation character and digit with a space,
      6. delete Latin Library boilerplate phrases,
      7. collapse runs of spaces, drop blank lines and trim the whitespace
         around every newline.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned, lowercased text. No line other than possibly the
        first starts with whitespace, so ``line[0]`` is the first letter of
        the line.
    """
    # Expand ligature entities before lowercasing so both cases are caught.
    for entity, expansion in (('&aelig;', 'ae'), ('&AElig;', 'AE'),
                              ('&oelig;', 'oe'), ('&OElig;', 'OE')):
        text = text.replace(entity, expansion)

    text = text.lower()

    text = replacer.replace(text)

    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # string.punctuation is the same 32-character set the original literal
    # spelled out by hand (including the backslash); digits are blanked in
    # the same pass.
    translator = str.maketrans(
        {char: ' ' for char in string.punctuation + string.digits})
    text = text.translate(translator)

    # Site boilerplate. Hyphens were already turned into spaces above, so
    # 'neo-latin' has to be matched as 'neo latin'.
    remove_list = [r'\bthe latin library\b',
                   r'\bthe classics page\b',
                   r'\bneo latin\b',
                   r'\bmedieval latin\b',
                   r'\bchristian latin\b',
                   r'\bthe miscellany\b'
                  ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = re.sub(r'[ ]+', ' ', text)  # Remove double spaces
    # Remove blank lines and trim whitespace on both sides of every newline.
    # The whitespace is optional on each side so that "a\n b" and "a \nb"
    # are trimmed too, not only "a \n b".
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text

In [None]:
# Preprocess Latin Library
# (applies preprocess() to the full raw corpus loaded above)

ll_text = preprocess(ll_raw)

In [None]:
# Tokenize the preprocessed text with the CLTK Latin Word Tokenizer
# (ll_tokens is later fed to Counter, so it is an iterable of tokens)

ll_tokens = word_tokenizer.tokenize(ll_text)

In [None]:
# Count how often each token occurs and keep, as a list, the tokens
# that occur more than 25 times

c = Counter(ll_tokens)
ll_min = [token for token, count in c.items() if count > 25]

## Make list of possible acrostics in the Aeneid
We can also use the Latin Library to generate a list of possible acrostics in the Aeneid by:
- Getting the text of the Aeneid from the Latin Library
- Preprocessing
- Getting a list of initial letters
- Combining the initial letters into 'words'
- Getting the intersection of the set of these 'words' with the Latin Library tokens. For this experiment, I am going to set a range: acrostics need to be at least 4 letters long, and we'll look for matches up to 10 letters long.


In [None]:
# Get the Aeneid texts: every corpus file id containing 'vergil/aen'

files = latinlibrary.fileids()
aeneid_files = [fileid for fileid in files if 'vergil/aen' in fileid]
aeneid_raw = latinlibrary.raw(aeneid_files)

In [None]:
# Preprocess the Aeneid texts
# (same cleanup as the full corpus, so initials and tokens are comparable)

aeneid_edit = preprocess(aeneid_raw)

In [None]:
# Get a list of initial letters

def get_lines(text):
    """Split ``text`` on newline characters and drop the empty lines.

    Lines that contain only whitespace are not empty strings and are kept.

    Args:
        text (str): Text whose lines are separated by ``'\\n'``.

    Returns:
        list: The non-empty lines, in their original order.
    """
    return list(filter(None, text.split('\n')))

def get_initials(lines):
    """Return the first character of every line.

    Args:
        lines (list): Non-empty strings; an empty string raises IndexError.

    Returns:
        list: One single-character string per input line, in order.
    """
    initials = []
    for line in lines:
        initials.append(line[0])
    return initials


In [None]:
# Non-empty lines of the preprocessed Aeneid and the first character of each
aen_lines = get_lines(aeneid_edit)
aen_initials = get_initials(aen_lines)

In [None]:
# Function for combining list elements into various length strings

def find_ngrams(input_list, n):
    """Join every run of ``n`` consecutive items of ``input_list`` into a string.

    Args:
        input_list (list): Sequence of strings (here: line-initial letters).
        n (int): Window length.

    Returns:
        list: One joined string per window position, in order. Empty when
        ``n`` is not positive or is larger than ``len(input_list)``.
    """
    if n <= 0:
        # No window to slide: nothing is produced.
        return []
    last_start = len(input_list) - n
    return ["".join(input_list[start:start + n])
            for start in range(last_start + 1)]


In [None]:
# Combine initial letters into 'words' of every length from min_len to max_len

min_len = 4
max_len = 10

aen_initial_words = []
for length in range(min_len, max_len + 1):
    aen_initial_words.extend(find_ngrams(aen_initials, length))


In [None]:
# Keep the candidate 'words' that are also frequent Latin Library tokens,
# sorted alphabetically, then show how many there are and what they are

aen_acrostics = sorted(set(aen_initial_words).intersection(ll_min))
print(len(aen_acrostics))
print(aen_acrostics)

In [None]:
def return_acrostic_lines(text, word):
    """Return the consecutive lines of ``text`` whose initials spell ``word``.

    Only the first occurrence is returned.

    Args:
        text (str): Preprocessed text, one verse per line.
        word (str): Acrostic to look for.

    Returns:
        list: ``len(word)`` lines, starting at the first match.

    Raises:
        ValueError: If no run of line initials spells ``word``.
    """
    lines = get_lines(text)
    candidates = find_ngrams(get_initials(lines), len(word))
    start = candidates.index(word)
    return lines[start:start + len(word)]

In [None]:
# Show the Aeneid lines behind a hand-picked selection of acrostics
for word in ('audiant', 'posuit', 'auena', 'uitia', 'uidit', 'mars'):
    pprint(return_acrostic_lines(aeneid_edit, word))

## Add Ovid's Metamorphoses

In [None]:
# Get the Metamorphoses texts (file ids containing 'ovid/ovid.met')
# and preprocess them

files = latinlibrary.fileids()
met_files = [fileid for fileid in files if 'ovid/ovid.met' in fileid]
met_raw = latinlibrary.raw(met_files)
met_edit = preprocess(met_raw)

In [None]:
# Non-empty lines of the preprocessed Metamorphoses and the first character of each
met_lines = get_lines(met_edit)
met_initials = get_initials(met_lines)

In [None]:
# Combine initial letters into 'words' of every length from min_len to max_len

min_len = 4
max_len = 10

met_initial_words = []
for length in range(min_len, max_len + 1):
    met_initial_words.extend(find_ngrams(met_initials, length))

In [None]:
# Recompute the Metamorphoses lines and initials with the shared helpers
# (same values as the get_lines/get_initials cell above)
met_lines = get_lines(met_edit)
met_initials = get_initials(met_lines)

In [None]:
# Rebuild the Metamorphoses candidates with 7-letter strings only;
# this replaces whatever met_initial_words held before
met_initial_words = []
for length in range(7, 8):
    met_initial_words.extend(find_ngrams(met_initials, length))

In [None]:
# Keep the candidate 'words' that are also frequent Latin Library tokens,
# sorted alphabetically, then show how many there are and what they are

met_acrostics = sorted(set(met_initial_words).intersection(ll_min))
print(len(met_acrostics))
print(met_acrostics)

In [None]:
# Show the Metamorphoses lines behind a hand-picked selection of acrostics
for word in ('saeua', 'disce', 'enses', 'urna', 'incipe', 'ennii'):
    pprint(return_acrostic_lines(met_edit, word))

## Add Lucan's *Bellum Civile*

In [None]:
# Get the Bellum Civile texts (file ids containing 'lucan/lucan')
# and preprocess them

files = latinlibrary.fileids()
luc_files = [fileid for fileid in files if 'lucan/lucan' in fileid]
luc_raw = latinlibrary.raw(luc_files)
luc_edit = preprocess(luc_raw)

In [None]:
# Non-empty lines of the preprocessed Bellum Civile and the first character of each
luc_lines = get_lines(luc_edit)
luc_initials = get_initials(luc_lines)

In [None]:
# Combine initial letters into 'words' of every length from min_len to max_len

min_len = 4
max_len = 10

luc_initial_words = []
for length in range(min_len, max_len + 1):
    luc_initial_words.extend(find_ngrams(luc_initials, length))

In [None]:
# Rebuild the Lucan candidates with 7-letter strings only.
# NOTE(review): this replaces the 4-10 letter list built in the previous
# cell, mirroring the equivalent Metamorphoses cell -- confirm that is intended.
luc_initial_words = list()
for i in range(7,8):
    temp = find_ngrams(luc_initials, i)
    # Append to the Lucan list. The original appended to met_initial_words,
    # which left luc_initial_words empty and polluted the Metamorphoses list.
    luc_initial_words += temp

In [None]:
# Keep the candidate 'words' that are also frequent Latin Library tokens,
# sorted alphabetically, then show how many there are and what they are

luc_acrostics = sorted(set(luc_initial_words).intersection(ll_min))
print(len(luc_acrostics))
print(luc_acrostics)

## Add *Ilias Latina*

In [None]:
# Get the Ilias Latina texts (file ids containing 'ilias'), preprocess them,
# and collect the first character of every non-empty line
files = latinlibrary.fileids()
ilias_files = [fileid for fileid in files if 'ilias' in fileid]
ilias_raw = latinlibrary.raw(ilias_files)
ilias_edit = preprocess(ilias_raw)

ilias_lines = get_lines(ilias_edit)
ilias_initials = get_initials(ilias_lines)

# Combine initial letters into 'words' of every length from min_len to max_len

min_len = 4
max_len = 10

ilias_initial_words = []
for length in range(min_len, max_len + 1):
    ilias_initial_words.extend(find_ngrams(ilias_initials, length))

# Keep the candidate 'words' that are also frequent Latin Library tokens,
# sorted alphabetically, then show how many there are and what they are

ilias_acrostics = sorted(set(ilias_initial_words).intersection(ll_min))
print(len(ilias_acrostics))
print(ilias_acrostics)

In [None]:
# Print, for every acrostic found in the Ilias Latina, the lines whose
# initials spell it (first occurrence only)
for word in ilias_acrostics:
    pprint(return_acrostic_lines(ilias_edit, word))