In [1]:
# Imports

import os
import string
import re
from collections import Counter

from pprint import pprint

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

Arabic not supported. Install `pyarabic` library to tokenize Arabic.


In [2]:
# Setup CLTK tools

word_tokenizer = WordTokenizer('latin')
replacer = JVReplacer()

## Get list of words
We can use the Latin Library to generate a list of possible Latin words to match acrostics against by:
- Getting the raw text of the Latin Library
- Preproccessing the text to remove numbers, punctuation, English words, etc.
- Tokenizing the text
- Making a set of the tokens. For this experiment, I am going to limit the tokens to those that appear at least 25 times in the Latin Library; this should account for relatively rare words as well as things like typos.

In [3]:
# Get raw text of the Latin Library
#
# Note that the CLTK Latin Library was updated on 3/25/17
# to fix line breaks in some of the hexameter poems included
# in this experiment. Please delete and reimport the
# CLTK Latin Library corpus to follow along.

ll_raw = latinlibrary.raw()

In [4]:
# Preprocess texts
def preprocess(text):    

    text = re.sub(r'&aelig;','ae',text)
    text = re.sub(r'&AElig;','AE',text)
    text = re.sub(r'&oelig;','oe',text)
    text = re.sub(r'&OElig;','OE',text)
    
    text = text.lower()
    
    text = replacer.replace(text)
    
    text= re.sub(r'&lt;','<',text)
    text= re.sub(r'&gt;','>',text)    
    
    punctuation ="\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)
    
    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)

    remove_list = [r'\bthe latin library\b',
                   r'\bthe classics page\b',
                   r'\bneo-latin\b', 
                   r'\bmedieval latin\b',
                   r'\bchristian latin\b',
                   r'\bthe miscellany\b'
                  ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)
    
    text = re.sub('[ ]+',' ', text) # Remove double spaces
    text = re.sub('\s+\n+\s+','\n', text) # Remove double lines and trim spaces around new lines
    
    return text

In [5]:
# Preprocess Latin Library

ll_text = preprocess(ll_raw)

In [6]:
# Tokenize the preprocessed text with the CLTK Latin Word Tokenizer

ll_tokens = word_tokenizer.tokenize(ll_text)

In [7]:
# Make a set of the tokens

c = Counter(ll_tokens)
ll_min = [k for k, c in c.items() if c > 25]

## Make list of possible acrostics in the Aeneid
We can also use the Latin Library to generate a list of possible acrostics ine the Aeneid by:
- Getting the text of the Aeneid from the Latin Library
- Preproccessing
- Getting a list of initial letters
- Combining the initial letters into 'words'
- Getting the intersection of the set of these 'words' with the Latin Library tokens. For this experiment, I am going to make set a range that acrostics need to be at least 4 letters long and we'll look for matches up to 10 letters long.


In [8]:
# Get the Aeneid texts

files = latinlibrary.fileids()
aeneid_files = [file for file in files if 'vergil/aen' in file]
aeneid_raw = latinlibrary.raw(aeneid_files)

In [9]:
# Preprocess the Aeneid texts

aeneid_edit = preprocess(aeneid_raw)

In [10]:
# Get a list of initial letters

def get_lines(text):
    lines = text.split('\n')
    lines = [line for line in lines if line] # Test for blank lines
    return lines

def get_initials(lines):
    return [line[0] for line in lines]


In [11]:
aen_lines = get_lines(aeneid_edit)
aen_initials = get_initials(aen_lines)

In [12]:
# Function for combining list elements into various length strings

def find_ngrams(input_list, n):
    temp = list(zip(*[input_list[i:] for i in range(n)]))
    ngrams = ["".join(t) for t in temp]
    return ngrams


In [13]:
# Combine initial letters into 'words'

min_len = 4
max_len = 10

aen_initial_words = list()
for i in range(min_len,max_len + 1):
    temp = find_ngrams(aen_initials, i)
    aen_initial_words += temp


In [14]:
# Get intersection of words with LL tokens

aen_acrostics = list(set(aen_initial_words) & set(ll_min))
aen_acrostics.sort()
print(len(aen_acrostics))
print(aen_acrostics)

212
['abeo', 'acsi', 'acta', 'actu', 'adae', 'addi', 'ades', 'aera', 'aeri', 'alii', 'alta', 'altas', 'alui', 'aluo', 'amet', 'anna', 'anti', 'anum', 'anus', 'apes', 'apis', 'appi', 'apto', 'arae', 'aruo', 'asse', 'assem', 'audi', 'audiant', 'auena', 'auidi', 'aula', 'aure', 'aures', 'ausa', 'ausu', 'ausum', 'beta', 'bini', 'cado', 'call', 'cane', 'cani', 'capp', 'casu', 'caua', 'cauas', 'caue', 'caui', 'cede', 'cedi', 'cena', 'cepi', 'cera', 'cham', 'char', 'citi', 'ciue', 'ciui', 'clit', 'conc', 'creo', 'dant', 'data', 'datas', 'deae', 'deas', 'dici', 'diei', 'dira', 'diro', 'diti', 'diui', 'dius', 'dona', 'donas', 'duce', 'duces', 'eant', 'eius', 'emat', 'emath', 'empto', 'enos', 'epit', 'equa', 'equae', 'erat', 'eris', 'erus', 'esca', 'escas', 'esti', 'etas', 'etsi', 'fata', 'fati', 'feta', 'fiat', 'fida', 'fila', 'filo', 'fimo', 'fine', 'fouit', 'haud', 'heia', 'hiis', 'huiu', 'indi', 'ioui', 'iouis', 'isai', 'iste', 'isti', 'istic', 'iuda', 'iussa', 'iussae', 'lata', 'latum', 'le

In [15]:
def return_acrostic_lines(text, word):
    lines = get_lines(text)
    initials = get_initials(lines)
    grams = find_ngrams(initials, len(word))
    return lines[grams.index(word):grams.index(word)+len(word)]

In [16]:
pprint(return_acrostic_lines(aeneid_edit, 'audiant'))
pprint(return_acrostic_lines(aeneid_edit, 'posuit'))
pprint(return_acrostic_lines(aeneid_edit, 'auena'))
pprint(return_acrostic_lines(aeneid_edit, 'uitia'))
pprint(return_acrostic_lines(aeneid_edit, 'uidit'))
pprint(return_acrostic_lines(aeneid_edit, 'mars'))

['anchemolum thalamos ausum incestare nouercae ',
 'uos etiam gemini rutulis cecidistis in aruis ',
 'daucia laride thymberque simillima proles ',
 'indiscreta suis gratusque parentibus error ',
 'at nunc dura dedit uobis discrimina pallas ',
 'nam tibi thymbre caput euandrius abstulit ensis ',
 'te decisa suum laride dextera quaerit ']
['praecipitate moras nec plura effatus at illi ',
 'ocius incubuere omnes pariterque laborem ',
 'sortiti fluit aes riuis aurique metallum ',
 'uulnificusque chalybs uasta fornace liquescit ',
 'ingentem clipeum informant unum omnia contra ',
 'tela latinorum septenosque orbibus orbis ']
['at medias inter caedes exsultat amazon ',
 'unum exserta latus pugnae pharetrata camilla ',
 'et nunc lenta manu spargens hastilia denset ',
 'nunc ualidam dextra rapit indefessa bipennem ',
 'aureus ex umero sonat arcus et arma dianae ']
['uulnificusque chalybs uasta fornace liquescit ',
 'ingentem clipeum informant unum omnia contra ',
 'tela latinorum septenosque o

## Add Ovid's Metamorphoses

In [17]:
# Get the Metamorphoses texts

files = latinlibrary.fileids()
met_files = [file for file in files if 'ovid/ovid.met' in file]
met_raw = latinlibrary.raw(met_files)
met_edit = preprocess(met_raw)

In [18]:
met_lines = get_lines(met_edit)
met_initials = get_initials(met_lines)

In [19]:
# Combine initial letters into 'words'

min_len = 4
max_len = 10

met_initial_words = list()
for i in range(min_len,max_len + 1):
    temp = find_ngrams(met_initials, i)
    met_initial_words += temp

In [20]:
# Get intersection of words with LL tokens

met_acrostics = list(set(met_initial_words) & set(ll_min))
met_acrostics.sort()
print(len(met_acrostics))
print(met_acrostics)

242
['abis', 'acci', 'acie', 'acsi', 'acta', 'acus', 'adae', 'adde', 'addi', 'adici', 'aeri', 'aeui', 'alio', 'amat', 'amni', 'anne', 'anni', 'apes', 'apis', 'appi', 'appio', 'apta', 'apud', 'area', 'aser', 'astu', 'auis', 'aure', 'ausi', 'ausis', 'auso', 'ausu', 'auus', 'basa', 'caes', 'call', 'calpe', 'cane', 'canes', 'capi', 'capp', 'caue', 'cauet', 'cccc', 'cepi', 'ciii', 'cita', 'cito', 'ciui', 'cott', 'cotta', 'dant', 'dati', 'deam', 'dedi', 'deis', 'delta', 'deus', 'dices', 'dici', 'diei', 'dies', 'dira', 'diri', 'disce', 'ditem', 'diti', 'dius', 'domu', 'duci', 'edat', 'edit', 'editi', 'eheu', 'eius', 'emit', 'eniti', 'enni', 'ennii', 'ense', 'enses', 'ente', 'eoas', 'equi', 'eram', 'erit', 'esca', 'esse', 'esti', 'etate', 'etsi', 'fiant', 'fida', 'fimo', 'fine', 'finees', 'fons', 'fusi', 'hanc', 'hiis', 'hiram', 'iaces', 'icta', 'icto', 'ictu', 'idea', 'idem', 'iesu', 'iiii', 'imam', 'imus', 'incipe', 'ipse', 'ipsi', 'irae', 'iram', 'iras', 'isai', 'isis', 'iste', 'isti', 'ist

In [21]:
pprint(return_acrostic_lines(met_edit, 'saeua'))
pprint(return_acrostic_lines(met_edit, 'disce'))
pprint(return_acrostic_lines(met_edit, 'enses'))
pprint(return_acrostic_lines(met_edit, 'urna'))
pprint(return_acrostic_lines(met_edit, 'incipe'))
pprint(return_acrostic_lines(met_edit, 'ennii'))

['sed uenulus turni postquam mandata peregit ',
 'auxiliumque petit uires aetolius heros ',
 'excusat nec se aut soceri committere pugnae ',
 'uelle sui populos aut quos e gente suorum ',
 'armet habere ullos neue haec commenta putetis ']
['dissimilem populo promittit origine mira',
 'iamque erat in totas sparsurus fulmina terras \r',
 'sed timuit ne forte sacer tot ab ignibus aether \r',
 'conciperet flammas longusque ardesceret axis \r',
 'esse quoque in fatis reminiscitur adfore tempus \r']
['et dedit amplexus iniustaque iusta peregit ',
 'non tulit in cineres labi sua phoebus eosdem ',
 'semina sed natum flammis uteroque parentis ',
 'eripuit geminique tulit chironis in antrum ',
 'sperantemque sibi non falsae praemia linguae ']
['uana diu uisa est uox auguris exitus illam ',
 'resque probat letique genus nouitasque furoris ',
 'namque ter ad quinos unum cephisius annum ',
 'addiderat poteratque puer iuuenisque uideri ']
['it tamen et tenebrae minuunt noxque atra pudorem ',
 'nutri

## Add Lucan's *Bellum Civile*

In [22]:
# Get the Metamorphoses texts

files = latinlibrary.fileids()
luc_files = [file for file in files if 'lucan/lucan' in file]
luc_raw = latinlibrary.raw(luc_files)
luc_edit = preprocess(luc_raw)

In [23]:
luc_lines = get_lines(luc_edit)
luc_initials = get_initials(luc_lines)

In [24]:
# Combine initial letters into 'words'

min_len = 4
max_len = 10

luc_initial_words = list()
for i in range(min_len,max_len + 1):
    temp = find_ngrams(luc_initials, i)
    luc_initial_words += temp

In [25]:
# Get intersection of words with LL tokens

luc_acrostics = list(set(luc_initial_words) & set(ll_min))
luc_acrostics.sort()
print(len(luc_acrostics))
print(luc_acrostics)

142
['abia', 'acci', 'acta', 'acuta', 'aede', 'aetna', 'alas', 'amni', 'amnis', 'anni', 'anno', 'anus', 'apes', 'aput', 'arca', 'aser', 'asse', 'ater', 'auia', 'auiae', 'auiam', 'auis', 'auita', 'auli', 'auus', 'basi', 'cape', 'capp', 'casa', 'casu', 'caui', 'cauis', 'caus', 'chai', 'cient', 'ciui', 'ciuis', 'dato', 'deas', 'dena', 'diac', 'diui', 'ecce', 'esca', 'escas', 'esse', 'este', 'esti', 'etas', 'eundi', 'faui', 'fida', 'habui', 'haec', 'heus', 'hunc', 'icti', 'icto', 'iesse', 'imus', 'ioas', 'ipse', 'ipsi', 'ipso', 'ipsos', 'isis', 'isse', 'ista', 'istam', 'istas', 'isti', 'iugi', 'iuni', 'iunio', 'iuno', 'lota', 'meas', 'meat', 'mihi', 'naui', 'neci', 'nedi', 'neue', 'nisi', 'omni', 'opes', 'osee', 'ossa', 'pari', 'piis', 'pios', 'pius', 'pura', 'pure', 'quae', 'quam', 'rude', 'salo', 'sani', 'sanus', 'sata', 'sede', 'sedi', 'semis', 'sena', 'sene', 'sepe', 'sese', 'sies', 'sina', 'sinu', 'situ', 'siue', 'soni', 'sonis', 'spem', 'stas', 'stet', 'suas', 'suci', 'suem', 'tace',

In [26]:
pprint(return_acrostic_lines(luc_edit, 'ossa'))

['orbita migrantis scindit maeotida bessi ',
 'saeua quies pelagi maestoque ignaua profundo',
 'stagna iacentis aquae ueluti deserta regente',
 'aequora natura cessant pontusque uetustas']


## Add *Ilias Latina*

In [27]:
files = latinlibrary.fileids()
ilias_files = [file for file in files if 'ilias' in file]
ilias_raw = latinlibrary.raw(ilias_files)
ilias_edit = preprocess(ilias_raw)

ilias_lines = get_lines(ilias_edit)
ilias_initials = get_initials(ilias_lines)

# Combine initial letters into 'words'

min_len = 4
max_len = 10

ilias_initial_words = list()
for i in range(min_len,max_len + 1):
    temp = find_ngrams(ilias_initials, i)
    ilias_initial_words += temp

# Get intersection of words with LL tokens

ilias_acrostics = list(set(ilias_initial_words) & set(ll_min))
ilias_acrostics.sort()
print(len(ilias_acrostics))
print(ilias_acrostics)

24
['apta', 'arce', 'caeci', 'casa', 'dici', 'feci', 'fici', 'huic', 'init', 'ipsi', 'itali', 'niuis', 'piae', 'pios', 'scripsi', 'scripsit', 'seni', 'senio', 'siue', 'suci', 'sunt', 'tali', 'uani', 'uite']


In [28]:
for word in ilias_acrostics:
    pprint(return_acrostic_lines(ilias_edit, word))

['acrius insurgunt troes ad achaica bella',
 'pulsa metu uallumque et muros aggere saeptos ',
 'transiliunt alii fossas uoluuntur in ipsas ',
 'aduolat interea danaum metus impiger hector ']
['alterius tenebrae tarde labentibus astris ',
 'restabatque super tacitae pars tertia noctis ',
 'cum danaum iussu castris aetolius heros ',
 'egreditur sociumque sibi delegit ulixem ']
['confugiunt portasque obiecto robore firmant ',
 'at phryges obsidunt inclusos aggere graios ',
 'excubituque premunt muros flammisque coronant ',
 'cetera per campos sternunt sua corpora pubes ',
 'indulgentque mero curas que animosque resoluunt ']
['cum paris exitium troiae funestaque flamma ',
 'armatum aduerso menelaum ex agmine cernit ',
 'seque uelut uiso perterritus angue recepit ',
 'ad socios amens quem postquam turpiter hector ']
['dixit et has repetit per quas modo uenerat auras ',
 'interea lucem terris dedit ignea lampas ',
 'conuocat attonitus iussis pelopeius heros ',
 'in coetum proceres remque omn

## Add VF's *Argonautica*

In [29]:
files = latinlibrary.fileids()
vf_files = [file for file in files if 'valeriusflaccus' in file]
vf_raw = latinlibrary.raw(vf_files)
vf_edit = preprocess(vf_raw)

vf_lines = get_lines(vf_edit)
vf_initials = get_initials(vf_lines)

# Combine initial letters into 'words'

min_len = 4
max_len = 10

vf_initial_words = list()
for i in range(min_len,max_len + 1):
    temp = find_ngrams(vf_initials, i)
    vf_initial_words += temp

# Get intersection of words with LL tokens

vf_acrostics = list(set(vf_initial_words) & set(ll_min))
vf_acrostics.sort()
print(len(vf_acrostics))
print(vf_acrostics)

116
['acie', 'acsi', 'acus', 'adde', 'alae', 'alit', 'alite', 'alla', 'alma', 'anni', 'apta', 'aqua', 'aquai', 'asina', 'asse', 'asses', 'assis', 'aulo', 'baco', 'cape', 'capi', 'capp', 'casa', 'caua', 'celi', 'cens', 'cera', 'citi', 'coit', 'coss', 'crus', 'cura', 'dant', 'data', 'dein', 'diac', 'dicti', 'diem', 'disp', 'eois', 'esse', 'fias', 'fici', 'hiis', 'hinc', 'hisp', 'iani', 'icti', 'iesu', 'iesus', 'iiii', 'indi', 'ipsa', 'ipse', 'isis', 'item', 'iure', 'laba', 'lite', 'litt', 'luci', 'lucia', 'meat', 'misi', 'mite', 'nati', 'natu', 'natus', 'neae', 'neas', 'nedum', 'niue', 'niuea', 'odia', 'opes', 'opis', 'otio', 'pace', 'paci', 'pares', 'passi', 'passis', 'peti', 'pici', 'piis', 'pios', 'pira', 'prata', 'quae', 'quin', 'rata', 'rudi', 'saetis', 'scio', 'seii', 'sina', 'sing', 'siti', 'sset', 'stas', 'stet', 'suis', 'tene', 'tito', 'tuas', 'tute', 'uana', 'ueri', 'uesci', 'ueto', 'uice', 'uiii', 'uisa', 'uisae', 'uiue', 'unam']


In [30]:
for word in vf_acrostics:
    pprint(return_acrostic_lines(vf_edit, word))

['arripit et longa styrus prospectat ab unda ',
 'coniugio atque iterum sponsae flammatus amore ',
 'iamque alii clipeos et tela trabalia dextris ',
 'expediunt armant alii picis unguine flammas ']
['antraque deprensae tremuerunt conscia culpae ',
 'cum trepida inachiae paelex subit ora iuuencae ',
 'sponte dei plausu fouet hanc et pectora mulcet ',
 'iuno renidenti cohibens suspiria uultu ']
['aeetae sociare manus scio perfida regis ',
 'corda quidem nullos minyis exsoluet honores ',
 'uerum alios tunc ipsa dolos alia orsa mouebo',
 'sint precor haec tua namque mihi comitanda potestas ']
['aeoliae nec opina domus sat magna laborum ',
 'dona fero satis hoc uisu quaecumque rependo ',
 'dixit et in sueten magnique in fata ceramni ',
 'emicuit clipeumque rotans hunc poplite caeso ']
['ausus et inducto cratem defendere tergo ',
 'laeta mari tum signa refert plenasque mouebant ',
 'armentis nuribusque rates et barbara uestis ',
 'et torques insigne loci sonat aequore clamor']
['arma umeris 

 'et taxi frons hirta comis ipse aeger anhelans ']
['foedera et horrenda trepidam sub uirgine puppem ',
 'impia monstriferis surgunt iam proelia campis ',
 'ante dolos ante infidi tamen exsequar astus ',
 'soligenae falli meriti meritique relinqui ']
['frigidus et uiso pallescit flamma ueneno ',
 'inicit aesonides dextram atque ardentia uincit ',
 'cornua dein totis propendens uiribus haeret ',
 'ille uirum atque ipsam tunc te medea recusans ']
['hortator postquam furiis et uoce nefanda ',
 'impulit oenides uerum cum gente domoque ',
 'ista luet saeuaeque aderunt tua numina matri ',
 'surge age et in duris haud umquam defice caelo ']
['hic et iaxarten dictis stupet hospes acerbis ',
 'immodicum linguaque grauem cui nulla minanti ',
 'non superum non praesentis reuerentia belli ',
 'contra autem aeetes non frustra magna superbo ']
['hos stimulant magnaque ratem per lustra uiasque ',
 'iussi laude canunt manifesto in lumine fauni',
 'siluarumque deae atque elatis cornibus amnes',
 'proti

['qui metus usque nouos diuae melioris ad ignes ',
 'urbe sedent laeti minyae uiduisque uacantes ',
 'indulgent thalamis nimbosque educere luxu ',
 'nec iam uelle uias zephyrosque audire uocantes ']
['raptor et aegaei super effugit alta profundi',
 'accipit augurium aesonides laetusque superbi ',
 'tecta petit peliae prior huic tum regia proles ',
 'aduolat amplexus fraternaque pectora iungens ']
['respexitque fores et adhuc inuenit euntem ',
 'uisus et heu miserae tunc pulchrior hospes amanti ',
 'discedens tales umeros ea terga relinquit ',
 'illa domum atque ipsos paulum procedere postes ']
['stridentesque iuuant aurae phrixea subibant ',
 'aequora et angustas quondam sine nomine fauces ',
 'ecce autem prima uolucrem sub luce dehiscens ',
 'terruit unda ratem uittataque constitit helle ',
 'iam panopes thetidisque soror iamque aurea laeua ',
 'sceptra tenens dum sternit aquas proceresque ducemque ']
['sic ratus ille autem celeri rapit ora sinistra ',
 'conclamant socii et subitas da

In [31]:
pprint(return_acrostic_lines(vf_edit, 'niuea'))

['natorumque locis uacuaeque in moenibus urbis ',
 'iura nouant donant solio sceptrisque paternis ',
 'ut meritam redeunt que piae sua praemia menti',
 'ecce procul ualidis lemnon tendentia remis ',
 'arma notant rapitur subito regina tumultu ']


![List of eight-letter acrostics in Latin poetry from Hilberg 1899.](img/hilberg_1.png)
*List of eight-letters acrostics in Latin poetry from Hilberg 1899.* [Link](https://books.google.com/books?id=gIz8i-tRzG0C&pg=RA1-PA264#v=onepage&q&f=false)

In [32]:
"laniabor" in ll_min

False