In [1]:
# Imports

import string
import re
from collections import Counter

from pprint import pprint

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

In [2]:
# Instantiate the CLTK helpers once so later cells can share them

replacer = JVReplacer()  # called from preprocess() to normalise the corpus text
word_tokenizer = WordTokenizer('latin')

## Get list of words
We can use the Latin Library to generate a list of possible Latin words to match acrostics against by:
- Getting the raw text of the Latin Library
- Preprocessing the text to remove numbers, punctuation, English words, etc.
- Tokenizing the text

In [3]:
# Get raw text of the Latin Library
#
# Note: the CLTK Latin Library corpus was updated on 3/25/17 (25 March 2017)
# to fix line breaks in some of the hexameter poems included in this
# experiment. If your local copy predates that update, delete it and
# reimport the CLTK Latin Library corpus to follow along.

# The result is handed to preprocess() below, which treats it as one string.
ll_raw = latinlibrary.raw()

In [79]:
# Preprocess texts

def preprocess(text):
    """Normalise raw Latin Library text into lowercase, space-separated words.

    Steps, in order:

    1. Expand the ``&aelig;``/``&oelig;`` ligature entities (both cases).
    2. Replace NUL characters with spaces and lowercase the text.
    3. Delete Latin Library navigation boilerplate ("the latin library",
       "neo-latin", "medieval latin", ...).
    4. Apply the module-level ``replacer`` (a CLTK ``JVReplacer``).
    5. Decode ``&lt;``/``&gt;``, then replace every ASCII punctuation
       character and digit with a space.
    6. Collapse runs of spaces and squeeze whitespace around line breaks.

    Parameters
    ----------
    text : str
        Raw corpus text.

    Returns
    -------
    str
        The cleaned text; lines are separated by single newlines.
    """
    # The ligature entities are case-sensitive, so expand them before lowercasing.
    for entity, expansion in (('&aelig;', 'ae'), ('&AElig;', 'AE'),
                              ('&oelig;', 'oe'), ('&OElig;', 'OE')):
        text = text.replace(entity, expansion)

    text = text.replace('\x00', ' ')
    text = text.lower()

    # Strip the boilerplate *before* j/v normalisation and punctuation
    # removal. Done afterwards, two of these patterns could never match:
    # the hyphen in "neo-latin" had already become a space, and the replacer
    # had already rewritten "medieval" as "medieual".
    boilerplate_patterns = [r'\bthe latin library\b',
                            r'\bthe classics page\b',
                            r'\bneo-latin\b',
                            r'\bmedieval latin\b',
                            r'\bchristian latin\b',
                            r'\bthe miscellany\b',
                            ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, '', text)

    text = replacer.replace(text)

    # Decode these two entities so that stripping punctuation below does not
    # leave stray "lt"/"gt" tokens behind.
    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # string.punctuation is the same 32-character set the original literal
    # spelled out by hand (with invalid "\]" escape); digits share the table
    # so the text is translated in a single pass.
    strip_table = str.maketrans(
        {char: ' ' for char in string.punctuation + string.digits}
    )
    text = text.translate(strip_table)

    text = re.sub(r' +', ' ', text)  # Remove double spaces
    # Trim whitespace around line breaks and drop blank lines. The previous
    # pattern required whitespace on *both* sides of a newline, so a plain
    # "\n\n" was left untouched.
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text

In [80]:
# Preprocess Latin Library

# Run the whole raw corpus through preprocess() in one call; the cleaned
# text is split into tokens in the next cell.
ll_text = preprocess(ll_raw)

In [114]:
# Split the cleaned text on whitespace (enclitic splitting etc. is not needed
# here) and keep a token only if it
#   - is at least 3 characters long, and
#   - uses more than one distinct character (drops runs such as 'aaaa').
ll_tokens = [
    word
    for word in ll_text.split()
    if len(word) >= 3 and len(set(word)) > 1
]


## Find palindromes

In [122]:
# Palindrome predicate

def is_palindrome(token):
    """Return True when ``token`` reads the same forwards and backwards.

    The first half of the sequence is compared with the second half read
    back to front; the middle element of an odd-length token needs no check.
    Empty and single-element tokens count as palindromes.
    """
    half = len(token) // 2
    leading = token[:half]
    trailing_reversed = token[:-half - 1:-1]  # last `half` items, reversed
    return leading == trailing_reversed

In [123]:
# Keep every token occurrence that passes the palindrome test
# (repeats are retained so they can be counted afterwards)

palindromes = list(filter(is_palindrome, ll_tokens))

In [124]:
# List the 10 most frequent palindromes

# `palindromes` holds one entry per occurrence in the token list, so the
# Counter values are occurrence counts.
c = Counter(palindromes)
print(c.most_common(10))

[('non', 166078), ('esse', 49426), ('illi', 9922), ('ibi', 7155), ('ecce', 3662), ('tot', 3443), ('sumus', 2678), ('sis', 1526), ('usu', 1472), ('tenet', 1072)]


In [129]:
# Make a list of the longest palindromes

# Minimum number of occurrences a palindrome needs in order to be kept.
# NOTE(review): this cell used to be commented "keep only tokens that appear
# at least 3 times", but the code applied no threshold at all. The default of
# 1 reproduces that behaviour; set it to 3 to apply the filter the comment
# described.
MIN_OCCURRENCES = 1

# Count straight from ll_tokens rather than from `palindromes`: this cell
# overwrites `palindromes` with the unique words, so counting that list when
# the cell is re-run would give every word a count of 1.
c = Counter(token for token in ll_tokens if is_palindrome(token))
palindromes = [word for word, count in c.items() if count >= MIN_OCCURRENCES]

In [130]:
# Longest first; words of equal length keep their existing relative order
palindromes = sorted(palindromes, key=len, reverse=True)

# Report how many there are, then show the ten longest
print(len(palindromes), palindromes[:10], sep='\n')

280
['massinissam', 'simillimis', 'sarabaras', 'muratarum', 'sumeremus', 'aballaba', 'suillius', 'siluulis', 'rotator', 'senones']


In [131]:
# Print the complete palindrome list in its current order
print(palindromes)

['massinissam', 'simillimis', 'sarabaras', 'muratarum', 'sumeremus', 'aballaba', 'suillius', 'siluulis', 'rotator', 'senones', 'apocopa', 'murorum', 'erepere', 'tereret', 'erexere', 'sumemus', 'sububus', 'matutam', 'mutatum', 'nomimon', 'madidam', 'sumimus', 'tingnit', 'sinonis', 'eregere', 'ccciccc', 'merorem', 'taedeat', 'sumamus', 'messem', 'mannam', 'mammam', 'mappam', 'terret', 'maiiam', 'iussui', 'marram', 'massam', 'siccis', 'iessei', 'soccos', 'saccas', 'summus', 'mullum', 'tinnit', 'murrum', 'succus', 'selles', 'reuer', 'teget', 'seres', 'ababa', 'sinis', 'atita', 'anona', 'ianai', 'acuca', 'tedet', 'eabae', 'sepes', 'siris', 'sonos', 'obibo', 'rotor', 'etate', 'neuen', 'maiam', 'ccicc', 'neten', 'subus', 'cxxxc', '\uf8ffnon\uf8ff', 'satas', 'tabat', 'aenea', 'taxat', 'inoni', 'aitia', 'mulum', 'egage', 'soros', 'tepet', 'iadai', 'simis', 'rogor', 'eumue', 'temet', 'minim', 'seces', 'illli', 'mutum', 'saxas', 'siuis', 'sicis', 'susus', 'sucus', 'surus', 'sudus', 'tioit', 'sefe