In [1]:
# Imports

import string
import re
from collections import Counter

from pprint import pprint

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

In [2]:
# Setup CLTK tools

# NOTE(review): word_tokenizer is not referenced in the visible cells, which split on whitespace instead.
word_tokenizer = WordTokenizer('latin')
# Used by preprocess() to normalize the lowercased text (presumably j -> i and v -> u; confirm against CLTK).
replacer = JVReplacer()

## Get list of words
We can use the Latin Library to generate a list of Latin words to search for palindromes by:
- Getting the raw text of the Latin Library
- Preprocessing the text to remove numbers, punctuation, English words, etc.
- Tokenizing the text

In [3]:
# Get raw text of the Latin Library
#
# Note that the CLTK Latin Library was updated on 3/25/17
# to fix line breaks in some of the hexameter poems included
# in this experiment. Please delete and reimport the
# CLTK Latin Library corpus to follow along.

# The whole corpus is held in one value, which preprocess() treats as a single string.
ll_raw = latinlibrary.raw()

In [4]:
# Preprocess texts

def preprocess(text):
    """Normalize raw Latin Library text into lowercase words separated by whitespace.

    The steps run in this order:

    1. Expand the HTML ligature entities ``&aelig;``/``&oelig;`` (both cases).
    2. Replace NUL characters with spaces and lowercase everything.
    3. Pass the text through the module-level ``replacer`` (a ``JVReplacer``).
    4. Decode ``&lt;``/``&gt;``, then blank out ASCII punctuation and digits.
    5. Delete Latin Library navigation phrases.
    6. Collapse runs of spaces and any whitespace surrounding line breaks.

    Parameters
    ----------
    text : str
        Raw corpus text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Ligature entities are expanded before lowercasing so both cases are covered.
    for entity, expansion in (('&aelig;', 'ae'), ('&AElig;', 'AE'),
                              ('&oelig;', 'oe'), ('&OElig;', 'OE')):
        text = text.replace(entity, expansion)

    text = text.replace('\x00', ' ')

    text = text.lower()

    text = replacer.replace(text)

    # Decode the angle-bracket entities before '&' and ';' are stripped below.
    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # string.punctuation holds exactly the 32 characters the original literal
    # listed by hand; digits are blanked in the same pass.
    translator = str.maketrans({key: " " for key in string.punctuation + string.digits})
    text = text.translate(translator)

    # The phrases are matched against text that is already normalized, so the
    # patterns must allow for it: the hyphen in "neo-latin" has become a space,
    # and the replacer appears to rewrite "v" as "u" ("medieual").
    remove_list = [r'\bthe latin library\b',
                   r'\bthe classics page\b',
                   r'\bneo[ -]latin\b',
                   r'\bmedie[uv]al latin\b',
                   r'\bchristian latin\b',
                   r'\bthe miscellany\b'
                  ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = re.sub(r'[ ]+', ' ', text)  # Remove double spaces
    # Remove blank lines and trim whitespace on either side of a line break.
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text

In [5]:
# Preprocess Latin Library

# Run the cleanup defined in preprocess() over the raw corpus text.
ll_text = preprocess(ll_raw)

In [6]:
# Split the preprocessed text on whitespace; enclitic splitting etc. is not needed here
ll_tokens = ll_text.split()

# Keep only tokens that are at least 3 characters long and are not a single
# character repeated, e.g. 'aaaa'
ll_tokens = [
    word for word in ll_tokens
    if len(word) > 2 and word != word[0] * len(word)
]


## Find palindromes

In [7]:
# Function to test for palindromes

def is_palindrome(token):
    """Return True when `token` reads the same forwards and backwards.

    The comparison is exact: every character counts, including spaces and
    letter case.
    """
    backwards = token[::-1]
    return backwards == token

In [8]:
# Keep only the tokens that are palindromes (duplicates are retained for counting)

palindromes = list(filter(is_palindrome, ll_tokens))

In [9]:
# List the 10 most frequent palindromes

# Counter tallies how many times each palindrome occurs among the tokens.
c = Counter(palindromes)
print(c.most_common(10))

[('non', 166062), ('esse', 49426), ('illi', 9921), ('ibi', 7153), ('ecce', 3661), ('tot', 3444), ('sumus', 2678), ('sis', 1526), ('usu', 1471), ('tenet', 1072)]


In [10]:
# Make a list of the longest palindromes

# Reduce the palindromes to distinct forms, keeping those that occur at least
# MIN_PALINDROME_COUNT times. The threshold is 1 (keep every distinct form),
# which is what this cell has always done; raise it (e.g. to 3) to drop rare forms.
# Note: this cell overwrites `palindromes` with the distinct list, so with a
# threshold above 1 it must not be re-run without re-running the filter cell first.
MIN_PALINDROME_COUNT = 1

c = Counter(palindromes)
palindromes = [word for word, count in c.items() if count >= MIN_PALINDROME_COUNT]

In [11]:
# Order the palindromes in place from longest to shortest; ties keep their existing order
palindromes.sort(key=lambda word: -len(word))
print(len(palindromes))
print(palindromes[:10])

279
['massinissam', 'simillimis', 'sumeremus', 'sarabaras', 'muratarum', 'siluulis', 'aballaba', 'suillius', 'sumamus', 'sumimus']


In [12]:
# Print the complete list of distinct palindromes
print(palindromes)

['massinissam', 'simillimis', 'sumeremus', 'sarabaras', 'muratarum', 'siluulis', 'aballaba', 'suillius', 'sumamus', 'sumimus', 'taedeat', 'apocopa', 'murorum', 'senones', 'tereret', 'mutatum', 'matutam', 'rotator', 'ccciccc', 'sumemus', 'erexere', 'eregere', 'erepere', 'merorem', 'nomimon', 'madidam', 'sububus', 'tingnit', 'sinonis', 'siccis', 'messem', 'massam', 'terret', 'succus', 'summus', 'marram', 'saccas', 'soccos', 'mammam', 'selles', 'iessei', 'mannam', 'mappam', 'tinnit', 'iussui', 'mullum', 'maiiam', 'murrum', 'senes', 'sumus', 'solos', 'sitis', 'sedes', 'malam', 'tenet', 'inani', 'seges', 'tepet', 'murum', 'etate', 'mutum', 'sonos', 'teret', 'temet', 'sucus', 'siuis', 'mulum', 'cxxxc', 'refer', 'seres', 'aenea', 'sudus', 'aerea', 'anona', 'silis', 'sinis', 'sanas', 'satas', 'reuer', 'soros', 'ogygo', 'sefes', 'siris', 'simis', 'sepes', 'eabae', 'iadai', 'ianai', 'atita', 'aziza', 'adeda', 'teget', 'neuen', 'taxat', 'seces', 'sagas', 'rogor', 'egoge', 'sicis', 'ccicc', 'subus

### Multiple words

In [15]:
# Function for combining list elements into various length strings

def find_ngrams(input_list, n, separator=" "):
    """Join every run of `n` consecutive items of `input_list` into one string.

    Parameters
    ----------
    input_list : sequence of str
        Items to combine, in order.
    n : int
        Number of consecutive items per combination.
    separator : str, optional
        Text placed between the items of each combination (default: one space).

    Returns
    -------
    list of str
        One string per window, in order of starting position. Empty when
        `n` is not positive or exceeds the number of items.
    """
    if n <= 0:
        return []
    window_count = len(input_list) - n + 1
    return [
        separator.join(input_list[start:start + n])
        for start in range(window_count)
    ]

In [23]:
# Search runs of 3 consecutive tokens for palindromes (the joining spaces count as characters)
test = find_ngrams(ll_tokens, 3)
palinwords = list(filter(is_palindrome, test))
print(Counter(palinwords))

Counter({'esse non esse': 13, 'non esse non': 11, 'non sedes non': 2, 'non sis non': 1, 'sumus non sumus': 1, 'non sumus non': 1, 'ccciccc ccciccc ccciccc': 1, 'cic cic cic': 1, 'non tenet non': 1, 'aut non tua': 1, 'ibi esse ibi': 1, 'esse ibi esse': 1})


In [24]:
# Search runs of 4 consecutive tokens for palindromes (the joining spaces count as characters)
test = find_ngrams(ll_tokens, 4)
palinwords = list(filter(is_palindrome, test))
print(Counter(palinwords))

Counter()


In [25]:
# Search runs of 5 consecutive tokens for palindromes (the joining spaces count as characters)
test = find_ngrams(ll_tokens, 5)
palinwords = list(filter(is_palindrome, test))
print(Counter(palinwords))

Counter()


In [26]:
# Search runs of 6 consecutive tokens for palindromes (the joining spaces count as characters)
test = find_ngrams(ll_tokens, 6)
palinwords = list(filter(is_palindrome, test))
print(Counter(palinwords))

Counter()


In [27]:
# Search runs of 7 consecutive tokens for palindromes (the joining spaces count as characters)
test = find_ngrams(ll_tokens, 7)
palinwords = list(filter(is_palindrome, test))
print(Counter(palinwords))

Counter()
