In [None]:
# Imports

from cltkreaders.lat import LatinTesseraeCorpusReader
from natsort import natsorted
import pickle
from pprint import pprint

In [None]:
# Load the Latin Tesserae corpus and get its word stream
T = LatinTesseraeCorpusReader()
words = T.words()

# Show the first item; note that next() advances `words` by one
print(next(words))

In [None]:
# Helper function for preprocessing
def preprocess(text, lower=True, normalize=True, punctuation=False,
               numbers=False, unhyphenate=False, remove_lines=False,
               remove_spaces=False, entities=False, diacriticals=True, fill=' '):
    """Clean a string for word-level comparison.

    The steps run in the order listed; each is switched by its flag.

    Args:
        text: The string to clean.
        lower: Lowercase the text.
        normalize: Run the text through CLTK's ``JVReplacer``
            (j/v spelling normalization, per the CLTK documentation).
        punctuation: Keep punctuation; when False every punctuation mark
            is replaced by ``fill``.
        numbers: Keep ASCII digits; when False each digit is replaced by
            ``fill``.
        unhyphenate: Rejoin words broken across a line end by ``-``, ``»``
            or ``—``.
        remove_lines: Replace newlines with single spaces.
        remove_spaces: Split on whitespace and rejoin the tokens with
            ``fill``.
        entities: Keep HTML entities; when False they are unescaped.
        diacriticals: Keep combining marks; when False they are stripped.
        fill: Replacement string for removed punctuation and digits, and
            the token separator used by ``remove_spaces``.

    Returns:
        The cleaned text, NFC-normalized, with runs of spaces collapsed
        and leading/trailing whitespace stripped.
    """
    import html
    import re
    import unicodedata

    if not entities:
        text = html.unescape(text)

    if unhyphenate:
        text = re.sub(r'[-»—]\s?\n', '', text, flags=re.MULTILINE)

    if lower:
        text = text.lower()

    if normalize:
        # Imported only when needed so that normalize=False does not
        # require CLTK to be importable.
        from cltk.alphabet.lat import JVReplacer
        text = JVReplacer().replace(text)

    if not punctuation:
        # Kept in a separate local so the `punctuation` flag is not rebound.
        # The backslash is written as an explicit escape (the set itself is
        # unchanged: it has always contained a literal backslash).
        marks = "\"#$%&'()*+,/:;<=>@[\\]^_`{|}~.?!«»—“-”"
        marks += '¡£¤¥¦§¨©¯°±²³´µ¶·¸¹º¼½¾¿÷·–‘’†•ↄ∞⏑〈〉（）'
        text = text.translate(str.maketrans({mark: fill for mark in marks}))

    if not numbers:
        text = text.translate(str.maketrans({digit: fill for digit in '0123456789'}))

    if remove_lines:
        text = " ".join(text.split('\n'))

    if remove_spaces:
        text = fill.join(text.split())

    if not diacriticals:
        # Decompose, then drop combining marks. Filtering the characters
        # actually present avoids building a table over every code point
        # (which previously also needed the never-imported `sys`).
        decomposed = unicodedata.normalize('NFD', text)
        text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Fix spacing
    text = re.sub(' +', ' ', text)

    text = unicodedata.normalize('NFC', text)

    return text.strip()

In [None]:
# Get word list

# The word list is read from a pickle cache. The commented-out recipe below
# rebuilds that cache from the corpus; uncomment it to regenerate.

# preprocessed_texts = [preprocess(text) for text in T.texts()]
# words_texts = [text.split() for text in preprocessed_texts]
# wordlist = [item for subl in words_texts for item in subl]
# wordlist = set(wordlist)
# wordlist = sorted(wordlist)

# with open('../data/verba.txt', 'w') as f:
#     for word in wordlist:
#         f.write(f'{word}\n')

# with open('../data/verba.pickle', 'wb') as f:
#     pickle.dump(wordlist, f)

# NOTE: unpickling can execute arbitrary code; only load caches you built yourself.
# The context manager closes the file handle once the list is loaded.
with open('../data/verba.pickle', 'rb') as f:
    wordlist = pickle.load(f)

In [None]:
# Get sample: one text from the corpus, split into lines

ilias = 'italicus.ilias_latina.tess'
ilias_text = next(T.texts(ilias))  # first text yielded for this file id
ilias_lines = ilias_text.split('\n')
pprint(ilias_lines[:8])  # first eight lines
print()
pprint(ilias_lines[-8:])  # last eight lines

In [None]:
# PSEUDO-CODE
# - ???

In [None]:
# Sanity check: is this word form present in the loaded word list?
'scripsit' in wordlist

In [None]:
# Make a marginal text: the first character of every non-empty line.
# Empty lines (e.g. the one a trailing newline leaves behind after split)
# have no first character, so they are skipped instead of raising IndexError.
margin_list = [line[0] for line in ilias_lines if line]
margin_list[:8]

In [None]:
# Join the line-initial characters into one string, read top to bottom
margin_text = ''.join(margin_list)
margin_text

In [None]:
# Clean the margin with preprocess(), then drop whatever whitespace is left.
# preprocess(remove_spaces=True) rejoins tokens with its default fill=' ',
# which is why the result is split and joined again with an empty separator.
margin_text = "".join(preprocess(margin_text, remove_spaces=True).split())
margin_text

In [None]:
# Make ngrams: 2-grams over the characters of the margin text

from nltk import ngrams
ilias_bigrams = list(ngrams(margin_text, 2))
pprint(ilias_bigrams[:10])  # preview the first ten

In [None]:
# Join the items of each bigram into a single string so it can be looked up in the word list
ilias_bigrams = [''.join(bigram) for bigram in ilias_bigrams]
pprint(ilias_bigrams[:10])

In [None]:
# Scan only the first ten bigrams and stop at the first one found in the word list
for bigram in ilias_bigrams[:10]:
    if bigram not in wordlist:
        continue
    print(f'WE FOUND AN ACROSTIC!!!!!!! {bigram}')
    break

In [None]:
# Make 8grams: every run of eight consecutive margin characters, joined into a string

ilias_ngrams = [''.join(ngram) for ngram in ngrams(margin_text, 8)]
pprint(ilias_ngrams[:10])  # preview the first ten


In [None]:
# Walk all 8-grams and stop at the first one found in the word list
for ngram in ilias_ngrams:
    if ngram not in wordlist:
        continue
    print(f'WE FOUND AN ACROSTIC!!!!!!! {ngram}')
    break

In [None]:
# Generalize!

def find_acrostics(text, wordlist, n=5, preprocess=None):
    """Find runs of `n` line-initial characters that appear in `wordlist`.

    The text is split on newlines, the first character of each non-empty
    line is collected into a "margin" string, and every window of `n`
    consecutive margin characters is looked up in `wordlist`.

    Args:
        text: The text to search, with lines separated by '\\n'.
        wordlist: Container of accepted words (anything supporting `in`).
        n: Length of the acrostics to look for.
        preprocess: Optional callable applied to each line before its first
            character is taken; lines it reduces to '' are dropped.

    Returns:
        A list of the matching n-character strings, in order of occurrence
        (a string found at several positions appears once per position).
    """
    if n < 1:
        # No window of non-positive length can spell a word.
        return []

    lines = text.split('\n')
    if preprocess:
        lines = [preprocess(line) for line in lines]

    # Empty lines have no first character; skipping them here (rather than
    # only after preprocessing) avoids an IndexError when preprocess is None.
    acrostic_margin = ''.join(line[0] for line in lines if line)

    # Membership in a list/tuple is a linear scan per lookup; hashing the
    # words once makes each lookup constant-time. Other container types are
    # used as given so their own `in` semantics are preserved.
    if isinstance(wordlist, (list, tuple)):
        try:
            wordlist = frozenset(wordlist)
        except TypeError:
            pass  # unhashable entries: fall back to the original container

    # Sliding window over the margin string (character n-grams by slicing).
    candidates = (acrostic_margin[start:start + n]
                  for start in range(len(acrostic_margin) - n + 1))
    return [candidate for candidate in candidates if candidate in wordlist]

In [None]:
# Eight-letter acrostics in the sample text, each line cleaned with preprocess() first
find_acrostics(ilias_text, wordlist, n=8, preprocess=preprocess)

In [None]:
# Same search with a much longer window (sixteen letters)
find_acrostics(ilias_text, wordlist, n=16, preprocess=preprocess)

In [None]:
# Same search with a short window (four letters); printed rather than displayed
print(find_acrostics(ilias_text, wordlist, n=4, preprocess=preprocess))

In [None]:
# Sweep acrostic lengths 6 through 11 over the sample text, one report per length
for i in range(6, 12):
    print(f'Acrostics of length {i}')
    print(str(find_acrostics(ilias_text, wordlist, n=i, preprocess=preprocess)))
    print()

In [None]:
# Search over Aeneid
# NB: Cached, takes ~5 mins to run

# The matches are read from a pickle cache. The commented-out recipe below
# rebuilds that cache; uncomment it to regenerate.

# aeneid = [file for file in T.fileids() if 'vergil.aeneid' in file]
# aeneid = natsorted(aeneid)
# aeneid_texts = [next(T.texts(book)) for book in aeneid]

# aeneid_acrostics = []

# for text in aeneid_texts:
#     for i in range(4,12):
#         matches = find_acrostics(text, wordlist, n=i, preprocess=preprocess)
#         aeneid_acrostics.extend(matches)

# with open('../data/acrostics.pickle', 'wb') as f:
#     pickle.dump(aeneid_acrostics, f)

# NOTE: unpickling can execute arbitrary code; only load caches you built yourself.
# The context manager closes the file handle once the list is loaded.
with open('../data/acrostics.pickle', 'rb') as f:
    aeneid_acrostics = pickle.load(f)

In [None]:
import random

# Peek at a random sample of the matches.
# A dedicated seeded generator keeps the sample identical across re-runs
# without touching the global random state, and the size is capped so a
# list with fewer than 25 matches does not raise ValueError.
SAMPLE_SEED = 42
sample_size = min(25, len(aeneid_acrostics))
print(random.Random(SAMPLE_SEED).sample(aeneid_acrostics, sample_size))

In [None]:
# Total number of matches loaded from the cache (repeats included)
len(aeneid_acrostics)

In [None]:
from collections import defaultdict

# Group the matches by length: {length: [acrostic, ...]}
aeneid_acrostic_lens = defaultdict(list)

for acrostic in aeneid_acrostics:
    aeneid_acrostic_lens[len(acrostic)].append(acrostic)

In [None]:
# Matches of length seven.
# NOTE: indexing a defaultdict creates the key (with an empty list) if it is missing.
aeneid_acrostic_lens[7]

In [None]:
# import numpy as np
import matplotlib.pyplot as plt

# Bar chart of how many matches were found for each acrostic length.
# The groups are sorted by length so the bars read left to right in
# increasing order (dict order is insertion order, which need not be sorted).
# Building the two lists directly, instead of unpacking zip(*...), also
# lets an empty result draw an empty chart rather than raise ValueError.
length_groups = sorted(aeneid_acrostic_lens.items())
xs = [str(length) for length, _ in length_groups]
ys = [len(group) for _, group in length_groups]

fig = plt.figure(figsize = (10, 5))
plt.bar(xs, ys, color ='maroon', width = 0.4)
plt.xlabel("Length")
plt.ylabel("Matches")
plt.title("Acrostic lengths in the Aeneid")
plt.show()

In [None]:
from collections import Counter

# Count how often each matched string occurs and show the ten most frequent
acrostic_counter = Counter(aeneid_acrostics)
pprint(acrostic_counter.most_common(10))

In [None]:
# For free experiments...
# - Refactor for another author?
# - How could you refactor for telestichs? Mesostichs? Gamma acrostics?