In [None]:
# Imports

from cltkreaders.grc import GreekTesseraeCorpusReader
import sys
import unicodedata
import re
from natsort import natsorted
from pprint import pprint

In [None]:
# Constants

# Author to analyse. The value is lowercased and matched as a substring of the
# corpus file ids when selecting files, and it is echoed in the printed summaries.
AUTHOR = 'Homer'

In [None]:
# Instantiate the Tesserae reader and keep only the file ids that mention the author
T = GreekTesseraeCorpusReader()
author_key = AUTHOR.lower()
files = [fileid for fileid in T.fileids() if author_key in fileid]

print(files[:5])

In [None]:
# Sort by book number

# natsorted (natural sort) is used so that the book numbers embedded in the file
# ids sort numerically -- presumably e.g. book 2 before book 10, which a plain
# sorted() on the strings would not give.
files = natsorted(files)
print(files)

## Discovering 'clearchics'

From Athenaeus 11.87, tr. Yonge ([Perseus](http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A2013.01.0003%3Abook%3D11%3Achapter%3D87)):
>This, then, was what Clearchus said; and the things which he says one ought to propose, are, I imagine, such as these. For one person to quote a line in Homer beginning with Alpha, and ending with the same letter, such as...  
>  
> αγχοῦ δ᾽ ἱσταμένη ἔπεα πτερόεντα προσηύδα.  
> αλλ᾽ ἄγε νῦν μάστιγα καὶ ἡνία σιγαλόεντα.  
> ασπίδας εὐκύκλους λαισήαϊ τε πτερόεντα.  

In [None]:
# Get citation and text per line

# Flatten the per-document mappings returned by the reader into a single list of
# (citation, text) pairs. A dedicated name is used so that `lines` only ever
# refers to the tuple of line texts built below.
rows = [row for doc_row in T.doc_rows(files) for row in doc_row.items()]

if not rows:
    # Without this guard, zip(*rows) fails with an opaque
    # "not enough values to unpack" when no file matched AUTHOR.
    raise ValueError(f'No lines found for {AUTHOR}; check AUTHOR against T.fileids().')

# Citation -> original (unprocessed) line, for looking up the source text later.
doc_rows = dict(rows)

# Parallel tuples: citations[i] is the citation of lines[i].
citations, lines = zip(*rows)

print(citations[:5])
print(lines[:5])

In [None]:
# Describe lines in author

# `lines` holds one entry per cited line, so its length is the number of lines
# collected for the selected author.
print(f'There are {len(lines)} lines in {AUTHOR}.')

In [None]:
# Helper function to remove diacriticals

def remove_diacriticals(text):
    """Return ``text`` in NFD form with every combining mark removed.

    The text is canonically decomposed so that accents, breathings and similar
    marks become separate combining characters, which are then dropped. Only the
    characters actually present in ``text`` are inspected, so the cost is
    proportional to the length of the input rather than to the size of the
    Unicode code space.

    Args:
        text: String to strip of diacritical marks.

    Returns:
        The decomposed string without combining characters. The result is left
        decomposed; callers that need NFC must normalize it themselves.
    """
    decomposed = unicodedata.normalize('NFD', text)
    # unicodedata.combining() is non-zero exactly for combining characters.
    return ''.join(char for char in decomposed if not unicodedata.combining(char))

In [None]:
# Helper function for preprocessing
def preprocess(text, lower=True, punctuation=False, numbers=False, remove_spaces=False, diacriticals=True):
    """Normalize a piece of text for comparison.

    Args:
        text: String to clean.
        lower: Lowercase the text when true.
        punctuation: Keep punctuation when true; otherwise each punctuation
            character is replaced by a space.
        numbers: Keep ASCII digits when true; otherwise each digit is replaced
            by a space.
        remove_spaces: Remove all whitespace when true.
        diacriticals: Keep diacritical marks when true; otherwise they are
            removed with ``remove_diacriticals``.

    Returns:
        The cleaned text, NFC-normalized, with runs of spaces collapsed to one
        space and leading/trailing whitespace stripped.
    """
    # Normalize up front: NFC maps canonical singletons such as the Greek
    # question mark (U+037E) and ano teleia (U+0387) to ';' and the middle dot.
    # Doing this only at the end would reintroduce punctuation after it had
    # already been stripped.
    text = unicodedata.normalize('NFC', text)

    if lower:
        text = text.lower()

    if not punctuation:
        # Replace punctuation with spaces so adjacent words do not fuse.
        # A separate local name is used so the boolean parameter is not clobbered,
        # and the backslash is escaped explicitly instead of via an invalid "\]".
        ascii_marks = "\"#$%&'()*+,/:;<=>@[\\]^_`{|}~.?!«»—“-”"
        misc = '¡£¤¥¦§¨©¯°±²³´µ¶·¸¹º¼½¾¿÷·–‘’†•ↄ∞⏑〈〉（）'
        translator = str.maketrans({key: " " for key in misc + ascii_marks})
        text = text.translate(translator)

    if not numbers:
        # Replace ASCII digits with spaces.
        translator = str.maketrans({key: " " for key in '0123456789'})
        text = text.translate(translator)

    if remove_spaces:
        text = "".join(text.split())

    if not diacriticals:
        text = remove_diacriticals(text)

    # Collapse runs of spaces; other whitespace (e.g. newlines) is left intact.
    text = re.sub(' +', ' ', text)

    # remove_diacriticals leaves the text decomposed, so recompose before returning.
    text = unicodedata.normalize('NFC', text)

    return text.strip()

In [None]:
# Preprocess lines, i.e. case, numbers, punctuation, etc.

# First pass, one call per line. diacriticals=True means preprocess skips
# remove_diacriticals here, so accents and breathings are still present in the
# result; case, punctuation, digits and spacing are normalized.
prep_lines = [preprocess(line, diacriticals=True) for line in lines]
pprint(prep_lines[:5])

In [None]:
# Remove diacriticals; fastest way

# Run preprocess once over the newline-joined text instead of once per line.
# Newlines survive preprocessing, so the text can be split back into lines.
concatenated_lines = "\n".join(prep_lines)
concatenated_lines = preprocess(concatenated_lines, diacriticals=False)

# preprocess only trims the two ends of the whole text, so a line whose edge
# character was replaced by a space in this pass would keep that space.
# Trim every line so that line[0] / line[-1] are real characters.
prep_lines = [line.strip() for line in concatenated_lines.split('\n')]

# preprocess also strips leading/trailing newlines, which would drop empty lines
# at either end and shift every line against its citation. Fail loudly if so.
assert len(prep_lines) == len(citations), (
    f'{len(prep_lines)} preprocessed lines for {len(citations)} citations; '
    'lines and citations are no longer aligned.'
)
print(len(prep_lines))

print(prep_lines[:5])

In [None]:
# Check for clearchics

# A clearchic is a line whose first and last characters are the same.
# Lines that preprocessing left empty have no first character; skip them
# instead of failing with an IndexError.
clearchics = [
    citation
    for citation, line in zip(citations, prep_lines)
    if line and line[0] == line[-1]
]

# NOTE(review): medial sigma (σ) and final sigma (ς) are distinct characters, so
# a line that starts with σ and ends in ς is not counted -- confirm this is intended.
print(f'There are {len(clearchics)} clearchics in {AUTHOR}.')

In [None]:
# Print sample of clearchics

import random

# random.sample raises ValueError when asked for more items than are available,
# so cap the sample size at the number of clearchics actually found.
num = min(10, len(clearchics))
clearchic_sample = random.sample(clearchics, num)

print(f'Here is a random sample of {num} clearchics from {AUTHOR}...')

# Show each sampled citation next to its original, unprocessed line.
for i, clearchic in enumerate(clearchic_sample, 1):
    print(f'{i}: {clearchic}: {doc_rows[clearchic]}')

In [None]:
# Note how above we made a dict with citations as keys and original lines as values to make lookups easier later

# e.g.
# doc_rows['<hom.od 23.112>']

In [None]:
# Now that you have reached this cell, I want you to rerun the experiment with a slightly different focus...
# 1. Figure out what needs to be done to find the answer for Nonnus as opposed to Homer
# 2. Figure out how to check for lines that begin and end not just with the same single letter,
#     but with the same two letters.
