- Based on (Herdan 1962).
- same data and reasoning about expected values
- G. responding to an "elementary but serious error"
- 55: "The unwary and unnumbered classicist will be led astray."

In [42]:
# Imports

import os
import string
import re
from collections import Counter
import html  

from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.line import LineTokenizer
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

In [44]:
# Setup CLTK tools: Latin line/word tokenizers and the j/v replacer
# (`replacer` is the module-level object that preprocess() calls).

line_tokenizer = LineTokenizer('latin')
word_tokenizer = WordTokenizer('latin')
replacer = JVReplacer()

## Get text of the Georgics

In [31]:
# Get the Georgics texts (file ids containing 'vergil/geo') from the Latin Library corpus

files = latinlibrary.fileids()
georgics_files = [file for file in files if 'vergil/geo' in file]
georgics_raw = [latinlibrary.raw(file) for file in georgics_files]  # one raw text per matching file

In [32]:
# Preprocess texts

def preprocess(text):
    """Clean a raw Latin Library text so it can be analysed line by line.

    Steps, in order: delete known boilerplate headings, decode HTML
    entities, lowercase, pass the text through the module-level
    ``replacer`` (u/v and i/j normalization), replace punctuation and
    digits with spaces, and tidy whitespace.

    Parameters
    ----------
    text : str
        Raw text of one file.

    Returns
    -------
    str
        The cleaned text. Runs of spaces are collapsed to one space and any
        run of whitespace that contains a newline is collapsed to a single
        newline, so no line keeps leading/trailing padding and blank lines
        between verses disappear.
    """

    remove_list = [r'\bVergil\b',
                   r'\bThe Latin Library\b',
                   r'\bThe Classics Page',
                   r'\bP. VERGILI MARONIS GEORGICON LIBER .+?\b'
                  ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    # REMOVE LETTER HEADINGS?

    text = html.unescape(text) # Handle html entities
    text = re.sub(r'&nbsp;?', ' ', text) #&nbsp; stripped incorrectly in corpus?
    # html.unescape() turns &nbsp; into U+00A0, which the '[ ]+' collapse
    # below does not recognise as a space; fold it (and stray NULs) into ' '.
    text = re.sub(r'[\x00\xa0]', ' ', text)

    # Remove roman numeral headings; must be before lower & replacer
    #text = re.sub(r'\b(M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,4})|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3}))\b[\.]',' ',text)

    text = text.lower()
    text = replacer.replace(text) #Normalize u/v & i/j

    # Herdan also normalizes 'qu' to 'c'
    #text = re.sub('qu', 'k', text)
    #text = re.sub('cui', 'ku', text)

    # Backslash is written as '\\' so the literal has no invalid escape
    # sequence; the set of characters is the same as before.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
    digits = '0123456789'
    translator = str.maketrans({key: " " for key in punctuation + digits})
    text = text.translate(translator)

    text = re.sub(r'[ ]+', ' ', text) # Remove double spaces
    # Remove double lines and trim spaces around new lines. The former
    # pattern '\s+\n+\s+' needed at least three whitespace characters, so
    # "a\n\nb" and "a \nb" slipped through untouched.
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text

In [33]:
# Preprocess the Georgics texts (one cleaned string per raw text)

georgics_edit = [preprocess(raw) for raw in georgics_raw]
print(georgics_edit[0])  # NOTE(review): prints the entire first text; consider a slice to keep the output short


quid faciat laetas segetes quo sidere terram
uertere maecenas ulmisque adiungere uitis
conueniat quae cura boum qui cultus habendo
sit pecori apibus quanta experientia parcis
hinc canere incipiam uos o clarissima mundi
lumina labentem caelo quae ducitis annum
liber et alma ceres uestro si munere tellus
chaoniam pingui glandem mutauit arista
poculaque inuentis acheloia miscuit uuis
et uos agrestum praesentia numina fauni
ferte simul faunique pedem dryadesque puellae
munera uestra cano tuque o cui prima frementem
fudit equum magno tellus percussa tridenti
neptune et cultor nemorum cui pinguia ceae
ter centum niuei tondent dumeta iuuenci
ipse nemus linquens patrium saltusque lycaei
pan ouium custos tua si tibi maenala curae
adsis o tegeaee fauens oleaeque minerua
inuentrix uncique puer monstrator aratri
et teneram ab radice ferens siluane cupressum
dique deaeque omnes studium quibus arua tueri
quique nouas alitis non ullo semine fruges
quique satis largum caelo demittitis imbrem
tuque ad

In [58]:
# Get a list of initial letters

def get_initials(lines):
    """Return the first non-whitespace character of every line.

    Parameters
    ----------
    lines : iterable of str
        Lines of text; surrounding whitespace is ignored.

    Returns
    -------
    list of str
        One single-character string per non-blank line, in input order.
        Empty or whitespace-only lines have no initial and are skipped
        (indexing them used to raise IndexError).
    """
    stripped = (line.strip() for line in lines)
    return [line[0] for line in stripped if line]


In [89]:
# Per text: tokenize into lines, then keep the initial letter of each line
georgics_lines = [line_tokenizer.tokenize(edit) for edit in georgics_edit]
georgics_initials = [get_initials(lines) for lines in georgics_lines]

In [64]:
# Function for combining list elements into various length strings

def find_ngrams(input_list, n):
    """Join each run of ``n`` consecutive items of ``input_list`` into a string.

    Parameters
    ----------
    input_list : sequence of str
        Items to combine (e.g. line-initial letters).
    n : int
        Number of adjacent items per n-gram.

    Returns
    -------
    list of str
        The concatenated n-grams in order; empty when the input holds fewer
        than ``n`` items.
    """
    # The same sequence shifted by 0..n-1; zipping the shifts lines up each window.
    shifted = (input_list[offset:] for offset in range(n))
    return ["".join(window) for window in zip(*shifted)]


None


In [66]:
# Bigrams of adjacent line initials, one list per text
georgics_bigrams = [find_ngrams(initial,2) for initial in georgics_initials]
print(georgics_bigrams[0])

['qu', 'uc', 'cs', 'sh', 'hl', 'll', 'lc', 'cp', 'pe', 'ef', 'fm', 'mf', 'fn', 'nt', 'ti', 'ip', 'pa', 'ai', 'ie', 'ed', 'dq', 'qq', 'qt', 'tc', 'ct', 'ta', 'aa', 'aa', 'an', 'nt', 'ta', 'aq', 'qp', 'ps', 'sq', 'qn', 'nq', 'qn', 'nd', 'di', 'ii', 'iu', 'ul', 'ld', 'di', 'ii', 'ia', 'ai', 'ia', 'au', 'uc', 'ce', 'eh', 'ha', 'ag', 'gi', 'ia', 'ac', 'cc', 'ci', 'id', 'du', 'up', 'pf', 'fp', 'pa', 'aa', 'ai', 'ih', 'ha', 'ae', 'ea', 'au', 'ua', 'as', 'su', 'uu', 'us', 'sn', 'ne', 'es', 'sn', 'ns', 'sa', 'as', 'sp', 'pe', 'es', 'ss', 'ss', 'sn', 'na', 'am', 'mu', 'uf', 'fe', 'er', 're', 'eu', 'ua', 'al', 'li', 'iq', 'qi', 'id', 'de', 'ee', 'ee', 'es', 'sq', 'ql', 'lc', 'cc', 'cp', 'pe', 'eu', 'un', 'nu', 'us', 'so', 'oh', 'hm', 'mn', 'na', 'an', 'nf', 'fo', 'oi', 'ip', 'pm', 'me', 'eu', 'up', 'pu', 'ut', 'tn', 'np', 'pt', 'ti', 'ia', 'aa', 'at', 'tn', 'nt', 'ti', 'ip', 'pi', 'id', 'dm', 'me', 'ec', 'cl', 'li', 'iq', 'qe', 'ef', 'fh', 'hc', 'cd', 'dq', 'qu', 'ut', 'tt', 'tu', 'ua', 'ao', 'os

In [73]:
def get_x_match(bigram):
    """Return True when the first two items of ``bigram`` are equal.

    ``bigram`` is any indexable with at least two items, e.g. the string 'aa'.
    """
    first, second = bigram[0], bigram[1]
    return first == second

In [82]:
def gap_match(bigrams):
    """Collect the repeated item from every bigram whose two items match.

    Parameters
    ----------
    bigrams : iterable
        Bigrams as accepted by ``get_x_match``.

    Returns
    -------
    list
        The first item of each matching bigram, in input order.
    """
    return [gram[0] for gram in bigrams if get_x_match(gram)]

            
# Repeated adjacent initials for every text; the first text is also printed on its own
georgics_gap_matches = [gap_match(bigrams) for bigrams in georgics_bigrams]    
print(gap_match(georgics_bigrams[0]))
print(georgics_gap_matches)

['l', 'q', 'a', 'a', 'i', 'i', 'c', 'a', 'u', 's', 's', 'e', 'e', 'c', 'a', 't', 'c', 'e', 'i', 'm', 'n', 'i', 'a', 't', 'c', 'a', 'c', 'a', 'a', 'n', 'n', 'i', 's', 'a', 'a', 'i']
[['l', 'q', 'a', 'a', 'i', 'i', 'c', 'a', 'u', 's', 's', 'e', 'e', 'c', 'a', 't', 'c', 'e', 'i', 'm', 'n', 'i', 'a', 't', 'c', 'a', 'c', 'a', 'a', 'n', 'n', 'i', 's', 'a', 'a', 'i'], ['p', 's', 'n', 'n', 'n', 'a', 'n', 'h', 'a', 'a', 'q', 'e', 'i', 'q', 'a', 'a', 'a', 'a', 'a', 'n', 'u', 's', 'a', 'p', 'e', 'a', 'p', 'i', 'n', 'n', 'n', 'a', 'i', 'i', 'h', 's'], ['i', 'c', 'c', 'i', 'i', 'e', 's', 't', 'c', 'c', 't', 'i', 'f', 'i', 's', 's', 'n', 'p', 'i', 's', 'a', 'c', 'u', 'e', 's', 'i', 'a', 'a', 'i', 'c', 'n', 'd', 'p', 'n', 'p', 'i', 'u'], ['m', 'f', 'o', 'a', 'u', 'u', 'p', 'e', 's', 't', 't', 'i', 'c', 'e', 'i', 'd', 'a', 'i', 'i', 's', 'a', 'i', 'e', 's', 'e', 's', 'a', 'a', 'e', 'e', 's', 'm', 'a', 'e', 'p', 'p', 'o', 't', 'q', 'c', 'o', 's', 'i', 'a', 't', 't', 'm', 'i', 'q', 'q', 'c']]


In [126]:
# For each text, collect one 'a' per bigram that is a doubled initial 'a' ('aa')
georgics_matches = []

for bigrams in georgics_bigrams:
    xx = []  # matches found in the current text

    for bigram in bigrams:
        # keep only bigrams whose two letters match and that start with 'a'
        if get_x_match(bigram) and bigram.startswith('a'):
            xx.append(bigram[0])
    
    georgics_matches.append(xx)

print(georgics_matches)

[['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a'], ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a'], ['a', 'a', 'a'], ['a', 'a', 'a', 'a', 'a', 'a', 'a']]


In [129]:
# Tally the matches per text and show each tally sorted by letter
georgics_match_counts = [Counter(match) for match in georgics_matches]
pprint([sorted(count.items()) for count in georgics_match_counts])

[[('a', 10)], [('a', 11)], [('a', 3)], [('a', 7)]]


In [130]:
# Add the per-text Counters together; Counter() is the empty start value for sum()
sum(georgics_match_counts, Counter())

Counter({'a': 31})

In [202]:
def find_skipgrams(input_list, n, step=0):
    """Pair each item with the item ``n + step - 1`` positions after it.

    Only the first and last member of every ``n + step`` wide window are
    returned, so with ``n=2`` the result is the list of bigrams that skip
    ``step`` intervening items (``step=0`` gives ordinary adjacent pairs).

    Parameters
    ----------
    input_list : sequence
        Any sliceable sequence (list of tokens, list of characters, ...).
    n : int
        Base window size.
    step : int, optional
        Number of extra positions added to the window; defaults to 0.

    Returns
    -------
    list of tuple
        ``(input_list[i], input_list[i + n + step - 1])`` for every ``i``
        where both positions exist.

    Raises
    ------
    IndexError
        If ``n + step`` is less than 1 (same exception type as before).
    """
    offset = n + step - 1
    if offset < 0:
        # Without this guard a negative offset would silently slice from the end.
        raise IndexError('n + step must be at least 1')
    # Only one shifted slice is needed; the original built n + step of them
    # and then used just the first and the last.
    return list(zip(input_list, input_list[offset:]))

In [203]:
# Character-level demo: pair each character with the one five positions later.
# The sample string is restored to the value that yields the recorded output
# below (8 pairs starting ('a', 'v')); a single space produces an empty list.
test = "arma virumque"
test = list(test)
print(find_skipgrams(test,2,4))

[('a', 'v'), ('r', 'i'), ('m', 'r'), ('a', 'u'), (' ', 'm'), ('v', 'q'), ('i', 'u'), ('r', 'e')]


In [204]:
# Sample passage for the word-level skipgram demo (rebinds `test` to a raw string)
test = """Arma virumque cano, Troiae qui primus ab oris
Italiam, fato profugus, Laviniaque venit
litora, multum ille et terris iactatus et alto
vi superum saevae memorem Iunonis ob iram;
multa quoque et bello passus, dum conderet urbem,               5
inferretque deos Latio, genus unde Latinum,
Albanique patres, atque altae moenia Romae."""

In [205]:
# Clean the sample and split on whitespace (rebinds `test` from str to a list of words)
test = preprocess(test).split()

In [207]:
# Word pairs that skip one intervening word
print(find_skipgrams(test,2,1))

[('arma', 'cano'), ('uirumque', 'troiae'), ('cano', 'qui'), ('troiae', 'primus'), ('qui', 'ab'), ('primus', 'oris'), ('ab', 'italiam'), ('oris', 'fato'), ('italiam', 'profugus'), ('fato', 'lauiniaque'), ('profugus', 'uenit'), ('lauiniaque', 'litora'), ('uenit', 'multum'), ('litora', 'ille'), ('multum', 'et'), ('ille', 'terris'), ('et', 'iactatus'), ('terris', 'et'), ('iactatus', 'alto'), ('et', 'ui'), ('alto', 'superum'), ('ui', 'saeuae'), ('superum', 'memorem'), ('saeuae', 'iunonis'), ('memorem', 'ob'), ('iunonis', 'iram'), ('ob', 'multa'), ('iram', 'quoque'), ('multa', 'et'), ('quoque', 'bello'), ('et', 'passus'), ('bello', 'dum'), ('passus', 'conderet'), ('dum', 'urbem'), ('conderet', 'inferretque'), ('urbem', 'deos'), ('inferretque', 'latio'), ('deos', 'genus'), ('latio', 'unde'), ('genus', 'latinum'), ('unde', 'albanique'), ('latinum', 'patres'), ('albanique', 'atque'), ('patres', 'altae'), ('atque', 'moenia'), ('altae', 'romae')]


In [28]:
def gap_probability(p, r):
    """Return ``p * (1 - p) ** r``.

    Used here as the probability that an initial letter with relative
    frequency ``p`` recurs after exactly ``r`` intervening lines that do not
    begin with it.

    Parameters
    ----------
    p : float
        Relative frequency of the letter among line initials.
    r : int
        Gap length (number of intervening lines).

    Returns
    -------
    float
    """
    return p * ((1 - p) ** r)

# NOTE(review): the Counter of line initials shown further down reports 267
# for 'a'; confirm whether 268 is the intended published figure.
a_count = 268
lines = 2188

a_prob = a_count/lines
print('p = %f' %a_prob)

# `gap` has to exist before it is used. It was previously assigned after the
# call below, which raises NameError on a fresh kernel and otherwise silently
# reuses whatever value an earlier run left behind.
gap = 0
a_gap_prob = gap_probability(a_prob,gap)

# The multiplier shown is the gap probability actually used in the product
# (it equals p only when gap == 0).
print("We arrive at an expected value for the number of recurrences of initial \"a\" with the gap \
of length %d by the calculation %d x %f = %f." % (gap, a_count, a_gap_prob, a_count * a_gap_prob ))

p = 0.122486
We arrive at an expected value for the number of recurrences of initial "a" with the gap of length 0 by the calculation 268 x 0.122486 = 13.152238.


In [17]:
# Show the gap probability for the current `a_prob` and `gap`
gap_probability(a_prob,gap)

0.12248628884826325

In [60]:
# Flatten the per-text lists of initials into a single list
georgics_initials_flat = [item for sublist in georgics_initials for item in sublist]

In [61]:
# Frequency of each line-initial letter across all texts
c = Counter(georgics_initials_flat)

In [62]:
# Display the tally of line-initial letters
c

Counter({'a': 267,
         'b': 14,
         'c': 166,
         'd': 80,
         'e': 200,
         'f': 84,
         'g': 12,
         'h': 94,
         'i': 230,
         'l': 38,
         'm': 67,
         'n': 160,
         'o': 52,
         'p': 156,
         'q': 103,
         'r': 27,
         's': 205,
         't': 126,
         'u': 107})

In [96]:
# Join the cleaned texts into one string, separated by newlines
georgics_text = "\n".join(georgics_edit)

In [99]:
# Tokenize the joined text into lines.
# NOTE(review): this rebinds `georgics_lines`, which earlier held one result
# per text (a list of tokenizer outputs); re-running the earlier
# `georgics_initials` cell after this one would feed it a different shape.
georgics_lines = line_tokenizer.tokenize(georgics_text)

#### Total lines in *Georgics*

In [101]:
# Number of lines returned by the tokenizer for the joined text
georgics_line_count = len(georgics_lines)