In [1]:
# Imports

import os
import string
import re
from collections import Counter

from cltk.corpus.latin import latinlibrary
#from cltk.tokenize.sentence import TokenizeSentence
#from cltk.tokenize.word import WordTokenizer
#from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.stem.latin.j_v import JVReplacer
#from cltk.utils.file_operations import open_pickle

In [None]:


# Set up training sentences

rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'      

latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)



In [2]:
# Setup CLTK tools

#word_tokenizer = WordTokenizer('latin')
#sent_tokenizer = TokenizeSentence('latin')
#lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
replacer = JVReplacer()

In [3]:
# Get raw text of the Latin Library

ll_raw = latinlibrary.raw()

In [4]:
from cltk.corpus.latin import latinlibrary
files = latinlibrary.fileids()

In [5]:
aeneid_files = [file for file in files if 'vergil/aen' in file]

In [6]:
print(aeneid_files)

['vergil/aen1.txt', 'vergil/aen10.txt', 'vergil/aen11.txt', 'vergil/aen12.txt', 'vergil/aen2.txt', 'vergil/aen3.txt', 'vergil/aen4.txt', 'vergil/aen5.txt', 'vergil/aen6.txt', 'vergil/aen7.txt', 'vergil/aen8.txt', 'vergil/aen9.txt']


In [7]:
aeneid_raw = latinlibrary.raw(aeneid_files)

In [55]:
ll_words = latinlibrary.words()

In [56]:
ll_list = list(ll_words)

In [57]:
ll_words = set(ll_words)

In [11]:
from pprint import pprint

In [12]:
ll_words =[word.lower() for word in ll_words]

In [106]:
# Preprocess texts
def preprocess(text):    

    text = re.sub(r'&aelig;','ae',text)
    text = re.sub(r'&AElig;','AE',text)
    
    text = text.lower()
    
    text = replacer.replace(text)
    

    text= re.sub(r'&lt;','<',text)
    text= re.sub(r'&gt;','>',text)    
    
    punctuation ="\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)
    
    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)

    remove_list = [r'\bthe latin library\b', r'\bthe classics page\b', r'\bcicero\s+?$'] 
    for pattern in remove_list:
        text = re.sub(pattern, '', text)
    
    text = re.sub('[ ]+',' ', text) # Remove double spaces
    text = re.sub('\s+\n+\s+','\n', text) # Remove double lines and trim spaces around new lines
    
    return text

In [107]:
ll_words_ = " ".join(ll_words)

In [108]:
ll_words_ = preprocess(ll_words_)

In [109]:
ll_words = ll_words_.split()

In [110]:
aeneid_edit = preprocess(aeneid_raw)

In [111]:
lines = aeneid_edit.split('\n')
lines = [line for line in lines if line]
initials = [line[0] for line in lines]

In [112]:
matches = []

In [113]:
def find_ngrams(input_list, n):
    temp = list(zip(*[input_list[i:] for i in range(n)]))
    ngrams = ["".join(t) for t in temp]
    return ngrams


In [114]:
initial_words = list()
for i in range(6,8):
    temp = find_ngrams(initials, i)
    initial_words += temp

In [115]:
list(set(initial_words) & set(ll_words))

['posuit',
 'cuppis',
 'pinasi',
 'calcar',
 'concis',
 'dantia',
 'sagaci',
 'cainis',
 'sagacis',
 'drancae',
 'arabic',
 'nactas',
 'iussae',
 'carpas',
 'poseae',
 'coemat',
 'quaene',
 'cerata',
 'trinae',
 'aethei',
 'audiant',
 'tatiae']

In [116]:
sixgrams = find_ngrams(initials,6)
sevengrams = find_ngrams(initials,7)

In [117]:
print(sevengrams.index('audiant'))
print(sevengrams.index('sagacis'))

print(lines[1149:1149+7])
print(lines[7656:7656+7])

1149
7656
['anchemolum thalamos ausum incestare nouercae ', 'uos etiam gemini rutulis cecidistis in aruis ', 'daucia laride thymberque simillima proles ', 'indiscreta suis gratusque parentibus error ', 'at nunc dura dedit uobis discrimina pallas ', 'nam tibi thymbre caput euandrius abstulit ensis ', 'te decisa suum laride dextera quaerit ']
['sed circum late uolitans iam fama per urbes ', 'ausonias tulerat cum laomedontia pubes ', 'gramineo ripae religauit ab aggere classem', 'aeneas primique duces et pulcher iulus ', 'corpora sub ramis deponunt arboris altae ', 'instituuntque dapes et adorea liba per herbam ', 'subiciunt epulis sic iuppiter ipse monebat ']


In [118]:
met_files = [file for file in files if 'ovid/ovid.met' in file]

In [119]:
print(met_files)

['ovid/ovid.met1.txt', 'ovid/ovid.met10.txt', 'ovid/ovid.met11.txt', 'ovid/ovid.met12.txt', 'ovid/ovid.met13.txt', 'ovid/ovid.met14.txt', 'ovid/ovid.met15.txt', 'ovid/ovid.met2.txt', 'ovid/ovid.met3.txt', 'ovid/ovid.met4.txt', 'ovid/ovid.met5.txt', 'ovid/ovid.met6.txt', 'ovid/ovid.met7.txt', 'ovid/ovid.met8.txt', 'ovid/ovid.met9.txt']


In [120]:
met_raw = latinlibrary.raw(met_files)

In [121]:
met_edit = preprocess(met_raw)

In [122]:
met_lines = met_edit.split('\n')
met_lines = [line for line in met_lines if line]
met_initials = [line[0] for line in met_lines]

In [123]:
met_initial_words = list()
for i in range(7,8):
    temp = find_ngrams(met_initials, i)
    met_initial_words += temp

In [124]:
list(set(met_initial_words) & set(ll_words))

['nestini']

In [125]:
'nestini' in ll_words

True

In [126]:
ll_list.index('nestini')

14080201

In [127]:
ll_list[14080201:14080201+20]

['nestini',
 'sonant',
 'libri',
 ',',
 'a',
 'C&aelig',
 ';',
 'culo',
 ',',
 'quem',
 'juxta',
 'ignes',
 'fortuitos',
 'invenerunt',
 ',',
 'ut',
 'fama',
 'est',
 ',',
 'Digitorum']