In [1]:
# Imports

import os
import string
import re
from collections import Counter

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.stem.latin.j_v import JVReplacer
from cltk.utils.file_operations import open_pickle

In [3]:


# Set up training sentences

rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'      

latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)



In [4]:
# Setup CLTK tools

word_tokenizer = WordTokenizer('latin')
sent_tokenizer = TokenizeSentence('latin')
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
replacer = JVReplacer()

In [5]:
# Get raw text of the Latin Library

ll_raw = latinlibrary.raw()

In [6]:
from cltk.corpus.latin import latinlibrary
files = latinlibrary.fileids()

In [7]:
aeneid_files = [file for file in files if 'vergil/aen' in file]

In [8]:
print(aeneid_files)

['vergil/aen1.txt', 'vergil/aen10.txt', 'vergil/aen11.txt', 'vergil/aen12.txt', 'vergil/aen2.txt', 'vergil/aen3.txt', 'vergil/aen4.txt', 'vergil/aen5.txt', 'vergil/aen6.txt', 'vergil/aen7.txt', 'vergil/aen8.txt', 'vergil/aen9.txt']


In [9]:
aeneid_raw = latinlibrary.raw(aeneid_files)

In [11]:
ll_words = latinlibrary.words()

In [12]:
ll_words = set(ll_words)

In [13]:
from pprint import pprint

In [14]:
pprint(aeneid_raw[:1000])

('Vergil: Aeneid I\n'
 '\t\t \n'
 '\n'
 '\t\t \n'
 '\t\t\n'
 '\t\t \n'
 '\t\t\n'
 '\t\t \n'
 '\t\t \n'
 '\t \n'
 '\t\n'
 ' \n'
 '\n'
 '\n'
 ' P. VERGILI MARONIS AENEIDOS LIBER PRIMVS \n'
 '\n'
 '\n'
 ' \n'
 '\n'
 'Arma virumque cano, Troiae qui primus ab oris \n'
 'Italiam, fato profugus, Laviniaque venit \n'
 'litora, multum ille et terris iactatus et alto \n'
 'vi superum saevae memorem Iunonis ob iram; \n'
 'multa quoque et bello passus, dum conderet urbem,    5 \n'
 'inferretque deos Latio, genus unde Latinum, \n'
 'Albanique patres, atque altae moenia Romae.\n'
 ' \n'
 '\n'
 ' \n'
 'Musa, mihi causas memora, quo numine laeso, \n'
 'quidve dolens, regina deum tot volvere casus \n'
 'insignem pietate virum, tot adire labores    10 \n'
 'impulerit. Tantaene animis caelestibus irae?\n'
 ' \n'
 '\n'
 ' \n'
 'Urbs antiqua fuit, Tyrii tenuere coloni, \n'
 'Karthago, Italiam contra Tiberinaque longe \n'
 'ostia, dives opum studiisque asperrima belli; \n'
 'quam Iuno fertur terris magis om

In [20]:


def prepro

# 1. Make the whole text lowercase
# Use 'lower' string method

aeneid_edit = aeneid_edit.lower()

# 2. Remove punctuation
# Use 'translate'

from string import punctuation

translator = str.maketrans({key: " " for key in punctuation})
aeneid_edit = aeneid_edit.translate(translator)

# 3. Remove numbers
# Again, use 'translate'

translator = str.maketrans({key: " " for key in '0123456789'})
aeneid_edit = aeneid_edit.translate(translator)

# 4. Normalize u/v
# Use CLTK 'JVReplacer'

from cltk.stem.latin.j_v import JVReplacer
replacer = JVReplacer()

aeneid_edit = replacer.replace(aeneid_edit)

# 5. Remove English words that appear in our plaintext file
# Use 'replace'

remove_list = ['the', 'latin', 'library', 'classics', 'page']
remove_dict = {key: ' ' for key in remove_list}

for k, v in remove_dict.items():
    aeneid_edit = aeneid_edit.replace(k,v)
    
aeneid_edit = re.sub('[ ]+',' ', aeneid_edit) # Remove double spaces
aeneid_edit = re.sub('\s+\n+\s+','\n', aeneid_edit) # Remove double lines and trim spaces around new lines

In [21]:
pprint(aeneid_edit[:1000])

('uergil aeneid i\n'
 'p uergili maronis aeneidos liber primus\n'
 'arma uirumque cano troiae qui primus ab oris \n'
 'italiam fato profugus lauiniaque uenit \n'
 'litora multum ille et terris iactatus et alto \n'
 'ui superum saeuae memorem iunonis ob iram \n'
 'multa quoque et bello passus dum conderet urbem \n'
 'inferretque deos latio genus unde um \n'
 'albanique patres atque altae moenia romae\n'
 'musa mihi causas memora quo numine laeso \n'
 'quidue dolens regina deum tot uoluere casus \n'
 'insignem pietate uirum tot adire labores \n'
 'impulerit tantaene animis caelestibus irae\n'
 'urbs antiqua fuit tyrii tenuere coloni \n'
 'karthago italiam contra tiberinaque longe \n'
 'ostia diues opum studiisque asperrima belli \n'
 'quam iuno fertur terris magis omnibus unam \n'
 'posthabita coluisse samo hic illius arma \n'
 'hic currus fuit hoc regnum dea gentibus esse \n'
 'si qua fata sinant iam tum tenditque fouetque \n'
 'progeniem sed enim troiano a sanguine duci \n'
 'audierat 

In [22]:
ll_words =[word.lower() for word in ll_words]

In [34]:
# Preprocess texts
def preprocess(text):    
    
    text = text.lower()
    
    text = replacer.replace(text)
    
    text= re.sub(r'&lt;','<',text)
    text= re.sub(r'&gt;','>',text)    
        
    punctuation ="\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)
    
    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)

    remove_list = [r'\bthe latin library\b', r'\bthe classics page\b', r'\bcicero\s+?$'] 
    for pattern in remove_list:
        text = re.sub(pattern, '', text)
    
    text = re.sub('[ ]+',' ', text) # Remove double spaces
    text = re.sub('\s+\n+\s+','\n', text) # Remove double lines and trim spaces around new lines
    
    return text

In [35]:
ll_words_ = " ".join(ll_words)

In [36]:
ll_words_ = preprocess(ll_words_)

In [87]:
ll_words = ll_words_.split()

In [88]:
print(ll_words[:100])

['elixatas', 'damma', 'partiti', 'summissis', 'forcipes', 'salsas', 'uiridario', 'umerale', 'amanti', 'inparem', 'uectore', 'immundabile', 'foedissime', 'uellaeae', 'circumstructum', 'purgabant', 'gratifcabatur', 'flammatis', 'adigat', 'perge', 'interuertendi', 'uaticinantem', 'condar', 'renouato', 'dignari', 'repudiati', 'gener', 'clamet', 'polyxenam', 'pr', 'diuinume', 'ursa', 'consumpsisses', 'reparandis', 'uideto', 'uelabitis', 'nsium', 'frenaret', 'nicomachus', 'meliboeus', 'aestuent', 'piternus', 'presule', 'unitate', 'transsubstantiauit', 'incipis', 'aselli', 'petrum', 'thaumaturgus', 'cogendorum', 'amputatam', 'predum', 'stipulas', 'lectum', 'praetermissio', 'instincto', 'uid', 'i', 'capillum', 'gattam', 'anselle', 'sacrarum', 'armigerorum', 'abstrahendos', 'docuit', 'remeatum', 'pr', 'migrauit', 'michea', 'castae', 'publicitu', 'fereuente', 'promotionum', 'infestiua', 'segnitatem', 'pleuto', 'monent', 'lai', 'rettuli', 'exossa', 'suptadictum', 'insolescentes', 'liberalesartes'

In [38]:
aeneid_edit = preprocess(aeneid_raw)

In [40]:
pprint(aeneid_edit[:1000])

('uergil aeneid i\n'
 'p uergili maronis aeneidos liber primus\n'
 'arma uirumque cano troiae qui primus ab oris \n'
 'italiam fato profugus lauiniaque uenit \n'
 'litora multum ille et terris iactatus et alto \n'
 'ui superum saeuae memorem iunonis ob iram \n'
 'multa quoque et bello passus dum conderet urbem \n'
 'inferretque deos latio genus unde latinum \n'
 'albanique patres atque altae moenia romae\n'
 'musa mihi causas memora quo numine laeso \n'
 'quidue dolens regina deum tot uoluere casus \n'
 'insignem pietate uirum tot adire labores \n'
 'impulerit tantaene animis caelestibus irae\n'
 'urbs antiqua fuit tyrii tenuere coloni \n'
 'karthago italiam contra tiberinaque longe \n'
 'ostia diues opum studiisque asperrima belli \n'
 'quam iuno fertur terris magis omnibus unam \n'
 'posthabita coluisse samo hic illius arma \n'
 'hic currus fuit hoc regnum dea gentibus esse \n'
 'si qua fata sinant iam tum tenditque fouetque \n'
 'progeniem sed enim troiano a sanguine duci \n'
 'audi

In [41]:
lines = aeneid_edit.split('\n')

In [47]:
lines = [line for line in lines if line]

In [78]:
initials = [line[0] for line in lines]

In [52]:
matches = []

In [126]:
def find_ngrams(input_list, n):
    temp = list(zip(*[input_list[i:] for i in range(n)]))
    ngrams = ["".join(t) for t in temp]
    return ngrams


In [133]:
initial_words = list()
for i in range(6,8):
    temp = find_ngrams(initials, i)
    initial_words += temp

In [134]:
list(set(initial_words) & set(ll_words))

['drancae',
 'poseae',
 'sagaci',
 'tatiae',
 'quaene',
 'cerata',
 'cuppis',
 'posuit',
 'pinasi',
 'sagacis',
 'audiant',
 'arabic',
 'calcar',
 'trinae',
 'aethei',
 'coemat',
 'concis',
 'cainis',
 'carpas',
 'dantia']

In [117]:
sixgrams = find_ngrams(initials,6)
sevengrams = find_ngrams(initials,7)

In [119]:
sixgrams.index('audiant')

ValueError: 'audiant' is not in list

In [124]:
print(sevengrams.index('audiant'))
print(sevengrams.index('sagacis'))

print(lines[1149:1154])
print(lines[6788:6795])

1149
6788
['anchemolum thalamos ausum incestare nouercae ', 'uos etiam gemini rutulis cecidistis in aruis ', 'daucia laride thymberque simillima proles ', 'indiscreta suis gratusque parentibus error ', 'at nunc dura dedit uobis discrimina pallas ']
['sed circum late uolitans iam fama per urbes ', 'ausonias tulerat cum laomedontia pubes ', 'gramineo ripae religauit ab aggere classem', 'aeneas primique duces et pulcher iulus ', 'corpora sub ramis deponunt arboris altae ', 'instituuntque dapes et adorea liba per herbam ', 'subiciunt epulis sic iuppiter ipse monebat ']


In [None]:
sevengrams.index('sagacis')