In [1]:
# # Install CLTK Latin Models

# # Commented out, because you only need to run it once to download
# # the model to the local machine

# from cltk.corpus.utils.importer import CorpusImporter
# corpus_importer = CorpusImporter('latin')
# corpus_importer.list_corpora
# corpus_importer.import_corpus('latin_models_cltk')

In [2]:
# Import a data model to train the lemmatizer

import os
from cltk.utils.file_operations import open_pickle

# Locate the pickled training sentences inside the local cltk_data tree

# '~' is expanded to the current user's home directory
rel_path = '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff'
path = os.path.expanduser(rel_path)

# Name and full location of the training-data pickle
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)

# Fall back to an empty training set (and say so) when the pickle is missing;
# otherwise load the sentences from disk
if not os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
else:
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)

In [3]:
# Set up CLTK Latin backoff lemmatizer

# Prebuilt backoff chain—let me know if you want more information
# on building your own chain.

# The lemmatizer is built from latin_pos_lemmatized_sents, which is either the
# unpickled training sentences or an empty list when the pickle was not found.
# NOTE(review): an empty training list presumably degrades lemmatization
# quality — confirm against the CLTK documentation.
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

In [4]:
# Set up CLTK Latin word tokenizer

from cltk.tokenize.word import WordTokenizer
# 'latin' is passed as the tokenizer's language argument
# (presumably selects Latin-specific tokenization rules — see CLTK docs)
tokenizer = WordTokenizer('latin')

In [5]:
# Get sample text

# A Latin prose passage kept as one literal; .lower() normalises it to
# lowercase before it is handed to the tokenizer.
# NOTE(review): presumably the opening of Cicero's first Catilinarian
# (hence the name `cat1`) — confirm the source edition.
cat1 = 'Quo usque tandem abutere, Catilina, patientia nostra? quam diu etiam furor iste tuus nos eludet? quem ad finem sese effrenata iactabit audacia? Nihilne te nocturnum praesidium Palati, nihil urbis vigiliae, nihil timor populi, nihil concursus bonorum omnium, nihil hic munitissimus habendi senatus locus, nihil horum ora voltusque moverunt? Patere tua consilia non sentis, constrictam iam horum omnium scientia teneri coniurationem tuam non vides? Quid proxima, quid superiore nocte egeris, ubi fueris, quos convocaveris, quid consilii ceperis, quem nostrum ignorare arbitraris? O tempora, o mores! Senatus haec intellegit. Consul videt; hic tamen vivit. Vivit? immo vero etiam in senatum venit, fit publici consilii particeps, notat et designat oculis ad caedem unum quemque nostrum. Nos autem fortes viri satis facere rei publicae videmur, si istius furorem ac tela vitemus. Ad mortem te, Catilina, duci iussu consulis iam pridem oportebat, in te conferri pestem, quam tu in nos [omnes iam diu] machinaris.'.lower()

In [6]:
# Get tokens

# Tokenize the lowercased sample text with the Latin word tokenizer
tokens = tokenizer.tokenize(cat1)
# Preview only the first ten tokens to keep the cell output short
print(tokens[:10])

['quo', 'usque', 'tandem', 'abutere', ',', 'catilina', ',', 'patientia', 'nostra', '?']


In [7]:
# Get lemmas

# Lemmatize the token list; as the output below shows, each result is a
# (token, lemma) tuple and punctuation tokens receive the lemma 'punc'
lemmas = lemmatizer.lemmatize(tokens)
# Preview only the first ten pairs to keep the cell output short
print(lemmas[:10])

[('quo', 'quo'), ('usque', 'usque'), ('tandem', 'tandem'), ('abutere', 'abutor'), (',', 'punc'), ('catilina', 'catilina'), (',', 'punc'), ('patientia', 'patientia'), ('nostra', 'noster'), ('?', 'punc')]


In [8]:
# Save lemmatizer to pickle for quick setup

import pickle

# Write through a context manager so the file handle is flushed and closed as
# soon as the dump finishes. Passing a bare open() straight to pickle.dump()
# never closes the handle explicitly and leaves that to garbage collection.
with open("backoff.p", "wb") as backoff_file:
    pickle.dump(lemmatizer, backoff_file)

In [9]:
# # Load lemmatizer from pickle

# import pickle
# lemmatizer = pickle.load(open('backoff.p', 'rb'))