In [13]:
from cltk.data.fetch import FetchCorpus

In [26]:
# Create a CLTK corpus fetcher for Latin and pull the "lat_models_cltk" data set.
# NOTE(review): presumably required by the CLTK tokenizers used further down - confirm.
corpus_downloader = FetchCorpus(language="lat")
corpus_downloader.import_corpus("lat_models_cltk")

In [1]:
from latin_author_learning.berts import LatinBert, LatinTokenizer

In [2]:
from pathlib import Path

In [3]:
# Locations of the subword-encoder file and the BERT model directory,
# both given relative to the parent of the current working directory.
tokenizer_path = Path("..") / "latin_tokenizer" / "latin.subword.encoder"
bert_path = Path("..") / "latin_bert"

In [4]:
# Build the tokenizer from the subtoken strings read out of the encoder file.
tokenizer = LatinTokenizer(
    LatinBert.get_subtoken_strings(tokenizer_path),
)

In [5]:
# Probe: lowercase "cogito" is split into several subword ids (four in the
# recorded output) rather than mapped to a single id.
tokenizer.encode("cogito")

tensor([16398, 23733, 16049, 25666])

In [6]:
# Probe a full sentence. In the recorded output only the first id differs from
# the lowercase "cogito" encoding, and the spaces between words add no ids.
tokenizer.encode("Cogito ergo sum.")

tensor([11708, 23733, 16049, 25666, 31383, 17041, 16361])

In [7]:
# Probe: "ergo" on its own is a single id in the recorded output.
tokenizer.encode("ergo")

tensor([31383])

In [43]:
# Probe: "sum" on its own is a single id in the recorded output.
tokenizer.encode("sum")

tensor([17041])

In [9]:
# Probe: a trailing period adds one more id after the id for "sum".
tokenizer.encode("sum.")

tensor([17041, 16361])

In [10]:
# Probe: a bare period is a single id in the recorded output.
tokenizer.encode(".")

tensor([16361])

In [11]:
# Probe the second sentence alone. In the recorded output capitalized "Errare"
# takes three ids; the other two words and the final period take one id each.
tokenizer.encode("Errare humanum est.")

tensor([  432, 23504, 11632, 22407, 27150, 16361])

In [12]:
# Probe both sentences together. In the recorded output the period-plus-space
# between the sentences yields five ids, unlike the single id of the final period.
tokenizer.encode("Cogito ergo sum. Errare humanum est.")

tensor([11708, 23733, 16049, 25666, 31383, 17041, 19083,  5761,  4386, 14739,
        24611,   432, 23504, 11632, 22407, 27150, 16361])

In [33]:
# Narrow the sentence-boundary case down to two words: the same five-id
# sequence appears between "sum" and "Errare" in the recorded output.
tokenizer.encode("sum. Errare")

tensor([17041, 19083,  5761,  4386, 14739, 24611,   432, 23504, 11632])

In [36]:
# Same boundary without the space: the period is back to the single id
# recorded for a bare ".".
tokenizer.encode("sum.Errare")

tensor([17041, 16361,   432, 23504, 11632])

In [37]:
# Isolate ". ": the recorded output is the same five-id sequence seen at the
# sentence boundary.
tokenizer.encode(". ")

tensor([19083,  5761,  4386, 14739, 24611])

In [39]:
# Isolate a lone space: four ids in the recorded output, equal to the last
# four ids of the ". " encoding.
tokenizer.encode(" ")

tensor([ 5761,  4386, 14739, 24611])

In [40]:
# Re-check the bare period (repeat of an earlier probe): still a single id.
tokenizer.encode(".")

tensor([16361])

In [34]:
# Capitalized "Errare" alone: three ids in the recorded output.
tokenizer.encode("Errare")

tensor([  432, 23504, 11632])

In [42]:
# Lowercase "errare": a single id in the recorded output, so casing changes
# how this word is segmented.
tokenizer.encode("errare")

tensor([15795])

In [35]:
# Re-check "sum." (repeat of an earlier probe): the id for "sum" followed by
# the id for the period.
tokenizer.encode("sum.")

tensor([17041, 16361])

In [31]:
# Probe "humanum est.": one id per word plus one for the period.
tokenizer.encode("humanum est.")

tensor([22407, 27150, 16361])

In [45]:
# Lowercase both sentences before encoding. In the recorded output "cogito" and
# "errare" match their lowercase encodings; the ". " boundary still yields five ids.
tokenizer.encode("Cogito ergo sum. Errare humanum est.".lower())

tensor([16398, 23733, 16049, 25666, 31383, 17041, 19083,  5761,  4386, 14739,
        24611, 15795, 22407, 27150, 16361])

In [63]:
# Build a tokenizer from a hand-written two-entry subtoken list for the small
# experiments in the following cells.
# NOTE(review): the trailing "_" presumably marks end-of-word in the subtoken
# format - confirm against LatinTokenizer.
dummy_tokenizer = LatinTokenizer(["cogito_", "o_"])

In [64]:
# "cogito" (listed as "cogito_") encodes to a single id in the recorded output.
dummy_tokenizer.encode("cogito")

tensor([29])

In [65]:
# "o" (listed as "o_") also encodes to a single id in the recorded output.
dummy_tokenizer.encode("o")

tensor([46])

In [66]:
# Capitalized "Cogito" is not in the subtoken list: the recorded output has six
# ids, the last one equal to the id recorded for "o".
# NOTE(review): presumably a character-level fallback - confirm.
dummy_tokenizer.encode("Cogito")

tensor([23, 15, 37, 41,  5, 46])

In [99]:
# "cogit" is not in the subtoken list either: the recorded output has six ids
# for five letters, and ids two to five match those recorded for "Cogito".
dummy_tokenizer.encode("cogit")

tensor([19, 15, 37, 41,  5, 32])

In [67]:
from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer
from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer

In [68]:
# Lowercased sample text used as input for the CLTK sentence and word
# tokenizers in the following cells.
text = "Cogito ergo sum. Errare humanum est.".lower()

In [73]:
# Split the sample text into sentences with CLTK's Latin Punkt sentence tokenizer.
sentences = SentenceTokenizer().tokenize(text)

In [74]:
# Display the sentence split.
sentences

['cogito ergo sum.', 'errare humanum est.']

In [86]:
# Re-join the sentences with single spaces and split the result into word
# tokens with CLTK's Latin word tokenizer.
words = WordTokenizer().tokenize(" ".join(sentences))

In [87]:
# Display the word tokens; in the recorded output each period is its own token.
words

['cogito', 'ergo', 'sum', '.', 'errare', 'humanum', 'est', '.']

In [None]:
# Encode the word tokens re-joined with single spaces. Because periods are
# separate tokens, the joined string has a space before each period ("sum ."),
# so the input differs from the original text.
# NOTE(review): this cell has no recorded output in the notebook.
tokenizer.encode(" ".join(words)) 

In [106]:
# Decode a hard-coded id list back to text; the recorded output has a space
# before each period.
# NOTE(review): the ids look like the result of encoding " ".join(words),
# pasted in by hand - confirm, or pass that result to decode directly.
tokenizer.decode([16398, 23733, 16049, 25666, 31383, 17041,  5761,  4386, 14739,  3319,
        19083,  5761,  4386, 14739, 24611, 15795, 22407, 27150,  5761,  4386,
        14739,  3319, 16361])

'cogito ergo sum . errare humanum est .'

In [90]:
# Re-encode "cogito" (repeat of the first probe); the same ids are passed to
# decode in the following cell.
tokenizer.encode("cogito")

tensor([16398, 23733, 16049, 25666])

In [105]:
# Round-trip check: decoding the ids recorded for "cogito" returns the
# original string.
tokenizer.decode([16398, 23733, 16049, 25666])

'cogito'

In [97]:
# Probe related verb forms: "cogitare" is a single id in the recorded output.
tokenizer.encode("cogitare")

tensor([12932])

In [98]:
# "cogit" is a single id with the full tokenizer (the two-entry dummy tokenizer
# needed six ids for it).
tokenizer.encode("cogit")

tensor([25099])

In [100]:
# "cogitas" shares its first three ids with "cogito"; only the final id differs.
tokenizer.encode("cogitas")

tensor([16398, 23733, 16049,  9077])

In [101]:
# "est" is a single id, the same one that appears in the "humanum est." encoding.
tokenizer.encode("est")

tensor([27150])

In [103]:
# "errat" is a single id in the recorded output.
tokenizer.encode("errat")

tensor([17245])

In [104]:
# "cogitat" is a single id, unlike "cogito" and "cogitas", which take four each.
tokenizer.encode("cogitat")

tensor([8081])

In [23]:
# List the corpora the fetcher knows for Latin; each entry in the recorded
# output is a dict with at least "name", "origin" and "type".
corpus_downloader.all_corpora_for_lang

[{'type': 'text',
  'name': 'lat_text_perseus',
  'origin': 'https://github.com/cltk/lat_text_perseus.git'},
 {'name': 'lat_treebank_perseus',
  'origin': 'https://github.com/cltk/lat_treebank_perseus.git',
  'type': 'treebank'},
 {'name': 'lat_text_latin_library',
  'origin': 'https://github.com/cltk/lat_text_latin_library.git',
  'type': 'text'},
 {'location': 'local', 'name': 'phi5', 'origin': None, 'type': 'text'},
 {'origin': None, 'name': 'phi7', 'location': 'local', 'type': 'text'},
 {'name': 'latin_proper_names_cltk',
  'origin': 'https://github.com/cltk/latin_proper_names_cltk.git',
  'type': 'lexicon'},
 {'origin': 'https://github.com/cltk/lat_models_cltk.git',
  'name': 'lat_models_cltk',
  'type': 'model'},
 {'name': 'latin_pos_lemmata_cltk',
  'origin': 'https://github.com/cltk/latin_pos_lemmata_cltk.git',
  'type': 'lemma'},
 {'name': 'latin_treebank_index_thomisticus',
  'origin': 'https://github.com/cltk/latin_treebank_index_thomisticus.git',
  'type': 'treebank'},
 {'n

In [24]:
# Download the "latin_text_antique_digiliblt" corpus (the recorded output shows
# a completed 7.52 MiB download).
# NOTE(review): presumably stored in the local CLTK data directory - confirm.
corpus_downloader.import_corpus("latin_text_antique_digiliblt")

Downloaded 100% 7.52 MiB | 2.09 MiB/s 

In [29]:
from cltk.

ModuleNotFoundError: No module named 'cltk.corpus'