In [75]:
from latin_author_learning.tokenize import SentenceAwareEncoder, get_subtoken_strings
from torch import tensor

In [76]:
from pathlib import Path

In [77]:
# File handed to get_subtoken_strings below. The path is relative, so it
# resolves against the kernel's current working directory.
tokenizer_path = Path("../latin_tokenizer/latin.subword.encoder")

In [78]:
# Build the encoder from the subtoken strings read from tokenizer_path.
tokenizer = SentenceAwareEncoder(get_subtoken_strings(tokenizer_path))

In [79]:
# Probe: how is one lowercase word split? The recorded output has four ids;
# the next cells decode them one at a time.
tokenizer.encode("cogito")

tensor([ 3006, 20715, 15334, 19495])

In [80]:
# First id of the recorded "cogito" encoding.
tokenizer.decode(tensor([3006]))

'c'

In [81]:
# Fourth (last) id of the recorded "cogito" encoding.
tokenizer.decode(tensor([19495]))

'ito'

In [82]:
# Second id of the recorded "cogito" encoding.
tokenizer.decode(tensor([20715]))

'o'

In [83]:
# Third id of the recorded "cogito" encoding.
tokenizer.decode(tensor([15334]))

'g'

In [84]:
# Same probe on a capitalised sentence; compare with the .lower() variant
# further down.
tokenizer.encode("Cogito ergo sum.")

tensor([29578,  6883, 12808,  1511, 20715, 15334, 19495,  5696, 23691, 19399])

In [85]:
# Decode the first four ids of the capitalised encoding together
# (recorded output: 'C').
tokenizer.decode(tensor([29578,  6883, 12808,  1511]))

'C'

In [86]:
# Lowercase before encoding: the recorded result starts with the same four
# ids as plain "cogito".
tokenizer.encode("Cogito ergo sum.".lower())

tensor([ 3006, 20715, 15334, 19495,  5696, 23691, 19399])

In [87]:
# Keep the subtoken strings in a variable so entries can be added before
# building a new encoder.
vocab = get_subtoken_strings(tokenizer_path)

In [88]:
# Add "cogit" as an extra vocabulary entry. The membership guard keeps this
# cell idempotent: re-running it no longer appends a duplicate.
# NOTE(review): the dummy encoder further down is built from entries ending
# in "_" ("cogito_", "o_"); confirm whether this entry needs that suffix too.
if "cogit" not in vocab:
    vocab.append("cogit")

In [89]:
# Rebuild the encoder from the extended vocab; this rebinds `tokenizer`.
tokenizer = SentenceAwareEncoder(vocab)

In [90]:
# Re-check "cogito" with the rebuilt encoder; the recorded ids are the same
# four as in the first probe.
tokenizer.encode("cogito")

tensor([ 3006, 20715, 15334, 19495])

In [91]:
# Single-word probe (recorded output: one id).
tokenizer.encode("ergo")

tensor([5696])

In [92]:
# Single-word probe (recorded output: one id).
tokenizer.encode("sum")

tensor([23691])

In [93]:
# Word with a full stop attached: the recorded output is the "sum" id
# followed by one more id.
tokenizer.encode("sum.")

tensor([23691, 19399])

In [94]:
# Full stop alone: the recorded id matches the trailing id of "sum.".
tokenizer.encode(".")

tensor([19399])

In [95]:
# Second example sentence, lowercased, encoded on its own.
tokenizer.encode("Errare humanum est.".lower())

tensor([10507,   564,  6321, 19399])

In [96]:
# Both sentences in one string. In the recorded output the full stop between
# the sentences is not the single id seen for "." alone but a five-id run;
# the next cells narrow down where that run comes from.
tokenizer.encode("Cogito ergo sum. Errare humanum est.".lower())

tensor([ 3006, 20715, 15334, 19495,  5696, 23691,  9841, 29578,  2702, 31127,
        15565, 10507,   564,  6321, 19399])

In [97]:
# Only the sentence boundary: last word, full stop, space, next word.
tokenizer.encode("sum. Errare".lower())

tensor([23691,  9841, 29578,  2702, 31127, 15565, 10507])

In [98]:
# Same boundary without the space: the recorded output uses the single "."
# id again, so the longer run above is tied to the space after the stop.
tokenizer.encode("sum.Errare".lower())

tensor([23691, 19399, 10507])

In [99]:
# Full stop plus space in isolation: the recorded output is the same five-id
# run seen between the two sentences.
tokenizer.encode(". ")

tensor([ 9841, 29578,  2702, 31127, 15565])

In [100]:
# A lone space: the recorded output has four ids, equal to the last four ids
# of the ". " encoding.
tokenizer.encode(" ")

tensor([29578,  2702, 31127, 15565])

In [101]:
# Repeat of the earlier "." probe, for side-by-side comparison with the
# space variants.
tokenizer.encode(".")

tensor([19399])

In [102]:
# Full stop with a space on both sides (recorded output: nine ids).
tokenizer.encode(" . ")

tensor([29578,  2702, 31127,  1511,  9841, 29578,  2702, 31127, 15565])

In [106]:
# Add " . _" as an extra vocabulary entry. The membership guard keeps this
# cell idempotent: re-running it no longer appends the entry a second time.
if " . _" not in vocab:
    vocab.append(" . _")

In [107]:
# Rebuild the encoder from the extended vocab; this rebinds `tokenizer`.
# In the recorded outputs after this point ids differ from earlier ones
# (e.g. "sum." is 23692/19400 instead of 23691/19399), so ids recorded before
# and after this cell should not be compared directly.
tokenizer = SentenceAwareEncoder(vocab)

In [109]:
from latin_author_learning.tokenize import STARTS, ENDS

In [113]:
# Spaced full stop followed by the ENDS and STARTS values imported from
# latin_author_learning.tokenize, separated by single spaces.
tokenizer.encode(f" . {ENDS} {STARTS}")

tensor([31263,  9842, 31263,  8906, 22522, 24491])

In [114]:
# Spaced full stop on its own. Written as a plain literal: the string has no
# placeholders, so the f-prefix was redundant.
tokenizer.encode(" . ")

tensor([31263,  9842, 31263,  8906])

In [115]:
# First (and third) id of the recorded " . " encoding.
tokenizer.decode([31263])

' '

In [116]:
# Second id of the recorded " . " encoding.
tokenizer.decode([9842])

'.'

In [117]:
# Last id of the recorded " . " encoding; it decodes to an empty string in
# the recorded output.
tokenizer.decode([8906])

''

In [118]:
# Word with an attached full stop, re-checked with the rebuilt encoder.
tokenizer.encode("sum.")

tensor([23692, 19400])

In [119]:
# First id of the recorded "sum." encoding.
tokenizer.decode([23692])

'sum'

In [120]:
# Second id of the recorded "sum." encoding. It decodes to '.', but it is a
# different id from the '.' inside the recorded " . " encoding (9842).
tokenizer.decode([19400])

'.'

In [121]:
# Same word with a question mark attached instead of a full stop.
tokenizer.encode("sum?")

tensor([23692,  5186])

In [122]:
# Other punctuation surrounded by spaces. The recorded outputs for "?", ","
# and ";" share the first, third and fourth ids with " . ".
tokenizer.encode(" ? ")

tensor([31263,  8470, 31263,  8906])

In [123]:
# Spaced comma, same probe as for "?" and ".".
tokenizer.encode(" , ")

tensor([31263, 18397, 31263,  8906])

In [124]:
# Spaced semicolon, same probe as for "?" and ".".
tokenizer.encode(" ; ")

tensor([31263,  1511, 31263,  8906])

In [49]:
# NOTE(review): the execution counter drops from 124 to 49 here. This
# suggests this cell and the ones after it were last run before the cells
# above, so their recorded outputs come from an earlier `tokenizer` (e.g.
# "sum." is recorded as 23691/19399 below but 23692/19400 above). Restart the
# kernel and run all cells before relying on any ids shown from here on.
tokenizer.encode("errare")

tensor([10507])

In [50]:
# Word with an attached full stop.
tokenizer.encode("sum.")

tensor([23691, 19399])

In [51]:
# Two words plus a final full stop (recorded output: one id per word and one
# for the stop).
tokenizer.encode("humanum est.")

tensor([  564,  6321, 19399])

In [52]:
# Both example sentences, lowercased, in one string (same call as earlier in
# the notebook).
tokenizer.encode("Cogito ergo sum. Errare humanum est.".lower())

tensor([ 3006, 20715, 15334, 19495,  5696, 23691,  9841, 29578,  2702, 31127,
        15565, 10507,   564,  6321, 19399])

In [53]:
# Minimal encoder built from only two vocabulary entries, to see how input
# outside that vocabulary is encoded.
dummy_tokenizer = SentenceAwareEncoder(["cogito_", "o_"])

In [54]:
# Input matching the "cogito_" entry minus the trailing underscore
# (recorded output: a single id).
dummy_tokenizer.encode("cogito")

tensor([25])

In [55]:
# Input matching the "o_" entry minus the trailing underscore
# (recorded output: a single id).
dummy_tokenizer.encode("o")

tensor([20])

In [56]:
# Capitalised variant: no longer a single id in the recorded output (nine
# ids).
dummy_tokenizer.encode("Cogito")

tensor([ 5, 23, 24,  6, 31,  8, 11, 32, 20])

In [57]:
# A prefix of the known entry: not matched as one unit in the recorded output
# (six ids).
dummy_tokenizer.encode("cogit")

tensor([27, 31,  8, 11, 32, 34])

In [58]:
from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer as SentenceTokenizer
from cltk.tokenizers.lat.lat import LatinWordTokenizer as WordTokenizer

In [59]:
# Lowercased two-sentence sample used for the CLTK tokenisation steps below.
text = "Cogito ergo sum. Errare humanum est.".lower()

In [60]:
# Split the sample into sentences with CLTK's Latin Punkt sentence tokenizer.
sentences = SentenceTokenizer().tokenize(text)

In [61]:
# Display the sentence split.
sentences

['cogito ergo sum.', 'errare humanum est.']

In [62]:
# Re-join the sentences with single spaces and split into word tokens with
# CLTK's Latin word tokenizer.
words = WordTokenizer().tokenize(" ".join(sentences))

In [63]:
# Display the word tokens; in the recorded output each full stop is a
# separate token.
words

['cogito', 'ergo', 'sum', '.', 'errare', 'humanum', 'est', '.']

In [64]:
# Encode the space-joined word tokens, i.e. with every full stop surrounded
# by spaces.
tokenizer.encode(" ".join(words)) 

tensor([ 3006, 20715, 15334, 19495,  5696, 23691, 29578,  2702, 31127,  1511,
         9841, 29578,  2702, 31127, 15565, 10507,   564,  6321, 29578,  2702,
        31127,  1511, 19399])

In [65]:
# Round-trip check: these ids are pasted from the recorded output of the
# previous cell, so they are only meaningful for the vocabulary that produced
# that output.
tokenizer.decode([ 3006, 20715, 15334, 19495,  5696, 23691, 29578,  2702, 31127,  1511,
         9841, 29578,  2702, 31127, 15565, 10507,   564,  6321, 29578,  2702,
        31127,  1511, 19399])

'cogito ergo sum . errare humanum est .'

In [66]:
# Single-word probe, repeated from the top of the notebook.
tokenizer.encode("cogito")

tensor([ 3006, 20715, 15334, 19495])

In [67]:
# NOTE(review): these hard-coded ids are not the ones recorded for "cogito"
# in the cell above, and no nearby cell produces them. Their origin is
# unclear; confirm what this cell was meant to check.
tokenizer.decode([16398, 23733, 16049, 25666])

'adora estu fronto vienn'

In [68]:
# Start of a series probing forms of the same verb; this one is a single id
# in the recorded output.
tokenizer.encode("cogitare")

tensor([31897])

In [69]:
# Bare stem (recorded output: a single id).
tokenizer.encode("cogit")

tensor([6668])

In [70]:
# Recorded output: four ids, sharing the first three with "cogito".
tokenizer.encode("cogitas")

tensor([ 3006, 20715, 15334, 21547])

In [71]:
# Single-word probe (recorded output: one id).
tokenizer.encode("est")

tensor([6321])

In [72]:
# Single-word probe (recorded output: one id).
tokenizer.encode("errat")

tensor([31232])

In [73]:
# Recorded output: one id, unlike "cogito" and "cogitas".
tokenizer.encode("cogitat")

tensor([2045])