In [1]:
from transformers import RobertaTokenizer

# Name of the pretrained tokenizer to load, kept in one place so it is easy to change
TOKENIZER_PATH = "tokenizer"

# Instantiate the RoBERTa tokenizer from that location
tokenizer = RobertaTokenizer.from_pretrained(TOKENIZER_PATH)

In [2]:
# Sample sentence; it is lower-cased up front so the tokenizer only sees lowercase input
text = "Farboslepú červenovlásku ohrozili protiidúce autá, a preto núdzovo zaparkovala svoje auto v močarine.".lower()

# Subword tokens for the sentence, followed by their vocabulary IDs
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print both views, each under its own label
for label, values in (("Tokens:", tokens), ("Token ID:", token_ids)):
    print(label, values)

Tokens: ['far', 'bo', 'sle', 'pÃº', 'ĠÄįerven', 'ovlÃ¡', 'sku', 'Ġohro', 'zili', 'Ġproti', 'idÃº', 'ce', 'ĠautÃ¡', ',', 'Ġa', 'Ġpreto', 'ĠnÃºdz', 'ovo', 'Ġzapark', 'ovala', 'Ġsvoje', 'Ġauto', 'Ġv', 'Ġmo', 'Äįa', 'rine', '.']
Token ID: [5627, 347, 547, 4121, 5220, 26657, 656, 18508, 8934, 1115, 23320, 333, 8373, 16, 266, 668, 16807, 719, 32729, 1108, 781, 1212, 264, 393, 435, 19539, 18]


In [3]:
# Encode the sentence as fixed-length input for the language model: the
# sequence is padded up to max_length and returned as PyTorch tensors.
# truncation=True additionally caps longer texts at max_length; padding alone
# never shortens a sequence, so without it a long input would produce more
# than 256 IDs.
encoding = tokenizer(
    text,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)
print(encoding)

{'input_ids': tensor([[    0,  5627,   347,   547,  4121,  5220, 26657,   656, 18508,  8934,
          1115, 23320,   333,  8373,    16,   266,   668, 16807,   719, 32729,
          1108,   781,  1212,   264,   393,   435, 19539,    18,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

In [4]:
# json is used below but is not imported by the first cell; importing it here
# keeps this cell runnable on a fresh kernel (Restart Kernel -> Run All).
import json

# Load the replacement table that decode() applies to token strings.
with open("tokenizer/kodovanie.json", "r", encoding="utf-8") as f:
    dictionary = json.load(f)

def decode(tokens):
    """Return a new list with the `dictionary` substitutions applied to each token.

    For every token, the entries of the module-level `dictionary` are visited
    in iteration order; each key found in the token is replaced by its value.
    Substitutions are cumulative: a later entry operates on the result of the
    earlier ones. Tokens that contain none of the keys are passed through
    unchanged.

    Args:
        tokens: iterable of token strings.

    Returns:
        list with one rewritten token per input token, in the same order.
    """
    def _apply_table(piece):
        # Walk the table in order; skip entries that do not occur in the
        # current (possibly already rewritten) token text.
        for encoded, plain in dictionary.items():
            if encoded not in piece:
                continue
            piece = piece.replace(encoded, plain)
        return piece

    return [_apply_table(token) for token in tokens]

In [5]:
# Map the first (and only) encoded row back to token strings, padding included,
# then print them after applying the decode() substitutions.
padded_tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
print(decode(padded_tokens))

['<s>', 'far', 'bo', 'sle', 'pú', 'Ġčerven', 'ovlá', 'sku', 'Ġohro', 'zili', 'Ġproti', 'idú', 'ce', 'Ġautá', ',', 'Ġa', 'Ġpreto', 'Ġnúdz', 'ovo', 'Ġzapark', 'ovala', 'Ġsvoje', 'Ġauto', 'Ġv', 'Ġmo', 'ča', 'rine', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '