In [7]:
import sentencepiece as spm
from tokenizers import Tokenizer, normalizers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing
from datasets import Dataset
from transformers import PreTrainedTokenizerFast, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments


class CustomSPTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer wrapper around a serialized ``tokenizers`` file.

    Special-token ids are looked up by their literal spellings ``<cls>``,
    ``<sep>``, ``<pad>`` and ``<unk>`` in the backend vocabulary rather than
    through the configurable special-token attributes of the base class.
    """

    def __init__(self, tokenizer_path):
        """Load the tokenizer definition stored at ``tokenizer_path``.

        Args:
            tokenizer_path: Path to a file readable by ``Tokenizer.from_file``.
        """
        super().__init__(tokenizer_file=tokenizer_path)
        # NOTE(review): the base class is already given the same file via
        # ``tokenizer_file``; this rebinds the backend to a fresh load of it.
        # Kept as-is to preserve behavior -- confirm whether it is still needed.
        self._tokenizer = Tokenizer.from_file(tokenizer_path)

    def _tokenize(self, text):
        """Return the token strings the backend produces for ``text``."""
        return self._tokenizer.encode(text).tokens

    def _convert_token_to_id(self, token):
        """Look up the vocabulary id of ``token`` in the backend tokenizer."""
        return self._tokenizer.token_to_id(token)

    def _convert_id_to_token(self, index):
        """Look up the token string stored under vocabulary id ``index``."""
        return self._tokenizer.id_to_token(index)

    def convert_tokens_to_string(self, tokens):
        """Join a sequence of token strings back into text.

        ``Tokenizer.decode`` expects ids, not token strings, so the tokens are
        handed to the backend's decoder component instead. When the backend
        has no decoder configured, the tokens are joined with single spaces.

        Args:
            tokens: List of token strings.

        Returns:
            The reconstructed text.
        """
        decoder = self._tokenizer.decoder
        if decoder is not None:
            return decoder.decode(tokens)
        return " ".join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap one or two id sequences with the ``<cls>`` / ``<sep>`` markers.

        Args:
            token_ids_0: Ids of the first (or only) segment.
            token_ids_1: Optional ids of a second segment.

        Returns:
            ``<cls> A <sep>`` for a single segment, or
            ``<cls> A <sep> B <sep>`` for a pair.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        # Exactly one separator closes each segment; the previous version
        # appended a second, stray <sep> after the final segment.
        return cls + token_ids_0 + sep + token_ids_1 + sep

    @property
    def cls_token_id(self):
        """Id of the literal ``<cls>`` token in the backend vocabulary."""
        return self._tokenizer.token_to_id("<cls>")

    @property
    def sep_token_id(self):
        """Id of the literal ``<sep>`` token in the backend vocabulary."""
        return self._tokenizer.token_to_id("<sep>")

    @property
    def pad_token_id(self):
        """Id of the literal ``<pad>`` token in the backend vocabulary."""
        return self._tokenizer.token_to_id("<pad>")

    @property
    def unk_token_id(self):
        """Id of the literal ``<unk>`` token in the backend vocabulary."""
        return self._tokenizer.token_to_id("<unk>")



In [17]:
# Locations of the fine-tuned seq2seq checkpoint and the serialized tokenizer.
model_path = 'model/bartbase_model2'
tokenizer_path = "data/bpe_tokenizer.json"

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = CustomSPTokenizer(tokenizer_path)

# add_special_tokens both registers '<pad>' in the vocabulary (if missing) and
# sets it as tokenizer.pad_token, so no separate pad_token assignment is needed.
tokenizer.add_special_tokens({'pad_token': '<pad>'})

# If registering '<pad>' grew the vocabulary beyond the model's embedding
# table, the new id would index past the end of the table; grow it to match.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [18]:


def translate(text, max_length=100):
    """Generate output text for ``text`` with the module-level model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Input string to encode and feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(text, return_tensors="pt")
    device = model.device

    # Only input_ids and attention_mask are passed to generate(); any other
    # field the tokenizer returns (e.g. token_type_ids) is deliberately unused.
    outputs = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_length=max_length,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    
# Run translate() on a single sample sentence and print what the model returns.
# NOTE(review): the sample looks like a dialect spelling of a Korean greeting --
# confirm the intended source/target varieties against the training data.
text = "반갑수다"
translated_text = translate(text)

print(translated_text)
    

반 갑 습니다
