In [1]:
from ragcar import Ragcar

In [2]:
# Show which sources (huggingface, tiktoken, openai, clova, kiwi) and model names
# Ragcar reports for the "tokenization" tool.
Ragcar.available_models("tokenization")

"Available models for tokenization are {'huggingface': ['klue/roberta-large', 'jinmang2/kpfbert', 'MODELS_SUPPORTED(https://huggingface.co/models?library=transformers)'], 'tiktoken': ['cl100k_base', 'p50k_base', 'r50k_base', 'gpt2', 'MODELS_SUPPORTED(https://github.com/openai/tiktoken/blob/main/tiktoken/model.py)'], 'openai': ['gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo', 'MODELS_SUPPORTED(https://platform.openai.com/docs/models)'], 'clova': ['YOUR_MODEL(https://www.ncloud.com/product/aiService/clovaStudio)'], 'kiwi': [None, 'YOUR_MODEL']}([src]: huggingface, [model]: klue/roberta-large, jinmang2/kpfbert, MODELS_SUPPORTED(https://huggingface.co/models?library=transformers)), ([src]: tiktoken, [model]: cl100k_base, p50k_base, r50k_base, gpt2, MODELS_SUPPORTED(https://github.com/openai/tiktoken/blob/main/tiktoken/model.py)), ([src]: openai, [model]: gpt-4-turbo, gpt-4, gpt-3.5-turbo, MODELS_SUPPORTED(https://platform.openai.com/docs/models)), ([src]: clova, [model]: YOUR_MODEL(https://www.ncl

## [Hugging Face 🤗](https://huggingface.co/models?library=transformers)

In [3]:
# Hugging Face tokenizer with no explicit `model`, leaving the model choice to Ragcar.
tokenizer = Ragcar(
    tool="tokenization",
    src="huggingface",
)

In [4]:
# Tokenize the sample sentence; the output below is a list of sub-word strings,
# some of them prefixed with '##'.
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

['네이버',
 '"',
 '하이퍼',
 '##클로',
 '##바',
 '##X',
 ',',
 '한국어',
 '역량',
 '##은',
 '빅',
 '##테크',
 'AI',
 '##보',
 '##다',
 '뛰어나',
 '"']

In [5]:
# Same tool and source, but with a specific Hugging Face model id passed as `model`.
tokenizer = Ragcar(
    tool="tokenization",
    src="huggingface",
    model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

In [6]:
# Same sentence through the multilingual MiniLM tokenizer; the pieces below carry a
# '▁' prefix on some tokens instead of the '##' prefix seen with the default model.
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

['▁네이버',
 '▁"',
 '하이',
 '퍼',
 '클',
 '로',
 '바',
 'X',
 ',',
 '▁한국어',
 '▁역',
 '량',
 '은',
 '▁',
 '빅',
 '테크',
 '▁AI',
 '보다',
 '▁뛰어',
 '나',
 '"']

## [Tiktoken](https://github.com/openai/tiktoken/tree/main)

In [7]:
# tiktoken-backed tokenizer with no explicit encoding name, leaving the choice to Ragcar.
tokenizer = Ragcar(
    tool="tokenization",
    src="tiktoken",
)

In [8]:
# With the tiktoken source the result is a list of `bytes` objects; Korean characters
# are split into partial UTF-8 byte sequences (41 pieces for this sentence).
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

[b'\xeb\x84',
 b'\xa4',
 b'\xec\x9d\xb4',
 b'\xeb\xb2\x84',
 b' "',
 b'\xed\x95\x98',
 b'\xec\x9d\xb4',
 b'\xed',
 b'\x8d',
 b'\xbc',
 b'\xed\x81',
 b'\xb4',
 b'\xeb\xa1\x9c',
 b'\xeb\xb0',
 b'\x94',
 b'X',
 b',',
 b' \xed\x95\x9c',
 b'\xea\xb5',
 b'\xad',
 b'\xec\x96\xb4',
 b' \xec\x97',
 b'\xad',
 b'\xeb\x9f',
 b'\x89',
 b'\xec\x9d\x80',
 b' \xeb',
 b'\xb9',
 b'\x85',
 b'\xed',
 b'\x85\x8c',
 b'\xed\x81\xac',
 b' AI',
 b'\xeb\xb3\xb4',
 b'\xeb\x8b\xa4',
 b' \xeb',
 b'\x9b',
 b'\xb0',
 b'\xec\x96\xb4',
 b'\xeb\x82\x98',
 b'"']

In [9]:
# tiktoken-backed tokenizer pinned to the "p50k_base" encoding.
tokenizer = Ragcar(
    tool="tokenization",
    src="tiktoken",
    model="p50k_base",
)

In [10]:
# Same sentence with p50k_base: the output below has more, smaller byte pieces
# (69) than the run without an explicit encoding (41).
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

[b'\xeb',
 b'\x84',
 b'\xa4',
 b'\xec\x9d',
 b'\xb4',
 b'\xeb',
 b'\xb2',
 b'\x84',
 b' "',
 b'\xed\x95',
 b'\x98',
 b'\xec\x9d',
 b'\xb4',
 b'\xed',
 b'\x8d',
 b'\xbc',
 b'\xed',
 b'\x81',
 b'\xb4',
 b'\xeb',
 b'\xa1',
 b'\x9c',
 b'\xeb',
 b'\xb0',
 b'\x94',
 b'X',
 b',',
 b' ',
 b'\xed\x95',
 b'\x9c',
 b'\xea',
 b'\xb5',
 b'\xad',
 b'\xec',
 b'\x96',
 b'\xb4',
 b' \xec',
 b'\x97',
 b'\xad',
 b'\xeb',
 b'\x9f',
 b'\x89',
 b'\xec\x9d',
 b'\x80',
 b' \xeb',
 b'\xb9',
 b'\x85',
 b'\xed',
 b'\x85',
 b'\x8c',
 b'\xed',
 b'\x81',
 b'\xac',
 b' AI',
 b'\xeb',
 b'\xb3',
 b'\xb4',
 b'\xeb\x8b',
 b'\xa4',
 b' \xeb',
 b'\x9b',
 b'\xb0',
 b'\xec',
 b'\x96',
 b'\xb4',
 b'\xeb',
 b'\x82',
 b'\x98',
 b'"']

## [OpenAI](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb)

In [11]:
# OpenAI-source tokenizer with no explicit model name, leaving the choice to Ragcar.
tokenizer = Ragcar(
    tool="tokenization",
    src="openai",
)

In [12]:
# Tokenize with the default OpenAI-source model; the byte pieces below are the same
# as the output of the tiktoken run that used no explicit encoding.
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

[b'\xeb\x84',
 b'\xa4',
 b'\xec\x9d\xb4',
 b'\xeb\xb2\x84',
 b' "',
 b'\xed\x95\x98',
 b'\xec\x9d\xb4',
 b'\xed',
 b'\x8d',
 b'\xbc',
 b'\xed\x81',
 b'\xb4',
 b'\xeb\xa1\x9c',
 b'\xeb\xb0',
 b'\x94',
 b'X',
 b',',
 b' \xed\x95\x9c',
 b'\xea\xb5',
 b'\xad',
 b'\xec\x96\xb4',
 b' \xec\x97',
 b'\xad',
 b'\xeb\x9f',
 b'\x89',
 b'\xec\x9d\x80',
 b' \xeb',
 b'\xb9',
 b'\x85',
 b'\xed',
 b'\x85\x8c',
 b'\xed\x81\xac',
 b' AI',
 b'\xeb\xb3\xb4',
 b'\xeb\x8b\xa4',
 b' \xeb',
 b'\x9b',
 b'\xb0',
 b'\xec\x96\xb4',
 b'\xeb\x82\x98',
 b'"']

In [13]:
# OpenAI-source tokenizer selected by model name rather than by encoding name.
tokenizer = Ragcar(
    tool="tokenization",
    src="openai",
    model="text-davinci-003",
)

In [14]:
# text-davinci-003 produces the same 69 byte pieces as the tiktoken "p50k_base" run above.
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

[b'\xeb',
 b'\x84',
 b'\xa4',
 b'\xec\x9d',
 b'\xb4',
 b'\xeb',
 b'\xb2',
 b'\x84',
 b' "',
 b'\xed\x95',
 b'\x98',
 b'\xec\x9d',
 b'\xb4',
 b'\xed',
 b'\x8d',
 b'\xbc',
 b'\xed',
 b'\x81',
 b'\xb4',
 b'\xeb',
 b'\xa1',
 b'\x9c',
 b'\xeb',
 b'\xb0',
 b'\x94',
 b'X',
 b',',
 b' ',
 b'\xed\x95',
 b'\x9c',
 b'\xea',
 b'\xb5',
 b'\xad',
 b'\xec',
 b'\x96',
 b'\xb4',
 b' \xec',
 b'\x97',
 b'\xad',
 b'\xeb',
 b'\x9f',
 b'\x89',
 b'\xec\x9d',
 b'\x80',
 b' \xeb',
 b'\xb9',
 b'\x85',
 b'\xed',
 b'\x85',
 b'\x8c',
 b'\xed',
 b'\x81',
 b'\xac',
 b' AI',
 b'\xeb',
 b'\xb3',
 b'\xb4',
 b'\xeb\x8b',
 b'\xa4',
 b' \xeb',
 b'\x9b',
 b'\xb0',
 b'\xec',
 b'\x96',
 b'\xb4',
 b'\xeb',
 b'\x82',
 b'\x98',
 b'"']

## HyperCLOVA 모델
`.env` 파일 또는 환경 변수로 `X-NCP-APIGW-API-KEY`, `X-NCP-CLOVASTUDIO-API-KEY`를 설정하거나, 아래 항목을 `model` 딕셔너리로 직접 전달합니다.
HyperCLOVA API 사용 방법은 [여기](https://guide.ncloud-docs.com/docs/clovastudio-explorer03)를 참고해 주세요.
* model_n: API URL
* api_key: X-NCP-APIGW-API-KEY
* app_key: X-NCP-CLOVASTUDIO-API-KEY

In [15]:
import os
from dotenv import load_dotenv

# Load settings from a `.env` file so the os.getenv() lookups in the following
# cells (app ids and API keys) can find them.
load_dotenv()

True

In [16]:
# HCX-003 chat tokenizer: only the endpoint URL is passed as `model`; no API keys are
# given in this call. The app id at the end of the URL comes from TOKENIZE_HCX_APP_ID.
tokenizer = Ragcar(
    tool="tokenization",
    src="clova",
    model=f"https://clovastudio.apigw.ntruss.com/testapp/v1/api-tools/chat-tokenize/HCX-003/{os.getenv('TOKENIZE_HCX_APP_ID')}",
)

In [17]:
# For the clova source the call returns a single integer (18 for this sentence)
# rather than a list of tokens -- presumably the token count; confirm against the API docs.
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

18

In [18]:
# LK-D2 tokenizer configured with a dict, so the endpoint URL and both API keys
# are handed to Ragcar explicitly.
tokenizer = Ragcar(
    tool="tokenization",
    src="clova",
    model={
        "model_n": f"https://clovastudio.apigw.ntruss.com/testapp/v1/api-tools/tokenize/LK-D2/{os.getenv('TOKENIZE_APP_ID')}",
        "api_key": os.getenv('X-NCP-APIGW-API-KEY'),
        "app_key": os.getenv('X-NCP-CLOVASTUDIO-API-KEY'),
    },
)

In [19]:
# The LK-D2 endpoint also returns a single integer for this sentence (18, the same
# value as the HCX-003 call above) -- presumably the token count; confirm against the API docs.
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

18

## [Kiwipiepy](https://github.com/bab2min/kiwipiepy)

In [20]:
# Kiwi tokenizer with no `model` argument, i.e. without a user dictionary file.
tokenizer = Ragcar(
    tool="tokenization",
    src="kiwi",
)

In [21]:
# Kiwi returns Token objects carrying a form, a tag, and start/len offsets.
# Note that '하이퍼클로바' and 'X' come out as two separate tokens here.
tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

[Token(form='네이버', tag='NNP', start=0, len=3),
 Token(form='"', tag='SSO', start=4, len=1),
 Token(form='하이퍼클로바', tag='NNG', start=5, len=6),
 Token(form='X', tag='SL', start=11, len=1),
 Token(form=',', tag='SP', start=12, len=1),
 Token(form='한국어', tag='NNP', start=14, len=3),
 Token(form='역량', tag='NNG', start=18, len=2),
 Token(form='은', tag='JX', start=20, len=1),
 Token(form='빅', tag='NNG', start=22, len=1),
 Token(form='테크', tag='NNG', start=23, len=2),
 Token(form='AI', tag='SL', start=26, len=2),
 Token(form='보다', tag='JKB', start=28, len=2),
 Token(form='뛰어나', tag='VA', start=31, len=3),
 Token(form='어', tag='EF', start=33, len=1),
 Token(form='"', tag='SSC', start=34, len=1)]

In [22]:
# Create a one-entry user dictionary that registers '하이퍼클로바X' with the NNP tag.
# The entry is tab-separated: form, tag, and a numeric third field (1.0) --
# presumably a score/weight; confirm against the Kiwi user-dictionary format.
#
# The encoding is set explicitly: without `encoding=`, open() uses the platform's
# locale encoding, which is not UTF-8 on every system and would then mis-encode
# (or fail to write) the Korean entry.
with open('user_dict_sample.txt', 'w', encoding='utf-8') as f:
    f.write('하이퍼클로바X\tNNP\t1.0\n')

# Passing the dictionary path as `model` builds a Kiwi tokenizer that uses it.
user_tokenizer = Ragcar(tool="tokenization", src="kiwi", model="user_dict_sample.txt")

# With the user dictionary, '하이퍼클로바X' is kept as a single NNP token in the output below.
user_tokenizer('네이버 "하이퍼클로바X, 한국어 역량은 빅테크 AI보다 뛰어나"')

[Token(form='네이버', tag='NNP', start=0, len=3),
 Token(form='"', tag='SSO', start=4, len=1),
 Token(form='하이퍼클로바X', tag='NNP', start=5, len=7),
 Token(form=',', tag='SP', start=12, len=1),
 Token(form='한국어', tag='NNP', start=14, len=3),
 Token(form='역량', tag='NNG', start=18, len=2),
 Token(form='은', tag='JX', start=20, len=1),
 Token(form='빅', tag='NNG', start=22, len=1),
 Token(form='테크', tag='NNG', start=23, len=2),
 Token(form='AI', tag='SL', start=26, len=2),
 Token(form='보다', tag='JKB', start=28, len=2),
 Token(form='뛰어나', tag='VA', start=31, len=3),
 Token(form='어', tag='EF', start=33, len=1),
 Token(form='"', tag='SSC', start=34, len=1)]