In [115]:
# UTF-8 byte values of a two-code-point Telugu syllable (consonant + vowel sign).
list("నే".encode("utf-8"))

[224, 176, 168, 224, 177, 135]

In [107]:
# UTF-8 byte values of the bare consonant; later cells add bytes to this list.
ls = list("క".encode("utf-8"))

In [122]:
# Add the complete 3-byte UTF-8 sequence of the vowel sign (224, 177, 135 --
# the tail of the byte list shown for the two-code-point syllable above).
# Appending only 135 leaves a stray continuation byte, which decodes to the
# U+FFFD replacement character instead of the vowel sign.
ls.extend([224, 177, 135])

In [123]:
# Turn the byte values back into text; invalid sequences become U+FFFD rather than raising.
str(bytes(ls), encoding="utf-8", errors="replace")

'కే'

In [11]:
from base import BaseTokenizer, get_stats, merge
import regex as re

In [13]:


# Chunk-splitting regexes of the GPT tokenizers, see
# https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
# Each pattern is spelled as its top-level alternatives and joined with "|",
# which yields exactly the same pattern string as the single-literal form.
GPT2_SPLIT_PATTERN = "|".join((
    r"'(?:[sdmt]|ll|ve|re)",   # apostrophe suffixes: 's 'd 'm 't 'll 've 're
    r" ?\p{L}+",               # run of letters, optional leading space
    r" ?\p{N}+",               # run of digits, optional leading space
    r" ?[^\s\p{L}\p{N}]+",     # run of other non-space characters
    r"\s+(?!\S)",              # whitespace not followed by a non-space character
    r"\s+",                    # any remaining whitespace
))
GPT4_SPLIT_PATTERN = "|".join((
    r"'(?i:[sdmt]|ll|ve|re)",          # same suffixes, case-insensitive
    r"[^\r\n\p{L}\p{N}]?+\p{L}+",      # letters, optionally led by one non-letter/digit/newline char
    r"\p{N}{1,3}",                     # digits in groups of at most three
    r" ?[^\s\p{L}\p{N}]++[\r\n]*",     # other non-space characters plus trailing newlines
    r"\s*[\r\n]",                      # whitespace ending in a newline
    r"\s+(?!\S)",                      # whitespace not followed by a non-space character
    r"\s+",                            # any remaining whitespace
))


class RegexTokenizer(BaseTokenizer):
    """
    Byte-level BPE tokenizer that splits text with a regex before merging.

    The text is cut into chunks by the split pattern; during training every
    chunk is kept as its own token sequence, so merges are learned and applied
    per chunk.

    pattern : str -> regex pattern , default: gpt4 regex pattern
    """

    def __init__(self, pattern = None) -> None:
        """
        pattern : str | None -> split pattern; GPT4_SPLIT_PATTERN is used when None
        """
        super().__init__()
        self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
        self.compiled_pattern = re.compile(self.pattern)

    def train(self, text: str, vocab_size: int):
        """
        Learn up to `vocab_size - 256` merges from `text`.

        text       : str -> training corpus
        vocab_size : int -> requested vocabulary size, must be >= 256

        Sets `self.merges` (merged pair -> new token id) and `self.vocab`
        (token id -> bytes). Fewer merges than requested are learned when the
        chunks run out of adjacent pairs (e.g. empty text).
        """
        assert vocab_size >= 256
        num_merges = vocab_size - 256

        text_chunks = re.findall(self.compiled_pattern, text)
        # one list of UTF-8 byte values per chunk
        int_tokens = [list(chunk.encode("utf-8")) for chunk in text_chunks]
        merges = {}
        # id -> bytes mapping; must be a dict (not a set) because new ids are
        # assigned below and decode() looks tokens up by id
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for i in range(num_merges):
            stats = {}
            # Count pairs over the *current integer* sequences, not the raw
            # strings, so earlier merges are reflected and keys are id pairs.
            # NOTE(review): relies on get_stats (from base) accumulating into
            # `stats` in place, as the original call did -- confirm.
            for word in int_tokens:
                get_stats(word, stats)

            if not stats:
                # no adjacent pair left in any chunk: nothing more to merge
                break

            max_pair = max(stats, key=stats.get)
            idx = i + 256
            # presumably merge() returns the sequence with max_pair replaced by idx
            int_tokens = [merge(word, max_pair, idx) for word in int_tokens]

            merges[max_pair] = idx
            vocab[idx] = vocab[max_pair[0]] + vocab[max_pair[1]]

        self.merges = merges
        self.vocab = vocab

    def encode(self, text):
        """Encoding that ignores any special tokens."""
        # split text into chunks of text by categories defined in regex pattern
        text_chunks = re.findall(self.compiled_pattern, text)
        # all chunks of text are encoded separately, then results are joined
        ids = []
        for chunk in text_chunks:
            chunk_ids = super().encode(chunk)
            ids.extend(chunk_ids)
        return ids

    def decode(self, ids):
        """
        Given ids (list of integers), return a Python string.

        Raises ValueError for an id that is not in `self.vocab`. Byte sequences
        that are not valid UTF-8 are decoded with U+FFFD replacement characters.
        """
        part_bytes = []
        for idx in ids:
            if idx not in self.vocab:
                raise ValueError(f"invalid token id: {idx}")
            part_bytes.append(self.vocab[idx])
        text_bytes = b"".join(part_bytes)
        return text_bytes.decode("utf-8", errors="replace")





# # t.save("taytoken")
# # t.load("taytoken.model")


# print("""Copy paste of the Wikipedia article on Taylor Swift, as of Feb 16, 2024. oiwjeovhoweh 344 2""" ==
#        t.decode(t.encode("""Copy paste of the Wikipedia article on Taylor Swift, as of Feb 16, 2024. oiwjeovhoweh 344 2""")))

# # print("""రూపొందుతోందని """ ==
# #        t.decode(t.encode("""రూపొందుతోందని """)))

# print(t.encode("""రూపొందుతోందని """))

In [2]:
# Read the training corpus. The encoding is stated explicitly so the decoded
# text does not depend on the platform's default locale encoding; the text is
# re-encoded as UTF-8 further down when it is turned into byte tokens.
with open("taylorswift.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Requested vocabulary size; the cell below subtracts the 256 single-byte
# values from it to get the number of merges.
vocab_size = 300
# Same GPT-4 split pattern string as the one defined next to the tokenizer
# class, written as its top-level alternatives joined with "|".
GPT4_SPLIT_PATTERN = "|".join((
    r"'(?i:[sdmt]|ll|ve|re)",
    r"[^\r\n\p{L}\p{N}]?+\p{L}+",
    r"\p{N}{1,3}",
    r" ?[^\s\p{L}\p{N}]++[\r\n]*",
    r"\s*[\r\n]",
    r"\s+(?!\S)",
    r"\s+",
))

In [14]:
# 256 is the number of distinct byte values; whatever is left of the
# vocabulary budget is the number of merges.
assert 256 <= vocab_size
num_merges = vocab_size - 256

# Cut the corpus into chunks with the split pattern.
text_chunks = re.compile(GPT4_SPLIT_PATTERN).findall(text)

# One list of UTF-8 byte values per chunk.
int_tokens = list(map(lambda chunk: list(chunk.encode("utf-8")), text_chunks))
int_tokens

[[67, 111, 112, 121],
 [32, 112, 97, 115, 116, 101],
 [32, 111, 102],
 [32, 116, 104, 101],
 [32, 87, 105, 107, 105, 112, 101, 100, 105, 97],
 [32, 97, 114, 116, 105, 99, 108, 101],
 [32, 111, 110],
 [32, 84, 97, 121, 108, 111, 114],
 [32, 83, 119, 105, 102, 116],
 [44],
 [32, 97, 115],
 [32, 111, 102],
 [32, 70, 101, 98],
 [32],
 [49, 54],
 [44],
 [32],
 [50, 48, 50],
 [52],
 [46, 10],
 [45, 45, 45, 10, 10],
 [77, 97, 105, 110],
 [32, 109, 101, 110, 117],
 [10, 10],
 [87, 105, 107, 105, 112, 101, 100, 105, 97, 84, 104, 101],
 [32, 70, 114, 101, 101],
 [32, 69, 110, 99, 121, 99, 108, 111, 112, 101, 100, 105, 97],
 [10, 10],
 [83, 101, 97, 114, 99, 104],
 [10],
 [67, 114, 101, 97, 116, 101],
 [32, 97, 99, 99, 111, 117, 110, 116],
 [10],
 [76, 111, 103],
 [32, 105, 110],
 [10, 10],
 [80, 101, 114, 115, 111, 110, 97, 108],
 [32, 116, 111, 111, 108, 115],
 [10],
 [67, 111, 110, 116, 101, 110, 116, 115],
 [32],
 [32, 104, 105, 100, 101],
 [10],
 [40, 84, 111, 112],
 [41, 10],
 [76, 105, 102