In [1]:
# Unicode code point (as an int) of the single character '牛'.
ord('牛')

29275

In [1]:
# Control characters have code points too: newline is 10.
ord('\n')

10

In [6]:
# chr() is the inverse of ord(): code point -> one-character string.
chr(29275)

'牛'

In [8]:
# chr(0) is the NUL character: it is part of the string but renders as nothing
# visible when printed.
print("this is a test" + chr(0)+" string")


this is a test  string


In [9]:
# Encode a mixed ASCII / Japanese string to UTF-8 and show the raw bytes object.
test_string = "hello! こんにちは!"
u8_encoded = test_string.encode('utf-8')
print(u8_encoded)

b'hello! \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf!'


In [3]:
# A bytes object iterates as integers, one per byte; list() makes them visible.
# NOTE: relies on `u8_encoded` from the encoding cell above having been run first.
print(type(u8_encoded))
list(u8_encoded)

<class 'bytes'>


[104,
 101,
 108,
 108,
 111,
 33,
 32,
 227,
 129,
 147,
 227,
 130,
 147,
 227,
 129,
 171,
 227,
 129,
 161,
 227,
 129,
 175,
 33]

In [None]:
# Compare the number of characters with the number of UTF-8 bytes.
print(len(test_string))
print(len(u8_encoded))
# One byte does not necessarily correspond to one character.
print((u8_encoded.decode('utf-8')))
# chr(0) is not displayed, but it still counts toward the string length.
test_string_0 = test_string+chr(0)
print(len(test_string_0))

13
23
hello! こんにちは!
14


## Problem unicode2: Unicode Encodings

In [None]:
def decode_utf8_bytes_to_str(bytestring:bytes):
    """Decode ``bytestring`` one byte at a time and concatenate the results.

    This is a deliberately naive decoder kept for demonstration: every byte is
    decoded as if it were a complete UTF-8 sequence on its own, so it only
    works for single-byte (ASCII) characters.

    Raises:
        UnicodeDecodeError: as soon as a byte that belongs to a multi-byte
            UTF-8 sequence is encountered.
    """
    pieces = []
    for value in bytestring:
        # bytes([value]) is a 1-byte object; decoding it alone fails for any
        # byte that is only valid as part of a longer sequence.
        single = bytes([value])
        pieces.append(single.decode("utf-8"))
    return "".join(pieces)
# Characters made of more than one byte cannot be decoded this way:
# the call below raises UnicodeDecodeError on the first byte of '啊'.
decode_utf8_bytes_to_str("hello啊".encode("utf-8"))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe5 in position 0: unexpected end of data

c

In [19]:
# 0xC3 announces a two-byte sequence, but 0x28 is not a continuation byte.
bs = bytes([0xc3, 0x28])  # Invalid UTF-8 sequence
# Decoding therefore raises UnicodeDecodeError ("invalid continuation byte").
bs.decode("utf-8")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: invalid continuation byte

## 2.3 Subword Tokenization

## 2.4 BPE Tokenizer Training

In [2]:
# Pre-tokenization pattern. Alternatives, tried left to right:
#   '(?:[sdmt]|ll|ve|re)   English contraction suffixes ('s, 'd, 'm, 't, 'll, 've, 're)
#    ?\p{L}+               optional leading space + run of letters
#    ?\p{N}+               optional leading space + run of numeric characters
#    ?[^\s\p{L}\p{N}]+     optional leading space + run of other non-space characters
#   \s+(?!\S)              whitespace run that is not followed by a non-space character
#   \s+                    any remaining whitespace
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""


# The third-party `regex` package is imported under the name `re`; the pattern
# uses \p{L} / \p{N} Unicode property classes.
import regex as re

text = "some text that i'll pre-tokenize"
# The leading space stays attached to the following word, as the output shows.
re.findall(PAT, text)

['some', ' text', ' that', ' i', "'ll", ' pre', '-', 'tokenize']

### pre-tokenize过程和合并过程

In [1]:
from collections import Counter
import regex as re

# Small inline sample; it is replaced by the fixture file read just below.
text = "some text that i'll pre-tokenize"
# Read the corpus as UTF-8 explicitly. Without `encoding`, open() uses the
# platform's locale encoding, which can mis-decode (or fail on) the non-ASCII
# characters in the fixture before the text is re-encoded to UTF-8 bytes.
with open('../tests/fixtures/tinystories_sample.txt', 'r', encoding='utf-8') as file:
    text = file.read()

def get_segments(text):
    """Pre-tokenize ``text`` and return each pre-token as a list of UTF-8 byte values.

    Args:
        text: The string to split.

    Returns:
        A list with one entry per regex match, in order of appearance; each
        entry is the list of integer byte values of that match encoded as UTF-8.
    """
    PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

    # Splitting into pre-tokens first keeps later pair counting and merging
    # inside a single pre-token, so merges never cross a pre-token boundary.
    return [
        list(match.group().encode("utf-8"))
        for match in re.finditer(PAT, text)
    ]
       
# Each merge step applies exactly one merge rule, i.e. introduces exactly one
# new token id. (Which pair gets merged is decided by get_best_pair: the most
# frequent pair, ties broken by the lexicographically greatest pair.)
def apply_merge(segments, merge_pair, new_token_id):
    """Replace every occurrence of ``merge_pair`` in each segment with ``new_token_id``.

    Args:
        segments: Iterable of token-id sequences.
        merge_pair: Two-element sequence ``(left, right)`` of adjacent token ids.
        new_token_id: Token id written in place of each matched pair.

    Returns:
        A new list of new lists; the input segments are not modified. Matching
        is left to right and non-overlapping, so ``[a, a, a]`` with pair
        ``(a, a)`` becomes ``[new, a]``.
    """
    merged_segments = []
    for tokens in segments:
        merged = []  # token sequence of this segment after the merge
        pos, end = 0, len(tokens)
        while pos < end:
            has_next = pos + 1 < end
            if has_next and tokens[pos] == merge_pair[0] and tokens[pos + 1] == merge_pair[1]:
                # The pair starts here: emit the merged id and skip both members.
                merged.append(new_token_id)
                pos += 2
            else:
                # Ordinary token: copy it through unchanged.
                merged.append(tokens[pos])
                pos += 1
        merged_segments.append(merged)
    return merged_segments

# Pick the most frequent pair; on a frequency tie, take the lexicographically
# greatest pair.
def get_best_pair(counter):
    """Return the winning pair and its frequency from a pair -> count mapping.

    Args:
        counter: Mapping from pair (tuple) to its occurrence count.

    Returns:
        ``None`` when ``counter`` is empty; otherwise ``(best_pair, max_freq)``
        where ``max_freq`` is the highest count and ``best_pair`` is the
        greatest pair (by normal tuple comparison) among those with that count.
    """
    if not counter:
        return None

    top_count = max(counter.values())
    # Among the pairs that share the top count, tuple ordering breaks the tie.
    winner = max(pair for pair, count in counter.items() if count == top_count)
    return winner, top_count

def count_pairs(segments):
    """Count adjacent token pairs inside each segment.

    Pairs are only formed between neighbours within the same segment, never
    across two segments.

    Args:
        segments: Iterable of token-id lists.

    Returns:
        A ``Counter`` mapping ``(left, right)`` to its number of occurrences.
    """
    return Counter(
        pair
        for tokens in segments
        # zip of the list with itself shifted by one yields each neighbour pair.
        for pair in zip(tokens, tokens[1:])
    )

def train_bpe(text, num_merges):
    """Learn byte-level BPE merges from ``text``.

    Args:
        text: Training corpus as a string.
        num_merges: Maximum number of merges to learn. Training stops earlier
            if no adjacent pair is left to merge (e.g. empty or very short
            text), so fewer merges may be returned.

    Returns:
        A tuple ``(token_ids, token_to_bytes, merges)``:
        - ``token_ids``: the corpus as a flat list of token ids after all merges;
        - ``token_to_bytes``: dict mapping every token id (0-255 for raw bytes,
          256 and up for merged tokens) to the bytes it stands for;
        - ``merges``: list of ``((left_id, right_id), new_token_id)`` in the
          order the merges were learned.
    """
    # Start from the raw UTF-8 bytes of each pre-token.
    segments = get_segments(text)
    # The 256 single-byte tokens form the initial vocabulary.
    token_to_bytes = {byte_value: bytes([byte_value]) for byte_value in range(256)}

    merges = []
    next_token_id = 256
    for _ in range(num_merges):
        # Recount pair frequencies and pick the pair to merge in this step.
        best = get_best_pair(count_pairs(segments))
        if best is None:
            # get_best_pair returns None when there are no pairs; unpacking it
            # would raise TypeError, so stop training here instead.
            break
        pair, _freq = best
        # Record the merge rule.
        merges.append((pair, next_token_id))
        # The new token stands for the concatenated bytes of its two parts.
        token_to_bytes[next_token_id] = token_to_bytes[pair[0]] + token_to_bytes[pair[1]]
        # Rewrite the corpus with the new token.
        segments = apply_merge(segments, pair, next_token_id)
        next_token_id += 1

    token_ids = [token for segment in segments for token in segment]
    return token_ids, token_to_bytes, merges

def decode(token_ids,token_to_bytes):
    """Turn a sequence of token ids back into a string.

    Args:
        token_ids: Iterable of token ids.
        token_to_bytes: Mapping from token id to the bytes it represents.

    Returns:
        The concatenated bytes of all tokens, decoded as UTF-8.

    Raises:
        KeyError: if a token id is missing from ``token_to_bytes``.
        UnicodeDecodeError: if the concatenated bytes are not valid UTF-8.
    """
    # Join once instead of `result += ...` in a loop: bytes are immutable, so
    # repeated concatenation copies the growing buffer on every step.
    # Decoding happens only after joining because a single token may hold just
    # part of a multi-byte character.
    raw = b"".join(token_to_bytes[token_id] for token_id in token_ids)
    return raw.decode('utf-8')

def encode(text,merges):
    """Encode ``text`` into token ids using previously learned merges.

    Args:
        text: The string to encode.
        merges: List of ``((left_id, right_id), new_token_id)`` entries, in
            the order they were learned during training.

    Returns:
        A flat list of token ids.
    """
    # Reuse the training-time pre-tokenizer so encoding and training always
    # split text in exactly the same way (same pattern, same byte conversion).
    segments = get_segments(text)

    # Apply the merges in the same order they were learned.
    for merge_pair, merged_id in merges:
        segments = apply_merge(segments, merge_pair, merged_id)

    # Flatten the per-pre-token lists into one id sequence.
    return [token for tokens in segments for token in tokens]

num_merges = 100

token_ids ,token_to_bytes,merges= train_bpe(text,num_merges)

# Round-trip check: decoding the trained ids must give back the corpus exactly.
result = decode(token_ids,token_to_bytes)
assert result == text, "decoding the trained token ids did not reproduce the text"
# print(result)
# for k,v in token_to_bytes.items():
#     if k >=256:
#         print(f'token id:{k},raw bytes:{v} ',end='')
#         print(f'token:{v.decode("utf-8")}')

# Re-encoding the corpus with the learned merges must yield the same ids that
# training ended with; assert it rather than comparing the printed lists by eye.
encoded_ids = encode(text,merges)
assert encoded_ids == token_ids, "encode() disagrees with the ids produced by training"
print('trained :',token_ids)
print('encoded :',encoded_ids)

trained : [10, 79, 110, 99, 101, 347, 111, 110, 258, 256, 105, 287, 265, 275, 280, 258, 282, 266, 116, 108, 101, 273, 111, 121, 281, 323, 262, 290, 292, 46, 290, 292, 282, 111, 118, 262, 269, 327, 120, 112, 108, 111, 275, 265, 261, 340, 108, 100, 258, 114, 278, 260, 264, 317, 46, 344, 307, 119, 326, 302, 121, 258, 348, 299, 286, 299, 115, 44, 282, 105, 342, 337, 97, 117, 116, 105, 102, 117, 108, 353, 115, 286, 289, 261, 267, 101, 285, 110, 271, 272, 112, 308, 32, 270, 258, 349, 275, 46, 328, 297, 330, 44, 290, 292, 280, 261, 97, 108, 107, 299, 286, 114, 278, 103, 104, 265, 349, 275, 261, 257, 110, 300, 288, 97, 287, 258, 99, 114, 111, 115, 115, 258, 303, 338, 320, 101, 99, 105, 97, 108, 353, 46, 32, 87, 257, 110, 290, 292, 307, 119, 325, 300, 280, 258, 348, 262, 33, 10, 324, 351, 44, 32, 226, 128, 156, 87, 296, 44, 286, 289, 32, 272, 258, 32, 275, 97, 276, 121, 258, 348, 299, 353, 33, 32, 67, 302, 32, 73, 273, 117, 121, 325, 63, 226, 128, 157, 10, 84, 257, 306, 111, 112, 342, 101, 112,

# 双向链表

In [1]:
from collections import deque
# deque supports appending at the left end; pushing 0..4 on the left leaves
# the most recently added value first.
linked_list = deque()
for i in range(5):
    linked_list.appendleft(i)

print(linked_list)

deque([4, 3, 2, 1, 0])
