In [2]:
import urllib.request
import re

### 读取数据

In [3]:
# Location of the sample text in the public GitHub repository, assembled from
# adjacent string literals to keep each line short.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt"
)
# Local file name the download is written to.
file_path = "the-verdict.txt"

# Fetch the file. As the last expression of the cell, the value returned by
# urlretrieve is displayed as the cell output.
urllib.request.urlretrieve(url=url, filename=file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x21d99d72870>)

### 读取文档

In [4]:
# Load the whole text file into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# Report the character count, then preview the first 99 characters.
total_chars = len(raw_text)
print("Total number", total_chars)
print(raw_text[0:99])

Total number 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


### 构建vocabulary(按单词切分)

In [5]:
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split return the delimiters too, so punctuation marks become tokens.
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)

# Trim every piece and drop the ones that are empty or whitespace-only.
processed_text = []
for piece in raw_pieces:
    trimmed = piece.strip()
    if trimmed:
        processed_text.append(trimmed)

print(len(processed_text))

4690


In [6]:
# Deduplicate the tokens and sort them so that ids can be assigned by position.
all_words = list(set(processed_text))
all_words.sort()

vocab_size = len(all_words)
print(vocab_size)

1130


In [7]:
# Build the vocabulary: map each unique token to its integer id, which is the
# token's position in the sorted token list.
vocab = {token: token_id for token_id, token in enumerate(all_words)}

# Preview the first 7 (token, id) entries. The loop variables are named so that
# this top-level loop does not rebind the built-in `id` in the global namespace.
for position, entry in enumerate(vocab.items()):
    print(entry)
    if position > 5:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)


### 构建分词器

In [8]:
# Simple word-level text tokenizer
class SimpleTokenizer1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on punctuation marks, double dashes and whitespace. Every
    resulting token must already be a key of the vocabulary; otherwise
    ``encoder`` raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) lookup.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.id_to_token = {token_id: token for token, token_id in vocab.items()}
        self.token_to_id = vocab

    def encoder(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per token, in order of appearance.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        # ':' and ';' are part of the delimiter set so that text is split the
        # same way as when the notebook builds its vocabulary; without them a
        # piece such as "word;" stays glued together and the lookup fails.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty / whitespace-only pieces produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.token_to_id[token] for token in tokens]

    def decoder(self, ids):
        """Convert a sequence of token ids back into a string.

        Args:
            ids: iterable of integer token ids.

        Returns:
            The tokens joined by single spaces, with the whitespace in front
            of the punctuation marks , . ? ! " ( ) ' removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.id_to_token[token_id] for token_id in ids)
        # Re-attach punctuation to the preceding token.
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [9]:
# Encode a short sample passage with the vocabulary-backed tokenizer.
tokenizer = SimpleTokenizer1(vocab) 
# The literal opens with """ immediately followed by a " character, so the
# sample text itself begins with a double quote.
text = """"It's the last he painted, you know," 
 Mrs. Gisburn said with pardonable pride.""" 
ids = tokenizer.encoder(text) 
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [10]:
# Turn the token ids back into a string and show the result.
decoded_text = tokenizer.decoder(ids)
print(decoded_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


### 引入特殊token（一般有bos，eos，pad，unk，endoftext）

In [11]:
# Start from the sorted unique tokens and append the two special tokens.
all_tokens = sorted(set(processed_text))
all_tokens += ["<|endoftext|>", "<|unk|>"]

# Rebuild the vocabulary; ids follow list order, so the appended special
# tokens come last.
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

1132


In [12]:
# Simple word-level text tokenizer with an unknown-token fallback
class SimpleTokenizer:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on punctuation marks, double dashes and whitespace. Tokens
    missing from the vocabulary are replaced by the ``"<|unk|>"`` entry, so the
    vocabulary is expected to contain that key.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) lookup.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.id_to_token = {token_id: token for token, token_id in vocab.items()}
        self.token_to_id = vocab

    def encoder(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per token; unknown tokens yield the id
            of ``"<|unk|>"``.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        # ':' and ';' are part of the delimiter set so that text is split the
        # same way as when the notebook builds its vocabulary.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        # The fallback must be spelled "<|unk|>", matching the special token
        # added to the vocabulary; "<unk>" is not a vocabulary key.
        tokens = [
            token if token in self.token_to_id else "<|unk|>"
            for token in tokens
        ]
        return [self.token_to_id[token] for token in tokens]

    def decoder(self, ids):
        """Convert a sequence of token ids back into a string.

        Args:
            ids: iterable of integer token ids.

        Returns:
            The tokens joined by single spaces, with the whitespace in front
            of the punctuation marks , . ? ! " ( ) ' removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.id_to_token[token_id] for token_id in ids)
        # Re-attach punctuation to the preceding token.
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

## 不同Tokenizer算法的学习

#### BPE算法

In [16]:
# 使用
!pip install tiktoken
import tiktoken

Collecting tiktoken
  Downloading tiktoken-0.12.0-cp312-cp312-win_amd64.whl.metadata (6.9 kB)
Downloading tiktoken-0.12.0-cp312-cp312-win_amd64.whl (878 kB)
   ---------------------------------------- 0.0/878.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/878.7 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/878.7 kB ? eta -:--:--
   ---------------------------------------- 878.7/878.7 kB 1.7 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.12.0


In [17]:
# Fetch the encoding registered in tiktoken under the name "gpt2".
tokenizer = tiktoken.get_encoding("gpt2")
# NOTE(review): the two adjacent literals are joined without a separating
# space, so the text contains "terracesof" — confirm this is intended.
text = ( 
 "Hello, do you like tea? <|endoftext|> In the sunlit terraces" 
 "of someunknownPlace." 
) 
# "<|endoftext|>" is passed in allowed_special for this encode call.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"}) 
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


##### BPE分词就是将词划分为子词：按词切分会导致词表过大，且无法解决OOV（未登录词）问题；按字符切分则难以把握语义结构；而子词切分不仅词表较小，还能把OOV词拆解成已知的子词来理解。

### 过程
#### 1. 在每个单词的结尾加上特殊字符 `_`
#### 2. 统计相邻字符出现的频率
#### 3. 合并出现频率最高的相邻字符对，将其作为一个新的子词加入词表
#### 4. 重复步骤2和3，直到达到预设的词表大小