In [5]:
# %%
import os

# Get the current working directory
current_directory = os.getcwd()

# Construct the full path to the file
file_path = os.path.join(current_directory, "Training Set", "result.txt")

# Open the file and read its contents
with open(file_path, "r") as file:
    text = file.read()


tokens = text.encode("utf-8") # raw bytes
tokens = list(map(int, tokens)) # convert to a list of integers in range 0..255 for convenience
print (text[:52])
print (tokens[:52])




ܘܡܠܟܐ ܕܘܝܕ ܣܐܒ ܘܥܠ ܒܫܢܝܐ ܘܡܟܣܝܢ ܗܘܘ ܠܗ ܒܠܒܘܫܐ ܘܠܐ ܫܚ
[220, 152, 220, 161, 220, 160, 220, 159, 220, 144, 32, 220, 149, 220, 152, 220, 157, 220, 149, 32, 220, 163, 220, 144, 220, 146, 32, 220, 152, 220, 165, 220, 160, 32, 220, 146, 220, 171, 220, 162, 220, 157, 220, 144, 32, 220, 152, 220, 161, 220, 159, 220]


In [6]:
import regex as re
pattern = re.compile(r""" ?ܘ(?=\p{L}+)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# This regex pattern is used to tokenize the text into words and other meaningful units
# Here's a breakdown of the pattern:

# ܘ(?=\p{L}+)     : Matches 'ܘ' (Syriac Waw) only if followed by one or more letters
#                   This is likely for a specific linguistic feature in Syriac

# \p{L}+          : Matches one or more Unicode letters

# \p{N}+          : Matches one or more Unicode numbers

# [^\s\p{L}\p{N}]+: Matches one or more characters that are not whitespace, letters, or numbers

# \s+(?!\S)       : Matches one or more whitespace characters not followed by a non-whitespace character
#                   This is likely to catch trailing spaces

# \s+             : Matches one or more whitespace characters

# Each part of the pattern is preceded by a space and a question mark ( ?)
# This makes the leading space optional for each token

# The pipe symbol (|) separates each part of the pattern, allowing any of these to match



text_words = re.findall(pattern, text) # devide the text into words according to gpt2pat pattern
print(text_words[:52])
print(type(text_words))
words_tokens = []
for i in text_words:
    token_word = i.encode("utf-8")
    token = list(map(int, token_word))
    words_tokens.append(token)
print(words_tokens[:3])



['ܘ', 'ܡܠܟܐ', ' ܕܘܝܕ', ' ܣܐܒ', ' ܘ', 'ܥܠ', ' ܒܫܢܝܐ', ' ܘ', 'ܡܟܣܝܢ', ' ܗܘܘ', ' ܠܗ', ' ܒܠܒܘܫܐ', ' ܘ', 'ܠܐ', ' ܫܚܢ', ' ܘ', 'ܐܡܪܘ', ' ܠܗ', ' ܥܒܕܘܗܝ', ' ܗܐ', ' ܥܒܕܝܟ', ' ܩܕܡܝܟ', ' ܢܒܥܘܢ', ' ܠܡܪܢ', ' ܡܠܟܐ', ' ܥܠܝܡܬܐ', ' ܒܬܘܠܬܐ', ' ܘ', 'ܬܩܘܡ', ' ܩܕܡ', ' ܡܠܟܐ', ' ܘ', 'ܬܗܘܐ', ' ܠܗ', ' ܡܫܡܫܢܝܬܐ', ' ܘ', 'ܬܫܟܒ', ' ܒܥܘܒܟ', ' ܘ', 'ܢܫܚܢ', ' ܠܡܪܢ', ' ܡܠܟܐ', ' ܘ', 'ܒܥܘ', ' ܥܠܝܡܬܐ', ' ܕܫܦܝܪܐ', ' ܒܟܠܗ', ' ܬܚܘܡܐ', ' ܕܐܝܣܪܝܠ', ' ܘ', 'ܐܫܟܚܘ', ' ܠܐܒܝܫܓ']
<class 'list'>
[[220, 152], [220, 161, 220, 160, 220, 159, 220, 144], [32, 220, 149, 220, 152, 220, 157, 220, 149]]


In [60]:
def get_stats(ids):
    counts = {}
    for id in ids:    
        for pair in zip(id, id[1:]): # Pythonic way to iterate consecutive elements
            if pair[1] != 220:  # Check to avoid pairs ending with 220
                counts[pair] = counts.get(pair, 0) + 1
    return counts


def simple_get_stats(ids):
    counts = {}
    for pair in zip(ids, ids[1:]):
        if pair[1] != ord(' ') and pair[1] != 220:  # Check to avoid pairs ending with 220
            counts[pair] = counts.get(pair, 0) + 1
    return counts

stats = get_stats(words_tokens)
top_pair = max(stats, key=stats.get)
print (top_pair, stats[top_pair])

(220, 144) 44750


In [61]:
def merge(ids, pair, idx):
  # in the list of ints (ids), replace all consecutive occurences of pair with the new token idx
  newids = []
  for sublist in ids:
        i = 0
        new_sublist = []
        while i < len(sublist):
            # if we are not at the very last position AND the pair matches, replace it
            if i < len(sublist) - 1 and sublist[i] == pair[0] and sublist[i + 1] == pair[1]:
                new_sublist.append(idx)
                i += 2
            else:
                new_sublist.append(sublist[i])
                i += 1
        newids.append(new_sublist)
  return newids

def simple_merge(ids, pair, idx):
  newids = []
  i = 0
  while i < len(ids):
    if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
      newids.append(idx)
      i += 2
    else:
      newids.append(ids[i])
      i += 1
  return newids

def length(ids):
    return sum(len(id) for id in ids)

print("length:", length(words_tokens))
tokens2 = merge(words_tokens, top_pair, 256)
print(tokens2[:52])
print("length:", length(tokens2))

length: 751752
[[220, 152], [220, 161, 220, 160, 220, 159, 256], [32, 220, 149, 220, 152, 220, 157, 220, 149], [32, 220, 163, 256, 220, 146], [32, 220, 152], [220, 165, 220, 160], [32, 220, 146, 220, 171, 220, 162, 220, 157, 256], [32, 220, 152], [220, 161, 220, 159, 220, 163, 220, 157, 220, 162], [32, 220, 151, 220, 152, 220, 152], [32, 220, 160, 220, 151], [32, 220, 146, 220, 160, 220, 146, 220, 152, 220, 171, 256], [32, 220, 152], [220, 160, 256], [32, 220, 171, 220, 154, 220, 162], [32, 220, 152], [256, 220, 161, 220, 170, 220, 152], [32, 220, 160, 220, 151], [32, 220, 165, 220, 146, 220, 149, 220, 152, 220, 151, 220, 157], [32, 220, 151, 256], [32, 220, 165, 220, 146, 220, 149, 220, 157, 220, 159], [32, 220, 169, 220, 149, 220, 161, 220, 157, 220, 159], [32, 220, 162, 220, 146, 220, 165, 220, 152, 220, 162], [32, 220, 160, 220, 161, 220, 170, 220, 162], [32, 220, 161, 220, 160, 220, 159, 256], [32, 220, 165, 220, 160, 220, 157, 220, 161, 220, 172, 256], [32, 220, 146, 220, 172, 22

In [62]:


# ---
vocab_size = 6000 # the desired final vocabulary size
num_merges = vocab_size - 256
ids = list(words_tokens) # copy so we don't destroy the original list

merges = {} # (int, int) -> int
for i in range(num_merges):
  stats = get_stats(ids)
  pair = max(stats, key=stats.get)
  idx = 256 + i
  ids = merge(ids, pair, idx)
  merges[pair] = idx

In [63]:
print("tokens length:", length(words_tokens))
print("ids length:", length(ids))
print(f"compression ratio: {length(words_tokens) / length(ids):.2f}X")

tokens length: 751752
ids length: 111069
compression ratio: 6.77X


### Generate Vocab


In [64]:
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

vocab_converted= {idx: vocab[idx].decode("utf-8") for idx in vocab if idx > 255}

merges_converted = {str(key): value for key, value in merges.items()}

In [67]:
import json


file_path_voc = os.path.join(current_directory, f"vocabulary_{vocab_size}.json")
file_path_merge = os.path.join(current_directory, f"merges_{vocab_size}.json")


with open(file_path_voc, "w", encoding="utf-8") as file:
    json.dump(vocab_converted, file, indent=4, ensure_ascii=False)
with open(file_path_merge, "w") as file:
    json.dump(merges_converted, file, indent=4)   

    
print(f"Vocabulary and merges saved to {file_path}")

Vocabulary and merges saved to /home/zhanchen/Dropbox/Projects/Tokenizer/Learning Set/result.txt
