chore: Add model vocab support #7117

Closed · wants to merge 70 commits

Commits (70)
1a9cf92
feat: Add stablelm vocab to gguf update
teleprint-me May 7, 2024
1355c24
chore: Apply update to get_vocab_base_pre method
teleprint-me May 7, 2024
e71789e
feat: Add stablelm vocab
teleprint-me May 7, 2024
8490705
feat: Add generate vocab shell script
teleprint-me May 7, 2024
d8694af
refactor: Clean up and organize url and dir paths
teleprint-me May 8, 2024
9d2fcd0
tests: Add test for qwen tokenizer
teleprint-me May 8, 2024
b8f8a96
feat: Add qwen pattern and tokenizer
teleprint-me May 8, 2024
3ae6c17
chore: Add missing command-r gguf vocab
teleprint-me May 8, 2024
4155e86
feat: Add support for qwen tokenizer
teleprint-me May 8, 2024
cbfed5b
chore: Update generate-vocab.sh script
teleprint-me May 8, 2024
f7dda38
note: Time of check to time of use
teleprint-me May 8, 2024
670e1c3
fix: Attempt to remove potential TOCTOU
teleprint-me May 8, 2024
69efb59
fix: Apply proper paths for handling qwen
teleprint-me May 8, 2024
906c3f7
fix: Apply fix to generate-vocab.sh script
teleprint-me May 8, 2024
0478552
chore: Add tiktoken to convert requirements
teleprint-me May 8, 2024
ccafb87
chore: Add model vocab
teleprint-me May 8, 2024
a6c5d5d
Merge branch 'master' into add-stablelm-hash
teleprint-me May 8, 2024
ca8acea
chore: Group qwen models together
teleprint-me May 8, 2024
c05d2a2
chore: Fix enumeration for qwen, olmo, and dbrx
teleprint-me May 8, 2024
17f2243
patch: Apply patch to fix config and SPM retrieval
teleprint-me May 8, 2024
de3d9e3
patch: Apply fix for downloading related model files
teleprint-me May 8, 2024
bc924e0
Merge branch 'master' into add-stablelm-hash
teleprint-me May 8, 2024
fc0007e
Merge branch 'master' into add-stablelm-hash
teleprint-me May 13, 2024
932ab05
Remove qwen and fix mauled imports
teleprint-me May 13, 2024
58551d0
chore: Apply updates to vocab models
teleprint-me May 13, 2024
4067536
change default temperature of OAI compat API from 0 to 1 (#7226)
Kartoffelsaft May 13, 2024
cfeb962
convert.py: Outfile default name change and additional metadata suppo…
mofosyne May 13, 2024
eaa8457
llama : rename jina tokenizers to v2 (#7249)
JoanFM May 13, 2024
3fa36ac
[SYCL] rm wait() (#7233)
arthw May 13, 2024
89550bb
perplexity: add BF16 vs. FP16 results (#7150)
JohannesGaessler May 13, 2024
d8b6869
llava-cli: fix base64 prompt (#7248)
Adriankhl May 13, 2024
7d85ea8
llama : less KV padding when FA is off (#7257)
ggerganov May 13, 2024
3dfaa1f
convert-hf : support direct Q8_0 conversion (#7234)
compilade May 13, 2024
95390eb
docs: Fix typo and update description for --embeddings flag (#7026)
louixs May 14, 2024
c7b8254
Add left recursion check: quit early instead of going into an infinit…
nuchi May 14, 2024
a94019b
move ndk code to a new library (#6951)
eltonkola May 14, 2024
e30a369
llama : disable pipeline parallelism with nkvo (#7265)
slaren May 14, 2024
7a2f768
ggml : add RPC backend (#6829)
rgerganov May 14, 2024
04a7f32
Revert "move ndk code to a new library (#6951)" (#7282)
mofosyne May 14, 2024
58962a2
server: free sampling contexts on exit (#7264)
stevegrubb May 14, 2024
37e2593
ggml : optimize for ppc64le using VSX intrinsics (ggml/784)
penghongbo May 12, 2024
b95c202
ggml : expose SSE3 and SSSE3 for MSVC when AVX is available (whisper/…
przemoc May 8, 2024
da894f9
ggml : try fix ppc64 (whisper/0)
ggerganov May 12, 2024
48296bf
metal : tune soft_max number of threads (whisper/0)
ggerganov May 13, 2024
2022675
sync : ggml
ggerganov May 14, 2024
4bc6f6e
metal : support FA without mask + add asserts (#7278)
ggerganov May 14, 2024
02f4122
script : sync ggml-rpc
ggerganov May 14, 2024
53332ff
server bench: fix bench not waiting for model load (#7284)
JohannesGaessler May 15, 2024
79bc1ea
ggml : add `ggml_upscale_ext` (ggml/814)
balisujohn May 15, 2024
4aae3a5
sync : ggml
ggerganov May 15, 2024
f3e8fc1
embedding : free the batch after execution (#7297)
dm4 May 15, 2024
da26e4d
Add missing " (#7303)
AidanBeltonS May 15, 2024
6fb91c1
ggml : tag ggml_tensor::backend as deprecated (#7290)
slaren May 15, 2024
dda1347
Avoid unnecessarily disabling CUDA graphs (#7302)
agray3 May 15, 2024
d1e2b6e
ggml : use dynamic thread scheduling for matrix multiplication (#6915)
kunnis May 15, 2024
41b9e5c
readme : remove stray double quote (#7310)
danbev May 15, 2024
b953ca3
Add support for properly optimized Windows ARM64 builds with LLVM and…
max-krasnyansky May 16, 2024
ad34bee
ci: fix bin/Release path for windows-arm64 builds (#7317)
max-krasnyansky May 16, 2024
a8d948c
doc: add references to hugging face GGUF-my-repo quantisation web too…
Vaibhavs10 May 16, 2024
d0a9c31
grammar, json, llama: replace push on emplace if it possible (#7273)
GermanAizek May 16, 2024
c7a926f
convert : get general.name from model dir, not its parent (#5615)
cebtenzzre May 16, 2024
3d210da
rpc : add command line arg for specifying backend memory
rgerganov May 15, 2024
99d5b28
rpc : get available mem for the CPU backend
rgerganov May 15, 2024
657f980
Revert "server bench: fix bench not waiting for model load (#7284)" (…
phymbert May 16, 2024
cd0e3d5
[Server] Added --verbose option to README [no ci] (#7335)
reuank May 17, 2024
e7c7ae8
patch: Add pre-tokenizer metadata to phi-2
teleprint-me May 17, 2024
9a81faf
patch: Fix jina vocab generation
teleprint-me May 17, 2024
8aa4937
feat: Make number of experts configurable
teleprint-me May 17, 2024
a7e0042
chore: Update gguf vocabularies
teleprint-me May 17, 2024
9269594
Merge branch 'master' into add-stablelm-hash
teleprint-me May 17, 2024
29 changes: 20 additions & 9 deletions convert-hf-to-gguf-update.py
@@ -22,17 +22,16 @@
# TODO: generate tokenizer tests for llama.cpp
#

import json
import logging
import os
import pathlib
import re

import requests
import sys
import json

from hashlib import sha256
from enum import IntEnum, auto
from hashlib import sha256

import requests
from transformers import AutoTokenizer

logging.basicConfig(level=logging.DEBUG)
@@ -72,6 +71,12 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "phi", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-1", },
{"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "mistral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
{"name": "mistral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
{"name": "mixtral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
{"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
@@ -314,12 +319,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

# generate commands for creating vocab files

logger.info("\nRun the following commands to generate the vocab files for testing:\n")
shscript = "#!/usr/bin/env bash\n\n"

for model in models:
name = model["name"]
tmpline = f"python3 convert-hf-to-gguf.py models/tokenizers/{name} --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
shscript += tmpline
logging.info(tmpline.strip())

print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
with open("generate-vocab.sh", "w", encoding="utf-8") as f:
f.writelines(shscript)
logging.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")

logger.info("\n")
logging.info("Run the following command to generate the vocab files for testing:")
logging.info("Enable execution: chmod +x generate-vocab.sh")
logging.info("Execute with ./generate-vocab.sh")
29 changes: 26 additions & 3 deletions convert-hf-to-gguf.py
@@ -2,17 +2,27 @@

from __future__ import annotations

import logging
import argparse
import contextlib
import json
import logging
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Callable,
ContextManager,
Iterable,
Iterator,
Sequence,
TypeVar,
cast,
)

import numpy as np
import torch
@@ -446,6 +456,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
# ref: https://huggingface.co/microsoft/phi-1
res = "phi"
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
# ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
res = "stablelm"
if chkhsh == "e750a9b14dfed9b73287639bd1ecda50c38fa6011138f2f609804c6dab9ed5c2":
# ref: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
res = "mistral-bpe"
if chkhsh == "e750a9b14dfed9b73287639bd1ecda50c38fa6011138f2f609804c6dab9ed5c2":
# ref: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
res = "mixtral-bpe"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
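Each `chkhsh` above is a sha256 digest of the tokenizer's output on a fixed probe string, so an unrecognized tokenizer fails loudly instead of silently picking the wrong pre-tokenizer. A minimal sketch of how such a checksum could be computed — the probe text and the exact hashed representation are assumptions standing in for the update script's `tokenizer.encode(chktxt)`:

```python
from hashlib import sha256

def vocab_checksum(token_ids: list[int]) -> str:
    # Hash the string form of the token-id list; any change in the
    # tokenizer's splitting rules changes the ids, hence the digest.
    return sha256(str(token_ids).encode()).hexdigest()

# Hypothetical token ids standing in for tokenizer.encode(chktxt)
ids = [1, 15043, 3186]
print(vocab_checksum(ids))
```

Two tokenizers that split the probe string identically will collide — note the mistral-bpe and mixtral-bpe branches above share one hash, so only one label can ever be returned for it.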
@@ -1703,6 +1725,7 @@ def set_gguf_parameters(self):
n_head = self.find_hparam(["num_attention_heads", "n_head"])

self.gguf_writer.add_name("Phi2")
self.gguf_writer.add_tokenizer_pre("gpt-2")
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))

self.gguf_writer.add_embedding_length(n_embd)
26 changes: 26 additions & 0 deletions generate-vocab.sh
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

python3 convert-hf-to-gguf.py models/tokenizers/llama-spm --outfile models/ggml-vocab-llama-spm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/llama-bpe --outfile models/ggml-vocab-llama-bpe.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/phi-3 --outfile models/ggml-vocab-phi-3.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/deepseek-llm --outfile models/ggml-vocab-deepseek-llm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/deepseek-coder --outfile models/ggml-vocab-deepseek-coder.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/falcon --outfile models/ggml-vocab-falcon.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/bert-bge --outfile models/ggml-vocab-bert-bge.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mpt --outfile models/ggml-vocab-mpt.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/starcoder --outfile models/ggml-vocab-starcoder.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/gpt-2 --outfile models/ggml-vocab-gpt-2.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/phi --outfile models/ggml-vocab-phi.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/stablelm --outfile models/ggml-vocab-stablelm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mistral-bpe --outfile models/ggml-vocab-mistral-bpe.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mistral-spm --outfile models/ggml-vocab-mistral-spm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mixtral-bpe --outfile models/ggml-vocab-mixtral-bpe.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mixtral-spm --outfile models/ggml-vocab-mixtral-spm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/refact --outfile models/ggml-vocab-refact.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/command-r --outfile models/ggml-vocab-command-r.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/qwen2 --outfile models/ggml-vocab-qwen2.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/olmo --outfile models/ggml-vocab-olmo.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/dbrx --outfile models/ggml-vocab-dbrx.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/jina-v2-en --outfile models/ggml-vocab-jina-v2-en.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/jina-v2-es --outfile models/ggml-vocab-jina-v2-es.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/jina-v2-de --outfile models/ggml-vocab-jina-v2-de.gguf --vocab-only
3 changes: 1 addition & 2 deletions gguf-py/gguf/tensor_mapping.py
@@ -384,7 +384,7 @@ class TensorNameMap:

mapping: dict[str, tuple[MODEL_TENSOR, str]]

def __init__(self, arch: MODEL_ARCH, n_blocks: int):
def __init__(self, arch: MODEL_ARCH, n_blocks: int, n_experts: int = 60):
self.mapping = {}
for tensor, keys in self.mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]:
@@ -398,7 +398,6 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int):
if tensor not in MODEL_TENSORS[arch]:
continue
# TODO: make this configurable
n_experts = 60
for xid in range(n_experts):
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
self.mapping[tensor_name] = (tensor, tensor_name)
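The loop above expands one tensor name per expert from a format template, and the change makes `n_experts` a constructor argument instead of a hard-coded 60. A minimal sketch of that expansion, using a hypothetical template string in place of the real `TENSOR_NAMES` entry:

```python
def expand_expert_names(template: str, bid: int, n_experts: int) -> list[str]:
    # Format one name per expert id, mirroring the TensorNameMap loop.
    return [template.format(bid=bid, xid=xid) for xid in range(n_experts)]

names = expand_expert_names("blk.{bid}.ffn_gate.{xid}.weight", bid=0, n_experts=2)
print(names)  # → ['blk.0.ffn_gate.0.weight', 'blk.0.ffn_gate.1.weight']
```

Threading the count through the constructor means models with other expert counts (e.g. 8 for Mixtral-style MoE) map cleanly without editing library code.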
9 changes: 9 additions & 0 deletions llama.cpp
@@ -4458,6 +4458,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "command-r") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
} else if (
tokenizer_pre == "qwen") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN;
} else if (
tokenizer_pre == "qwen2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
@@ -12354,6 +12357,12 @@ struct llm_tokenizer_bpe {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_QWEN:
word_collection = unicode_regex_split(text, {
// original regex from tokenization_qwen.py
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
});
break;
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
word_collection = unicode_regex_split(text, {
// original regex from tokenizer.json
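The Qwen pattern above splits raw text into contractions, letter runs, single digits, punctuation runs, and whitespace before any BPE merges happen. A rough sketch of that splitting in Python — note this is an ASCII-only simplification, since the real pattern uses Unicode `\p{L}`/`\p{N}` classes and `\r\n` handling that stdlib `re` does not support:

```python
import re

# ASCII-only approximation of the Qwen pre-tokenizer split (assumption:
# the real regex uses \p{L}/\p{N} and special-cases \r\n sequences).
PAT = re.compile(
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"  # contractions, case-insensitive
    r"| ?[A-Za-z]+"                  # optional leading space + letter run
    r"|[0-9]"                        # one digit at a time, as in \p{N}
    r"| ?[^\sA-Za-z0-9]+"            # optional space + punctuation run
    r"|\s+"                          # remaining whitespace
)

print(PAT.findall("I'm 25 years old!"))
# → ['I', "'m", ' ', '2', '5', ' years', ' old', '!']
```

Splitting digits one at a time (rather than as `\p{N}+` runs) is the visible difference from the QWEN2 pattern that follows it in the switch.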
7 changes: 4 additions & 3 deletions llama.h
@@ -81,9 +81,10 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
LLAMA_VOCAB_PRE_TYPE_QWEN = 10,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
};

// note: these values should be synchronized with ggml_rope
Binary file modified models/ggml-vocab-bert-bge.gguf
Binary file modified models/ggml-vocab-command-r.gguf
Binary file modified models/ggml-vocab-deepseek-coder.gguf
Binary file modified models/ggml-vocab-deepseek-llm.gguf
Binary file modified models/ggml-vocab-falcon.gguf
Binary file modified models/ggml-vocab-gpt-2.gguf
Binary file modified models/ggml-vocab-llama-bpe.gguf
2 changes: 0 additions & 2 deletions models/ggml-vocab-llama-bpe.gguf.inp
@@ -104,5 +104,3 @@

🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__
Việt
__ggml_vocab_test__
1 change: 0 additions & 1 deletion models/ggml-vocab-llama-bpe.gguf.out
@@ -41,4 +41,3 @@
8765 8765 1644
8765 8765 8765
198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
101798
Binary file modified models/ggml-vocab-llama-spm.gguf
Binary file modified models/ggml-vocab-mpt.gguf
Binary file modified models/ggml-vocab-phi-3.gguf
Binary file modified models/ggml-vocab-qwen2.gguf
Binary file modified models/ggml-vocab-refact.gguf
Binary file modified models/ggml-vocab-stablelm.gguf
Binary file modified models/ggml-vocab-starcoder.gguf
1 change: 1 addition & 0 deletions requirements/requirements-convert-hf-to-gguf-update.txt
@@ -1,2 +1,3 @@
-r ./requirements-convert.txt
torch~=2.1.1
tiktoken~=0.6.0