From 86016b7d8d23f9baad82ff1c4bce4134b632ace0 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 17 May 2024 13:43:12 +0300 Subject: [PATCH 1/2] convert-hf-to-gguf-update: automate updating --- convert-hf-to-gguf-update.py | 20 ++++++++++++++------ convert-hf-to-gguf.py | 2 ++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 14aa0c45a6a87..bbd66464b7c0c 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -20,11 +20,13 @@ # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp -# TODO: automate the update of convert-hf-to-gguf.py # import logging import os +import pathlib +import re + import requests import sys import json @@ -135,7 +137,6 @@ def download_file_with_auth(url, token, save_path): download_file_with_auth(url, token, save_path) # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: -# TODO: auto-update convert-hf-to-gguf.py with the generated function src_ifs = "" for model in models: @@ -224,11 +225,18 @@ def get_vocab_base_pre(self, tokenizer) -> str: return res """ -print(src_func) # noqa: NP100 +convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") +convert_py = convert_py_pth.read_text() +convert_py = re.sub( + r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", + lambda m: m.group(1) + src_func + m.group(3), + convert_py, + flags=re.DOTALL | re.MULTILINE, +) -logger.info("\n") -logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!") -logger.info("\n") +convert_py_pth.write_text(convert_py) + +logger.info("+++ convert-hf-to-gguf.py was updated") # generate tests for each tokenizer model diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index cd875fa4af6af..17cec08f7e234 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -402,6 +402,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: # NOTE: this function is generated by convert-hf-to-gguf-update.py # do not modify it manually! # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # is specific for the BPE pre-tokenizer used by the model @@ -489,6 +490,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.debug(f"chkhsh: {chkhsh}") return res + # Marker: End get_vocab_base_pre def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() From 3afb4947591b0f19fbdf2a1a672beeb185a17bb0 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 17 May 2024 13:58:30 +0300 Subject: [PATCH 2/2] convert-hf-to-gguf-update: improve download * share requests session for performance * create directories only when needed, don't skip downloads when empty directory encountered * be more graceful about errors --- convert-hf-to-gguf-update.py | 63 +++++++++++++----------------------- 1 file changed, 23 insertions(+), 40 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index bbd66464b7c0c..27983fadf4ac5 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -37,6 +37,7 @@ logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("convert-hf-to-gguf-update") +sess = requests.Session() class TOKENIZER_TYPE(IntEnum): @@ -81,60 +82,42 @@ class TOKENIZER_TYPE(IntEnum): {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, ] -# make directory "models/tokenizers" if it doesn't exist -if not os.path.exists("models/tokenizers"): - os.makedirs("models/tokenizers") - def download_file_with_auth(url, token, save_path): headers = {"Authorization": f"Bearer {token}"} - response = requests.get(url, headers=headers) - if response.status_code == 200: - with open(save_path, 'wb') as f: - f.write(response.content) - logger.info(f"File {save_path} downloaded successfully") - else: - logger.info(f"Failed to download file. Status code: {response.status_code}") + response = sess.get(url, headers=headers) + response.raise_for_status() + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with open(save_path, 'wb') as f: + f.write(response.content) + logger.info(f"File {save_path} downloaded successfully") -# download the tokenizer models -for model in models: +def download_model(model): name = model["name"] repo = model["repo"] tokt = model["tokt"] - if not os.path.exists(f"models/tokenizers/{name}"): - os.makedirs(f"models/tokenizers/{name}") - else: - logger.info(f"Directory models/tokenizers/{name} already exists - skipping") - continue - - logger.info(f"Downloading {name} to models/tokenizers/{name}") + os.makedirs(f"models/tokenizers/{name}", exist_ok=True) - url = f"{repo}/raw/main/config.json" - save_path = f"models/tokenizers/{name}/config.json" - download_file_with_auth(url, token, save_path) + files = ["config.json", "tokenizer.json", "tokenizer_config.json"] + if tokt == TOKENIZER_TYPE.SPM: + files.append("tokenizer.model") - url = f"{repo}/raw/main/tokenizer.json" - save_path = f"models/tokenizers/{name}/tokenizer.json" - download_file_with_auth(url, token, save_path) + for file in files: + save_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(save_path): + logger.info(f"{name}: File {save_path} already exists - skipping") + continue + download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) - # if downloaded file is less than 1KB, we likely need to download an LFS instead - if os.path.getsize(save_path) < 1024: - # remove the file - os.remove(save_path) - url = f"{repo}/resolve/main/tokenizer.json" - save_path = f"models/tokenizers/{name}/tokenizer.json" - download_file_with_auth(url, token, save_path) - if tokt == TOKENIZER_TYPE.SPM: - url = f"{repo}/resolve/main/tokenizer.model" - save_path = f"models/tokenizers/{name}/tokenizer.model" - download_file_with_auth(url, token, save_path) +for model in models: + try: + download_model(model) + except Exception as e: + logger.error(f"Failed to download model {model['name']}. Error: {e}") - url = f"{repo}/raw/main/tokenizer_config.json" - save_path = f"models/tokenizers/{name}/tokenizer_config.json" - download_file_with_auth(url, token, save_path) # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: