From 04c9c9d811ae760e2a3757a94fa18c1c8aad98e9 Mon Sep 17 00:00:00 2001
From: hoangdz
Date: Sat, 7 Sep 2024 23:50:02 +0900
Subject: [PATCH 1/3] add phi2 tokenizer

---
 convert_hf_to_gguf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 0a9bbc8294ef7..afb54e3bc71f8 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -624,6 +624,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
             res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi2"
 
         if res is None:
             logger.warning("\n")

From 87d8636e938f2fc37c1f1419322bb3117b736bec Mon Sep 17 00:00:00 2001
From: hoangdz
Date: Sun, 8 Sep 2024 15:14:24 +0900
Subject: [PATCH 2/3] add phi name to convert_hf_to_gguf_update.py

---
 convert_hf_to_gguf_update.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index ff4955f9c614e..ab50da3686118 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -97,6 +97,7 @@ class TOKENIZER_TYPE(IntEnum):
     {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
     {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+    {"name": "phi", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]

From 195a062986cca8253a92d630af131f3e72bd8eaf Mon Sep 17 00:00:00 2001
From: hoangdz
Date: Mon, 9 Sep 2024 16:20:39 +0900
Subject: [PATCH 3/3] make tokenizer_pre consistent; llama.cpp work

---
 convert_hf_to_gguf.py        | 2 +-
 convert_hf_to_gguf_update.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index afb54e3bc71f8..084ae04731596 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -626,7 +626,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             res = "exaone"
         if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
             # ref: https://huggingface.co/microsoft/phi-2
-            res = "phi2"
+            res = "phi-2"
 
         if res is None:
             logger.warning("\n")

diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index ab50da3686118..57650af64240b 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -97,7 +97,7 @@ class TOKENIZER_TYPE(IntEnum):
     {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
     {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
-    {"name": "phi", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
+    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]
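
Note: the chkhsh value matched in get_vocab_base_pre() is derived by hashing the token IDs that the upstream Hugging Face tokenizer produces for a fixed test string; convert_hf_to_gguf_update.py prints the value to paste into the if-chain patched above. Below is a minimal sketch of that derivation, assuming the sha256-over-str(token_ids) scheme used by the update script; the test string here is only a placeholder, so the output will not equal fcace8b9... unless the real chktxt from the script is substituted.

    # Sketch only: mirrors how convert_hf_to_gguf_update.py derives chkhsh
    # (assumption: sha256 of the str() of the token-ID list for a fixed test string).
    from hashlib import sha256
    from transformers import AutoTokenizer  # pip install transformers

    # Placeholder: the real test string (chktxt) is defined in convert_hf_to_gguf_update.py.
    chktxt = "<substitute the chktxt string from convert_hf_to_gguf_update.py>"

    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
    chktok = tokenizer.encode(chktxt)                     # token IDs for the test string
    chkhsh = sha256(str(chktok).encode()).hexdigest()     # value compared in get_vocab_base_pre()
    print(chkhsh)  # with the real chktxt this should match the hash added in PATCH 1/3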