In [1]:
import sys
import torch
import os

from gguf import GGUFWriter, GGMLQuantizationType
from transformers import AutoModel, AutoTokenizer

# Send both HTTP and HTTPS traffic of this process through the local proxy.
os.environ.update({
    "http_proxy": "http://127.0.0.1:2080",
    "https_proxy": "http://127.0.0.1:2080",
})

def convert_hf(repo_id, output_path, float_type='f16'):
    """Convert a Hugging Face model into a GGUF file.

    Loads the tokenizer and model for ``repo_id``, saves the tokenizer JSON
    next to ``output_path`` (named ``<basename of repo_id>.tokenizer.json``),
    prints a summary of the config/vocab, and writes the model
    hyper-parameters, the padding token id and every ``state_dict`` tensor
    to ``output_path`` using the ``"JinaBert"`` architecture tag.

    Args:
        repo_id: Model identifier handed to ``from_pretrained``.
        output_path: Destination path of the GGUF file. Its parent directory
            is created if it does not exist yet.
        float_type: ``'f16'`` or ``'f32'``. Target dtype for tensors; tensors
            whose name contains ``emb_ln``, ``norm1``, ``norm2`` or ``bias``
            are always stored as float32.

    Raises:
        ValueError: If ``float_type`` is neither ``'f16'`` nor ``'f32'``.
    """
    # Validate first, before anything is downloaded or written. Raising
    # (rather than calling sys.exit) lets the caller/notebook handle the error.
    if float_type not in ('f16', 'f32'):
        raise ValueError(f'Float type must be f16 or f32, got: {float_type}')
    qtype = GGMLQuantizationType[float_type.upper()]
    dtype0 = {'f16': torch.float16, 'f32': torch.float32}[float_type]

    # Both the tokenizer JSON and the GGUF file land in this directory, so
    # make sure it exists ('' means the current working directory).
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    tokenizer_json = os.path.join(output_dir, os.path.basename(repo_id) + ".tokenizer.json")
    # Second positional argument is `pretty`; False writes compact JSON.
    tokenizer.backend_tokenizer.save(tokenizer_json, False)
    model = AutoModel.from_pretrained(repo_id, add_pooling_layer=False, trust_remote_code=True)

    config = model.config
    print(config)

    # print model
    param_keys = [
        'vocab_size', 'hidden_size', 'num_hidden_layers',
        'num_attention_heads', 'intermediate_size', 'type_vocab_size', 'pad_token_id'
    ]
    print('PARAMS')
    for k in param_keys:
        v = getattr(config, k)
        print(f'{k:<24s} = {v}')
    print()

    # print vocab
    vocab_keys = [
        'vocab_size', 'pad_token_id', 'unk_token_id', 'cls_token_id', 'sep_token_id'
    ]
    print('VOCAB')
    for k in vocab_keys:
        v = getattr(tokenizer, k)
        print(f'{k:24s} = {v}')

    # start to write GGUF file
    gguf_writer = GGUFWriter(output_path, "JinaBert")
    try:
        # write metadata
        gguf_writer.add_name(repo_id)
        gguf_writer.add_description('gguf model for embeddings.cpp')
        gguf_writer.add_file_type(qtype)

        # write model params
        gguf_writer.add_uint32('vocab_size', config.vocab_size)
        gguf_writer.add_uint32('hidden_size', config.hidden_size)
        gguf_writer.add_uint32('intermediate_size', config.intermediate_size)
        gguf_writer.add_uint32('num_attention_heads', config.num_attention_heads)
        gguf_writer.add_uint32('num_hidden_layers', config.num_hidden_layers)
        gguf_writer.add_uint32('type_vocab_size', config.type_vocab_size)
        gguf_writer.add_uint32('pad_token_id', config.pad_token_id)
        gguf_writer.add_float32('layer_norm_eps', config.layer_norm_eps)
        gguf_writer.add_float32('rope_theta', config.rope_theta)

        # write the tokenizer special token(we only need to know [PAD])
        KEY_PAD_ID = 'tokenizer.ggml.padding_token_id'
        gguf_writer.add_int32(KEY_PAD_ID, tokenizer.pad_token_id)

        # write tensors
        print('TENSORS')
        for name, data in model.state_dict().items():
            # Tensors matched by name below stay in float32; everything else
            # is cast to the requested float type.
            if 'emb_ln' in name or 'norm1' in name or 'norm2' in name or 'bias' in name:
                dtype = torch.float32
            else:
                dtype = dtype0
            shape_str = str(list(data.shape))
            print(f'{name:64s} = {shape_str:16s} {data.dtype} → {dtype}')

            # do conversion
            data = data.to(dtype)

            # add to gguf output
            gguf_writer.add_tensor(name, data.numpy())

        # flush header, key/value metadata and tensor data, in that order
        gguf_writer.write_header_to_file()
        gguf_writer.write_kv_data_to_file()
        gguf_writer.write_tensors_to_file()
    finally:
        # Always release the writer's file handle, even if writing failed.
        gguf_writer.close()

    # print success
    print()
    print(f'GGML model written to {output_path}')

# Hugging Face repository of the model to convert.
repo_id = "Snowflake/snowflake-arctic-embed-m-v2.0"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Convert the selected model to a half-precision GGUF file.
convert_hf(
    repo_id=repo_id,
    output_path="../models/snowflake-arctic-embed-m-v2.0.fp16.gguf",
    float_type="f16",
)

Override attn_implementation='sdpa' to 'eager' as use_memory_efficient_attention='true'


GteConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "GteModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "Snowflake/snowflake-arctic-embed-m-v2.0--configuration_hf_alibaba_nlp_gte.GteConfig",
    "AutoModel": "Snowflake/snowflake-arctic-embed-m-v2.0--modeling_hf_alibaba_nlp_gte.GteModel"
  },
  "classifier_dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "layer_norm_type": "layer_norm",
  "logn_attention_clip1": false,
  "logn_attention_scale": false,
  "matryoshka_dimensions": [
    256
  ],
  "max_position_embeddings": 8192,
  "model_type": "gte",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pack_qkv": true,
  "pad_token_id": 1,
  "position_embedding_type": "rope",
  "rope_scaling": null,
  "rope_theta": 160000,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3",
  "type_