In [1]:
import sys
import torch
import os

from gguf import GGUFWriter, GGMLQuantizationType
from transformers import AutoModel, AutoTokenizer

# Point both proxy environment variables at the local proxy on port 2080
# before anything is downloaded in this notebook.
os.environ.update(
    http_proxy="http://127.0.0.1:2080",
    https_proxy="http://127.0.0.1:2080",
)

def convert_hf(repo_id, output_path, float_type='f16'):
    """Export a Hugging Face model and its tokenizer to a GGUF file.

    The tokenizer and model for ``repo_id`` are loaded with ``from_pretrained``.
    The tokenizer is saved as ``<basename(repo_id)>.tokenizer.json`` in the
    directory of ``output_path``; selected config values, the padding token id
    and every tensor of ``model.state_dict()`` are written to ``output_path``.

    Args:
        repo_id: Model identifier handed to ``from_pretrained``.
        output_path: Destination of the GGUF file. Its parent directory is
            created when it does not exist yet.
        float_type: ``'f16'`` or ``'f32'``. Storage dtype for every tensor
            that is not forced to float32 by its name (see the tensor loop).

    Raises:
        ValueError: If ``float_type`` is neither ``'f16'`` nor ``'f32'``.
    """
    # Validate up front, before anything is downloaded or written. An
    # exception is raised (rather than calling sys.exit) so that a notebook
    # kernel or an importing program is not asked to terminate.
    if float_type not in ('f16', 'f32'):
        raise ValueError(f'Float type must be f16 or f32, got: {float_type}')
    qtype = GGMLQuantizationType[float_type.upper()]
    dtype0 = {'f16': torch.float16, 'f32': torch.float32}[float_type]

    # load tokenizer and store its JSON next to the GGUF output
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # Without this, saving fails when the target directory is missing.
        os.makedirs(output_dir, exist_ok=True)
    tokenizer_json = os.path.join(output_dir, os.path.basename(repo_id) + ".tokenizer.json")
    # Only the backend tokenizer file is written (second argument: pretty=False).
    tokenizer._tokenizer.save(tokenizer_json, False)

    # NOTE: trust_remote_code=True lets transformers run Python code shipped
    # with the model repository; only use this with repositories you trust.
    model = AutoModel.from_pretrained(repo_id, add_pooling_layer=False, trust_remote_code=True)

    config = model.config
    print(config)

    # print model
    param_keys = [
        'vocab_size', 'hidden_size', 'num_hidden_layers',
        'num_attention_heads', 'intermediate_size', 'type_vocab_size', 'pad_token_id'
    ]
    print('PARAMS')
    for k in param_keys:
        v = getattr(config, k)
        print(f'{k:<24s} = {v}')
    print()

    # print vocab
    vocab_keys = [
        'vocab_size', 'pad_token_id', 'unk_token_id', 'cls_token_id', 'sep_token_id'
    ]
    print('VOCAB')
    for k in vocab_keys:
        v = getattr(tokenizer, k)
        print(f'{k:24s} = {v}')

    # start to write GGUF file
    gguf_writer = GGUFWriter(output_path, "JinaBert")
    try:
        # write metadata
        gguf_writer.add_name(repo_id)
        gguf_writer.add_description('gguf model for embeddings.cpp')
        gguf_writer.add_file_type(qtype)

        # write model params
        gguf_writer.add_uint32('vocab_size', config.vocab_size)
        gguf_writer.add_uint32('hidden_size', config.hidden_size)
        gguf_writer.add_uint32('intermediate_size', config.intermediate_size)
        gguf_writer.add_uint32('num_attention_heads', config.num_attention_heads)
        gguf_writer.add_uint32('num_hidden_layers', config.num_hidden_layers)
        gguf_writer.add_uint32('type_vocab_size', config.type_vocab_size)
        gguf_writer.add_uint32('pad_token_id', config.pad_token_id)
        gguf_writer.add_float32('layer_norm_eps', config.layer_norm_eps)
        gguf_writer.add_float32('rope_theta', config.rope_theta)

        # write the tokenizer special token (we only need to know [PAD])
        KEY_PAD_ID = 'tokenizer.ggml.padding_token_id'
        gguf_writer.add_int32(KEY_PAD_ID, tokenizer.pad_token_id)

        # write tensors
        print('TENSORS')
        for name, data in model.state_dict().items():
            # Tensors whose name contains one of these substrings are always
            # stored as float32; everything else uses the requested float type.
            if 'attn_ln' in name or 'mlp' in name or 'bias' in name or 'proj' in name or 'LayerNorm' in name:
                dtype = torch.float32
            else:
                dtype = dtype0
            shape_str = str(list(data.shape))
            print(f'{name:64s} = {shape_str:16s} {data.dtype} → {dtype}')

            # do conversion
            data = data.to(dtype)

            # add to gguf output
            gguf_writer.add_tensor(name, data.numpy())

        # flush everything to disk
        gguf_writer.write_header_to_file()
        gguf_writer.write_kv_data_to_file()
        gguf_writer.write_tensors_to_file()
    finally:
        # Always release the writer, even when a step above failed.
        gguf_writer.close()

    # print success
    print()
    print(f'GGML model written to {output_path}')

# Model repository id that is handed to convert_hf() in the next cell.
repo_id = "Snowflake/snowflake-arctic-embed-m-v2.0"

KeyboardInterrupt: 

In [None]:
# Export the model selected by `repo_id` as an f16 GGUF file under ../models.
convert_hf(
    repo_id,
    output_path="../models/snowflake-arctic-embed-m-v2.0.fp16.gguf",
    float_type="f16",
)

GteConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "GteModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "Snowflake/snowflake-arctic-embed-m-v2.0--configuration_hf_alibaba_nlp_gte.GteConfig",
    "AutoModel": "Snowflake/snowflake-arctic-embed-m-v2.0--modeling_hf_alibaba_nlp_gte.GteModel"
  },
  "classifier_dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "layer_norm_type": "layer_norm",
  "logn_attention_clip1": false,
  "logn_attention_scale": false,
  "matryoshka_dimensions": [
    256
  ],
  "max_position_embeddings": 8192,
  "model_type": "gte",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pack_qkv": true,
  "pad_token_id": 1,
  "position_embedding_type": "rope",
  "rope_scaling": null,
  "rope_theta": 160000,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3",
  "type_

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
import os
# Point both proxy environment variables at the local proxy before downloading.
os.environ.update(
    http_proxy="http://127.0.0.1:2080",
    https_proxy="http://127.0.0.1:2080",
)

model_name = "Snowflake/snowflake-arctic-embed-m-v2.0"

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(
    model_name,
    add_pooling_layer=False,
    trust_remote_code=True,
    use_memory_efficient_attention=False,
)
# Switch to evaluation mode and place the model on the selected device.
model = model.eval().to(device)

documents = ["A blue cat.", "A blue cat", "A cat"]
document_tokens = tokenizer(
    documents,
    padding=True,
    truncation=True,
    max_length=8192,
    return_tensors="pt",
)

print(document_tokens)

# The tokenized inputs must live on the same device as the model.
document_tokens = {key: tensor.to(device) for key, tensor in document_tokens.items()}

# Forward pass without gradient tracking; keeps index 0 along the second axis
# of the first model output (presumably the first-token embedding - confirm).
with torch.no_grad():
    document_embeddings = model(**document_tokens)[0][:, 0]

# L2-normalize each row.
document_embeddings = torch.nn.functional.normalize(document_embeddings, p=2, dim=1)

# Show every document together with its embedding vector.
for i, doc in enumerate(documents):
    print(f"Document {i+1}: {doc}")
    print(f"Embedding: {document_embeddings[i].cpu().numpy()}\n")


  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': tensor([[    0,    62, 57571,  7515,     5,     2],
        [    0,    62, 57571,  7515,     2,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0]])}
unpad_inputs: true, attention_mask: torch.Size([2, 6]), length: None
input_ids: tensor([[    0,    62, 57571,  7515,     5,     2],
        [    0,    62, 57571,  7515,     2,     1]])
attention_mask_bool: tensor([[ True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True, False]])
input_ids after unpadding: tensor([[    0,    62, 57571,  7515,     5,     2,     0,    62, 57571,  7515,
             2]])
Using unpadded position_ids
position_ids: None
position_ids after unpadding: tensor([[0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4]])
position_ids: tensor([[0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4]])
seq_length: 6
rope_cos: tensor([[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
    

In [None]:
from sentence_transformers import SentenceTransformer

# Load the same checkpoint through the sentence-transformers wrapper.
model_name = "Snowflake/snowflake-arctic-embed-m-v2.0"
model = SentenceTransformer(model_name_or_path=model_name, trust_remote_code=True)

# Encode the `documents` list defined in the previous cell and show the result.
document_embeddings = model.encode(sentences=documents)
print(document_embeddings)

[[-4.37089577e-02  2.43543386e-02 -7.39330947e-02  4.05579172e-02
  -1.80412754e-02 -3.35012674e-02 -2.56192908e-02  3.63120325e-02
   3.08795143e-02  5.94893135e-02 -1.77344810e-02 -3.08901053e-02
  -1.19969159e-01  1.11098858e-02  7.12967850e-03  4.86099496e-02
   4.09523770e-02 -2.60164179e-02  1.54911261e-02 -4.48167212e-02
   9.60970595e-02 -3.00353728e-02 -5.51805412e-03 -3.18316813e-03
  -4.58059460e-02  6.32777736e-02 -4.30310741e-02 -2.53691040e-02
   3.28916237e-02 -2.61493474e-02 -7.25693554e-02 -4.75339927e-02
  -2.72440091e-02 -2.53808890e-02  3.23742777e-02  9.14607570e-03
  -3.50533575e-02 -6.04486093e-02  2.99426056e-02  7.79717788e-03
  -4.31877784e-02 -4.42339294e-02 -2.62413621e-02  8.77672341e-03
   6.17679255e-03  2.62576751e-02 -4.23056409e-02  4.28507663e-02
  -1.76152233e-02 -1.85956955e-02 -4.28638048e-02  5.63584790e-02
   6.27824292e-03  2.84444727e-02 -6.26334362e-03  1.27175646e-02
  -2.20532417e-02  6.75354898e-03  5.79505190e-02 -1.09032188e-02
  -5.18030