In [1]:
import sys
import torch
import os

from gguf import GGUFWriter, GGMLQuantizationType
from transformers import AutoModel, AutoTokenizer

# Route HTTP(S) traffic from this kernel through the local proxy; both
# variables are overwritten unconditionally.
os.environ.update({
    "http_proxy": "http://127.0.0.1:2081",
    "https_proxy": "http://127.0.0.1:2081",
})

def convert_hf(repo_id, output_path, float_type='f16'):
    """Export a Hugging Face model's weights and hyper-parameters to a GGUF file.

    The tokenizer is saved as ``<basename(repo_id)>.tokenizer.json`` in the
    same directory as ``output_path``. When the loaded model exposes a
    ``text_model`` attribute, only that sub-model is exported; otherwise the
    whole model is exported.

    Args:
        repo_id: Model identifier or path handed to ``from_pretrained``.
        output_path: Destination path of the GGUF file. Its parent directory
            is created if it does not exist yet.
        float_type: ``'f16'`` or ``'f32'``. Storage type for most tensors;
            tensors whose name contains ``emb_ln``, ``norm1``, ``norm2`` or
            ``bias`` are always stored as float32.

    Side effects:
        Prints the config, selected parameters and every tensor written.
        On an unsupported ``float_type`` it prints a message and calls
        ``sys.exit(1)``.

    Note:
        The model is loaded with ``trust_remote_code=True``, so only use
        repositories you trust.
    """
    # convert to ggml quantization type
    if float_type not in ['f16', 'f32']:
        print(f'Float type must be f16 or f32, got: {float_type}')
        sys.exit(1)
    qtype = GGMLQuantizationType[float_type.upper()]
    dtype0 = {'f16': torch.float16, 'f32': torch.float32}[float_type]

    # make sure the destination directory exists before anything is written
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # load tokenizer and store its JSON definition next to the GGUF file
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    tokenizer_json = os.path.join(output_dir, os.path.basename(repo_id) + ".tokenizer.json")
    # second positional argument is passed as False, as in the original call
    # (presumably the `pretty` flag of tokenizers' save -- TODO confirm)
    tokenizer._tokenizer.save(tokenizer_json, False)

    model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

    # Multi-modal wrappers keep the text encoder under `text_model`; plain
    # encoders have no such attribute, so look it up without raising.
    text_model = getattr(model, 'text_model', None)
    if text_model is not None:
        model = text_model

    config = model.config
    print(config)
    # For the available config fields see:
    # https://huggingface.co/jinaai/jina-bert-flash-implementation/blob/main/configuration_bert.py#L62

    # print model params
    param_keys = [
        'vocab_size', 'hidden_size', 'num_hidden_layers',
        'num_attention_heads', 'intermediate_size', 'type_vocab_size', 'pad_token_id'
    ]
    print('PARAMS')
    for k in param_keys:
        v = getattr(config, k)
        print(f'{k:<24s} = {v}')
    print()

    # print vocab
    vocab_keys = [
        'vocab_size', 'pad_token_id', 'unk_token_id', 'cls_token_id', 'sep_token_id'
    ]
    print('VOCAB')
    for k in vocab_keys:
        v = getattr(tokenizer, k)
        print(f'{k:24s} = {v}')

    # start to write GGUF file
    gguf_writer = GGUFWriter(output_path, "JinaBert")
    try:
        # write metadata
        gguf_writer.add_name(repo_id)
        gguf_writer.add_description('gguf model for embeddings.cpp')
        gguf_writer.add_file_type(qtype)

        # write model params
        gguf_writer.add_uint32('vocab_size', config.vocab_size)
        gguf_writer.add_uint32('hidden_size', config.hidden_size)
        gguf_writer.add_uint32('num_hidden_layers', config.num_hidden_layers)
        gguf_writer.add_uint32('num_attention_heads', config.num_attention_heads)
        gguf_writer.add_uint32('intermediate_size', config.intermediate_size)
        gguf_writer.add_uint32('type_vocab_size', config.type_vocab_size)
        gguf_writer.add_uint32('pad_token_id', config.pad_token_id)
        gguf_writer.add_float32('layer_norm_eps', config.layer_norm_eps)

        # write the tokenizer special token (we only need to know [PAD])
        KEY_PAD_ID = 'tokenizer.ggml.padding_token_id'
        gguf_writer.add_int32(KEY_PAD_ID, tokenizer.pad_token_id)

        # write tensors; every tensor keeps its state-dict name, so fused
        # tensors (e.g. Wqkv) are stored as-is rather than split into q/k/v
        print('TENSORS')
        for name, data in model.state_dict().items():
            # normalization parameters and biases always stay in float32
            keep_f32 = any(tag in name for tag in ('emb_ln', 'norm1', 'norm2', 'bias'))
            dtype = torch.float32 if keep_f32 else dtype0

            # print info
            shape_str = str(list(data.shape))
            print(f'{name:64s} = {shape_str:16s} {data.dtype} → {dtype}')

            # do conversion; move to CPU first because Tensor.numpy() only
            # works on CPU tensors
            data = data.detach().cpu().to(dtype)

            # add to gguf output
            gguf_writer.add_tensor(name, data.numpy())

        # flush everything to disk
        gguf_writer.write_header_to_file()
        gguf_writer.write_kv_data_to_file()
        gguf_writer.write_tensors_to_file()
    finally:
        # always release the output file handle, even if conversion failed
        gguf_writer.close()

    # print success
    print()
    print(f'GGML model written to {output_path}')

# Model repository used by the cells below.
repo_id = "jinaai/jina-clip-v1"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Load the full checkpoint (custom modelling code from the repo is allowed),
# show its module tree, and keep a handle on the text encoder.
model = AutoModel.from_pretrained(
    repo_id,
    trust_remote_code=True,
)
print(model)
text_model = model.text_model

JinaCLIPModel(
  (text_model): HFTextEncoder(
    (transformer): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30528, 768, padding_idx=0)
        (token_type_embeddings): Embedding(2, 768)
      )
      (emb_drop): Dropout(p=0.1, inplace=False)
      (emb_ln): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (encoder): BertEncoder(
        (layers): ModuleList(
          (0-11): 12 x Block(
            (mixer): MHA(
              (Wqkv): LinearResidual(in_features=768, out_features=2304, bias=True)
              (inner_attn): SelfAttention(
                (drop): Dropout(p=0.1, inplace=False)
              )
              (inner_cross_attn): CrossAttention(
                (drop): Dropout(p=0.1, inplace=False)
              )
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (dropout1): Dropout(p=0.1, inplace=False)
            (drop_path1): StochasticDepth(p=0.0, mode=row)
        

In [52]:
# Show the structure of the first encoder block of the text model.
text_model = model.text_model
print(
    text_model.transformer.encoder.layers[0]
)
# Optional debugging aid: dump the token-type embedding weights.
# print(text_model.transformer.embeddings.token_type_embeddings.state_dict())

Block(
  (mixer): MHA(
    (Wqkv): LinearResidual(in_features=768, out_features=2304, bias=True)
    (inner_attn): SelfAttention(
      (drop): Dropout(p=0.1, inplace=False)
    )
    (inner_cross_attn): CrossAttention(
      (drop): Dropout(p=0.1, inplace=False)
    )
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
  )
  (dropout1): Dropout(p=0.1, inplace=False)
  (drop_path1): StochasticDepth(p=0.0, mode=row)
  (norm1): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (mlp): GLUMLP(
    (gated_layers): Linear(in_features=768, out_features=6144, bias=False)
    (act): GELU(approximate='none')
    (wo): Linear(in_features=3072, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (dropout2): Dropout(p=0.1, inplace=False)
  (drop_path2): StochasticDepth(p=0.0, mode=row)
  (norm2): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)


In [2]:
# Export the model in full precision (float32).
convert_hf(
    repo_id,
    output_path="../models/jina-clip-v1.fp32.gguf",
    float_type="f32",
)

use_flash_attn: False
use_qk_norm: False
fused_bias_fc: False
window_size: [-1, -1]
num_heads: 12, cross_attn: False, use_flash_attn: False, return_residual: True, window_size: [-1, -1]
seqlen: 16, linear_biases: torch.Size([1, 12, 16, 16])
use_flash_attn: False
use_qk_norm: False
fused_bias_fc: False
window_size: [-1, -1]
num_heads: 12, cross_attn: False, use_flash_attn: False, return_residual: True, window_size: [-1, -1]
seqlen: 16, linear_biases: torch.Size([1, 12, 16, 16])
use_flash_attn: False
use_qk_norm: False
fused_bias_fc: False
window_size: [-1, -1]
num_heads: 12, cross_attn: False, use_flash_attn: False, return_residual: True, window_size: [-1, -1]
seqlen: 16, linear_biases: torch.Size([1, 12, 16, 16])
use_flash_attn: False
use_qk_norm: False
fused_bias_fc: False
window_size: [-1, -1]
num_heads: 12, cross_attn: False, use_flash_attn: False, return_residual: True, window_size: [-1, -1]
seqlen: 16, linear_biases: torch.Size([1, 12, 16, 16])
use_flash_attn: False
use_qk_norm: F