In [1]:
from transformers import Qwen2ForCausalLM, Qwen2Model, AutoTokenizer
import torch

torch.set_grad_enabled(False)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = Qwen2ForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)
model = model.eval()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [2]:
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to large language model..1"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'Okay, so I need to come up with a short introduction to a large language model. Hmm, where do I start? Well, I know that LLMs are these big machines that can understand and generate human language. They\'re really good at processing information and generating responses. I think they\'re used in various fields like AI, chatbots, and even creative industries. But I\'m not exactly sure about all the specifics. Let me think about how to structure this.\n\nFirst, I should probably define what an LLM is. Maybe start with something like "Large Language Models (LLMs)" or "Language Models." Then, mention that they are designed to understand and generate human language. That makes sense. But I should also highlight their capabilities and applications. Maybe something about their ability to handle diverse languages and contexts.\n\nI remember hearing that LLMs can act as chatbots, so that\'s a good point. They can respond to questions and provide information. Also, they can generate creative con

In [3]:
model_output_1 = model(
    **model_inputs,
    output_hidden_states=True,
)
model_output_1.hidden_states[-1].shape, model_output_1.hidden_states[-1].dtype

(torch.Size([1, 23, 1536]), torch.bfloat16)

In [None]:
class Qwen2ModelEmbedPooler(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2Model(config)
        self.lm_head = None
        self.post_init()

    def forward(self, input_embeds):
        # print(input_embeds.dtype)
        input_embeds = self.model(
            inputs_embeds=input_embeds,
            output_hidden_states=True,
        )[0]
        # print(input_embeds.dtype)
        input_embeds = input_embeds.sum(1) / torch.tensor(
            input_embeds.shape[1],
            device=input_embeds.device,
            dtype=input_embeds.dtype,
        )
        # print(input_embeds.dtype)
        input_embeds = input_embeds.unsqueeze(1)
        return input_embeds


embed_pooler = Qwen2ModelEmbedPooler.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", device_map={"": 0}
)
result = embed_pooler(
    # model_output_1.hidden_states[-1],
    torch.cat(
        [
            model_output_1.hidden_states[-1],
            model_output_1.hidden_states[-1],
        ],
        dim=0,
    ),
)
result.shape

torch.Size([2, 1, 1536])

In [None]:
from transformers import AutoTokenizer, RobertaModel
import torch

model = RobertaModel.from_pretrained("FacebookAI/roberta-base")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

outputs.last_hidden_state.shape

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 8, 768])

In [None]:
from transformers import RobertaConfig

model_pooler = RobertaModel(
    RobertaConfig(
        # num_hidden_layers=6,
        hidden_size=1536,
    )
)
model_pooler = model_pooler.to("cuda")
# RobertaConfig(
#     num_hidden_layers=6,
# )

In [22]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


count_parameters(model_pooler)

307055616

In [16]:
result = model_pooler(
    # model_output_1.hidden_states[-1],
    inputs_embeds=torch.cat(
        [
            model_output_1.hidden_states[-1],
            model_output_1.hidden_states[-1],
        ],
        dim=0,
    ),
    output_hidden_states=True,
)
result

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.7319, -0.3607,  1.3480,  ..., -0.7462, -1.3530,  1.6311],
         [-0.9783,  0.4005,  1.1449,  ..., -0.8091, -1.1307,  0.1278],
         [-1.6309,  0.9927,  1.3285,  ..., -0.9963, -1.0119,  0.0880],
         ...,
         [-0.0877, -0.7833,  0.2703,  ..., -1.1880, -1.4520,  0.4867],
         [-1.9345, -0.4809,  0.7215,  ..., -1.7561, -1.0042, -0.3902],
         [-2.1255, -0.8651,  0.4842,  ..., -1.5525, -1.3648,  0.6962]],

        [[-0.9212,  0.5121,  1.8180,  ..., -1.5539,  0.2467,  0.9719],
         [-1.5249, -0.0152,  0.5992,  ..., -0.8235, -1.0264,  0.4108],
         [-1.0407, -1.4339,  1.3941,  ..., -1.1372,  0.2327,  0.9719],
         ...,
         [-0.6508, -0.5177,  1.5293,  ..., -1.0896,  0.0824,  1.1235],
         [-1.1410, -0.8605,  1.5728,  ..., -1.2504, -0.7460,  0.3516],
         [-2.0835, -0.4642,  1.4805,  ..., -1.0204, -1.0545,  0.6524]]],
       device='cuda:0'), pooler_output=tensor([[-0.47

In [None]:
input_embeds = result.last_hidden_state  # .shape

input_embeds = input_embeds.sum(1) / torch.tensor(
    input_embeds.shape[1],
    device=input_embeds.device,
    dtype=input_embeds.dtype,
)
# print(input_embeds.dtype)
input_embeds = input_embeds.unsqueeze(1)
input_embeds.shape

torch.Size([2, 1, 1536])

In [4]:
from hidden_capacity_reasoning.models import RobertaModelEmbedPoolerV1
from transformers import RobertaConfig

roberta_model = RobertaModelEmbedPoolerV1(
    RobertaConfig(
        # num_hidden_layers=6,
        hidden_size=1536,
    )
)
roberta_model = roberta_model.to("cuda")

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [5]:
result = roberta_model(
    # model_output_1.hidden_states[-1],
    input_embeds=torch.cat(
        [
            model_output_1.hidden_states[-1],
            model_output_1.hidden_states[-1],
        ],
        dim=0,
    ),
    # output_hidden_states=True,
)
result

tensor([[[-0.2004,  1.7836,  0.4762,  ..., -0.2210, -1.0527,  2.2582]],

        [[-0.2585,  1.3164,  0.5259,  ..., -0.1286, -0.8434,  1.8991]]],
       device='cuda:0')

In [6]:
result.shape

torch.Size([2, 1, 1536])

In [7]:
roberta_model.get_input_embeddings()

Embedding(50265, 1536, padding_idx=1)

In [21]:
from transformers.models.qwen2.modeling_qwen2 import Qwen2RotaryEmbedding, Qwen2Config
from torch import nn

config = Qwen2Config()
emb = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id).cuda()

new_embeds = emb(model_inputs["input_ids"])

In [23]:
new_embeds.shape

torch.Size([1, 23, 4096])

In [None]:
from transformers import Qwen2Config, RobertaConfig, PretrainedConfig


# PretrainedConfig(new_config)
class EmdeddingCompressorConfig(PretrainedConfig):
    roberta: RobertaConfig = RobertaConfig(
        hidden_size=1568,
    )
    qwen: Qwen2Config = Qwen2Config()


config = EmdeddingCompressorConfig()
config.roberta
config.qwen

Qwen2Config {
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 22016,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "transformers_version": "4.49.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

In [36]:
config.save_pretrained("EmdeddingCompressorConfig")

In [39]:
EmdeddingCompressorConfig.from_pretrained("EmdeddingCompressorConfig").roberta

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1568,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50265
}

In [11]:
from hidden_capacity_reasoning.models import (
    Qwen2ForCausalLMCompressionV3,
    EmdeddingCompressorConfigV1,
)
from transformers import Qwen2Config, RobertaConfig

# config = EmdeddingCompressorConfigV1()
config = Qwen2Config.from_pretrained('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')
config.roberta = RobertaConfig(
    hidden_size=1536,
)
# new_model = Qwen2ForCausalLMCompressionV3.from_pretrained(
new_model = Qwen2ForCausalLMCompressionV3(
    config=config,
    # torch_dtype=torch.bfloat16,
    # **config,
    # "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
)

In [12]:
new_model.model

Qwen2Model(
  (embed_tokens): Embedding(151936, 1536)
  (layers): ModuleList(
    (0-27): 28 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
        (k_proj): Linear(in_features=1536, out_features=256, bias=True)
        (v_proj): Linear(in_features=1536, out_features=256, bias=True)
        (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
        (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
        (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((1536,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [10]:
new_model

Qwen2ForCausalLMCompressionV3(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 4096)
    (layers): ModuleList(
      (0-31): 32 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=4096, out_features=22016, bias=False)
          (up_proj): Linear(in_features=4096, out_features=22016, bias=False)
          (down_proj): Linear(in_features=22016, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((4096,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((4096,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((4096,), eps=1e-06)
  

In [None]:
new_model

In [13]:
new_model.load_state_dict(model.state_dict(), strict=False)

_IncompatibleKeys(missing_keys=['embed_pooler.embeddings.word_embeddings.weight', 'embed_pooler.embeddings.position_embeddings.weight', 'embed_pooler.embeddings.token_type_embeddings.weight', 'embed_pooler.embeddings.LayerNorm.weight', 'embed_pooler.embeddings.LayerNorm.bias', 'embed_pooler.encoder.layer.0.attention.self.query.weight', 'embed_pooler.encoder.layer.0.attention.self.query.bias', 'embed_pooler.encoder.layer.0.attention.self.key.weight', 'embed_pooler.encoder.layer.0.attention.self.key.bias', 'embed_pooler.encoder.layer.0.attention.self.value.weight', 'embed_pooler.encoder.layer.0.attention.self.value.bias', 'embed_pooler.encoder.layer.0.attention.output.dense.weight', 'embed_pooler.encoder.layer.0.attention.output.dense.bias', 'embed_pooler.encoder.layer.0.attention.output.LayerNorm.weight', 'embed_pooler.encoder.layer.0.attention.output.LayerNorm.bias', 'embed_pooler.encoder.layer.0.intermediate.dense.weight', 'embed_pooler.encoder.layer.0.intermediate.dense.bias', 'embed

In [None]:
new_model.save_pretrained('r1_compressor_v4')

: 