In [8]:
from transformers import Qwen2ForCausalLM, Qwen2Model, AutoTokenizer
import torch

torch.set_grad_enabled(False)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = Qwen2ForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)
model = model.eval()

In [9]:
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to large language model..1"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


'Okay, so I need to come up with a short introduction to a large language model. The user provided an example, which is pretty concise and covers the basics. Let me think about how I can approach this.\n\nFirst, I should consider what a large language model (LLM) is. It\'s a type of AI that can understand and generate human language, right? So, it\'s designed to do things like text generation, translation, summarization, and more. It\'s widely used in various fields like education, healthcare, and even entertainment.\n\nI should make sure the introduction is informative but not too technical. It should give a good overview without getting bogged down in details. Maybe start with the definition, then mention its capabilities, and perhaps touch on its applications.\n\nWait, the example also mentions the user\'s role and the purpose of the assistant. So, I should keep it friendly and approachable, maybe a bit enthusiastic to attract the user.\n\nI should avoid making it too long. The exam

In [10]:
model_output_1 = model(
    **model_inputs,
    output_hidden_states=True,
)
model_output_1.hidden_states[-1].shape, model_output_1.hidden_states[-1].dtype

(torch.Size([1, 23, 1536]), torch.bfloat16)

In [11]:
from transformers import Qwen2Config


class Qwen2ModelEmbedPooler(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2Model(config)
        self.lm_head = None
        self.post_init()

    def forward(self, input_embeds):
        # print(input_embeds.dtype)
        input_embeds = self.model(
            inputs_embeds=input_embeds,
            output_hidden_states=True,
        )[0]
        # print(input_embeds.dtype)
        input_embeds = input_embeds.sum(1) / torch.tensor(
            input_embeds.shape[1],
            device=input_embeds.device,
            dtype=input_embeds.dtype,
        )
        # print(input_embeds.dtype)
        input_embeds = input_embeds.unsqueeze(1)
        return input_embeds


config = Qwen2Config.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
config.num_hidden_layers = 4

In [None]:
from hidden_capacity_reasoning.models import (
    Qwen2ModelEmbedPoolerV2,
    Qwen2ForCausalLMCompressionV5,
)


# embed_pooler = Qwen2ModelEmbedPooler.from_pretrained(

# embed_pooler = Qwen2ModelEmbedPooler(
embed_pooler = Qwen2ModelEmbedPoolerV2(
    config=config,
    # "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    # device_map={"":0},
)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(count_parameters(embed_pooler))

# result = embed_pooler(
#     # model_output_1.hidden_states[-1],
#     torch.cat(
#         [
#             model_output_1.hidden_states[-1],
#             model_output_1.hidden_states[-1],
#         ],
#         dim=0,
#     ),
# )
# result.shape

In [None]:
embed_pooler

In [1]:
from hidden_capacity_reasoning.models import (
    Qwen2ModelEmbedPoolerV2,
    Qwen2ForCausalLMCompressionV5,
    Qwen2PoolerConfig,
)
from transformers import Qwen2Config, PretrainedConfig


# class Qwen2PoolerConfig(Qwen2Config):
#     pooler_config: Qwen2Config = None


config = Qwen2Config.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
pooler_config = Qwen2Config.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
pooler_config.num_hidden_layers = 4
config.pooler_config = pooler_config


new_model = Qwen2ForCausalLMCompressionV5.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    config=config,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Some weights of Qwen2ForCausalLMCompressionV5 were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B and are newly initialized: ['embed_pooler.model.embed_tokens.weight', 'embed_pooler.model.layers.0.input_layernorm.weight', 'embed_pooler.model.layers.0.mlp.down_proj.weight', 'embed_pooler.model.layers.0.mlp.gate_proj.weight', 'embed_pooler.model.layers.0.mlp.up_proj.weight', 'embed_pooler.model.layers.0.post_attention_layernorm.weight', 'embed_pooler.model.layers.0.self_attn.k_proj.bias', 'embed_pooler.model.layers.0.self_attn.k_proj.weight', 'embed_pooler.model.layers.0.self_attn.o_proj.weight', 'embed_pooler.model.layers.0.self_attn.q_proj.bias', 'embed_pooler.model.layers.0.self_attn.q_proj.weight', 'embed_pooler.model.layers.0.self_attn.v_proj.bias', 'embed_pooler.model.layers.0.self_attn.v_proj.weight', 'embed_pooler.model.layers.1.input_

In [2]:
new_model.save_pretrained("r1_compressor_v5")

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
tokenizer.save_pretrained("r1_compressor_v5")

('r1_compressor_v5/tokenizer_config.json',
 'r1_compressor_v5/special_tokens_map.json',
 'r1_compressor_v5/tokenizer.json')

In [3]:
from hidden_capacity_reasoning.models import (
    Qwen2ModelEmbedPoolerV2,
    Qwen2ForCausalLMCompressionV5,
)

new_model = Qwen2ForCausalLMCompressionV5.from_pretrained(
    "r1_compressor_v5",
)

Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from hidden_capacity_reasoning.models import (
    Qwen2ModelEmbedPoolerV2,
    Qwen2ForCausalLMCompressionV5,
    Qwen2PoolerConfig,
)
from transformers import Qwen2Config

config = Qwen2PoolerConfig.from_pretrained("r1_compressor_v5")
Qwen2Config(config.pooler_config)

Qwen2Config {
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 22016,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "transformers_version": "4.49.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": {
    "_attn_implementation_autoset": true,
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": [
      "Qwen2ForCausalLM"
    ],
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 151643,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,


In [4]:
39 % 40 == 0 

False