In [1]:
from datasets import load_dataset

dataset = load_dataset("dim/open_orca_905_DeepSeek-R1-Distill-Qwen-1.5B")
dataset = dataset["train"]
dataset = dataset.train_test_split(test_size=5, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 900
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 5
    })
})

In [1]:
from hidden_capacity_reasoning.utils import tokenize_single_turn
from transformers import Qwen2ForCausalLM, Qwen2Model, AutoTokenizer
import torch


class Qwen2ModelEmbedPoolerV1(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2Model(config)
        # self.model.embed_tokens = None
        self.post_init()

    def forward(self, input_embeds):
        # print(input_embeds.dtype)
        input_embeds = self.model(
            inputs_embeds=input_embeds,
            output_hidden_states=True,
        )[0]
        # print(input_embeds.dtype)
        input_embeds = input_embeds.sum(1) / torch.tensor(
            input_embeds.shape[1],
            device=input_embeds.device,
        )
        # print(input_embeds.dtype)
        input_embeds = input_embeds.unsqueeze(1)
        return input_embeds


class Qwen2ForCausalLMCompressionV1(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = torch.nn.Linear(
            config.hidden_size, config.vocab_size, bias=False
        )
        print(config._name_or_path)
        self.embed_pooler = Qwen2ModelEmbedPoolerV1.from_pretrained(
            config._name_or_path,
        )

        self.post_init()
        # Initialize weights and apply final processing

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
        logits_to_keep=0,
        **kwargs
    ):
        if "replaced_original_tokens" in kwargs:
            pass
        return super().forward(
            input_ids,
            attention_mask,
            position_ids,
            past_key_values,
            inputs_embeds,
            labels,
            use_cache,
            output_attentions,
            output_hidden_states,
            return_dict,
            cache_position,
            logits_to_keep,
            **kwargs
        )


# torch.set_grad_enabled(False)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# model_name = "./test_model/"
model = Qwen2ForCausalLMCompressionV1.from_pretrained(
    model_name,
    # embed_pooler,
    torch_dtype=torch.bfloat16,
    # device_map={"": 0},
)

model = model.eval().cuda()
# model.model = model.embed_pooler.model
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "how many wings has a bird?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(text)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

with torch.no_grad():
    # generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=1)
    generated_ids = model.generate(
        model_inputs.input_ids, max_new_tokens=1000, do_sample=False
    )
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B


Some weights of Qwen2ForCausalLMCompressionV1 were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B and are newly initialized: ['embed_pooler.lm_head.weight', 'embed_pooler.model.embed_tokens.weight', 'embed_pooler.model.layers.0.input_layernorm.weight', 'embed_pooler.model.layers.0.mlp.down_proj.weight', 'embed_pooler.model.layers.0.mlp.gate_proj.weight', 'embed_pooler.model.layers.0.mlp.up_proj.weight', 'embed_pooler.model.layers.0.post_attention_layernorm.weight', 'embed_pooler.model.layers.0.self_attn.k_proj.bias', 'embed_pooler.model.layers.0.self_attn.k_proj.weight', 'embed_pooler.model.layers.0.self_attn.o_proj.weight', 'embed_pooler.model.layers.0.self_attn.q_proj.bias', 'embed_pooler.model.layers.0.self_attn.q_proj.weight', 'embed_pooler.model.layers.0.self_attn.v_proj.bias', 'embed_pooler.model.layers.0.self_attn.v_proj.weight', 'embed_pooler.model.layers.1.input_layernorm.weight', 'embed_pooler.model.layers.1.mlp.down_proj.weight', 'embe

<｜begin▁of▁sentence｜>You are a helpful assistant.<｜User｜>how many wings has a bird?<｜Assistant｜><think>



'Okay, so I need to figure out how many wings a bird has. I\'m not entirely sure, but I know that birds are birds, so they\'re related to birds. Let me start by recalling what I know about birds. I remember that birds are often called "fowl" in some languages, but that\'s probably just a naming thing. \n\nI think most birds have wings, but I\'m not certain. Maybe I should think about different types of birds. For example, let\'s take a common bird like a sparrow. I know sparrow is a type of raptor, and I think they have wings. They\'re usually called "wings" in English, so that makes sense. \n\nWhat about a different bird, like a crow? I believe crows have wings too. They\'re often called "wings" in some languages, but I\'m not sure if that\'s accurate. Maybe it\'s just a naming convention. \n\nWait, I should make sure. Let me think about other birds. How about a penguin? I think penguins have wings, but they\'re called "feathers" in some languages. Hmm, so that\'s a bit different. But

In [2]:
model.model.embed_tokens.weight == model.embed_pooler.model.embed_tokens.weight

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]], device='cuda:0')

In [2]:
pooller = Qwen2ModelEmbedPoolerV1.from_pretrained(model_name)

In [3]:
tokenizer.save_pretrained("./test_model")

('./test_model/tokenizer_config.json',
 './test_model/special_tokens_map.json',
 './test_model/tokenizer.json')

In [2]:
# model.save_pretrained("./test_model", safe_serialization=False)
model.save_pretrained("./test_model")

In [2]:
model

Qwen2ForCausalLMCompressionV1(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (ro

In [None]:
# model.save_pretrained('test_model')

In [3]:
# test
tokenize_single_turn(
    question=dataset["train"][0]["question"],
    answer=dataset["train"][0]["answer"],
    tokenizer=tokenizer,
)
None

In [4]:
from tqdm import tqdm

train_examples = [
    tokenize_single_turn(tokenizer=tokenizer, **item)
    for item in tqdm(dataset["train"].to_list())
]

100%|██████████| 900/900 [00:01<00:00, 663.34it/s]


In [9]:
from hidden_capacity_reasoning.utils import generate_train_examples, pad_train_examples

prepared_train_examples = []
for item in tqdm(train_examples):
    for example in generate_train_examples(
        dataset_batch=[item], tokenizer=tokenizer, window_size=4
    ):
        prepared_train_examples.append(example)

100%|██████████| 900/900 [00:04<00:00, 180.62it/s]


In [11]:
len(prepared_train_examples)

124217

In [29]:
from more_itertools import chunked

batch_size = 4
train_examples_batches = [
    pad_train_examples(
        train_examples=item,
        tokenizer=tokenizer,
    )
    for item in tqdm(
        list(
            chunked(
                prepared_train_examples,
                batch_size,
            )
        )
    )
]

100%|██████████| 31055/31055 [00:05<00:00, 5994.41it/s] 


In [None]:
len(train_examples_batches[0]["replaced_original_tokens"]["input_ids"])

4

In [38]:
train_examples_batches[0].keys()

dict_keys(['replaced_original_tokens', 'compressed_input_ids', 'original_tokens'])

In [47]:
train_examples_batches_clean = []
for item in train_examples_batches:
    train_examples_batches_clean.append(
        {
            "replaced_original_tokens": item["replaced_original_tokens"]["input_ids"],
            "compressed_input_ids": item["compressed_input_ids"]["input_ids"],
            "original_tokens": item["original_tokens"]["input_ids"],
            "attention_mask": item["compressed_input_ids"]["attention_mask"],
            "labels": item["compressed_input_ids"]["input_ids"],
        }
    )
print(len(train_examples_batches_clean))
# train_examples_batches_clean[0]

31055


In [None]:
model.get_input_embeddings()(torch.tensor([[1, 2]], device="cuda")).shape

torch.Size([1, 2, 1536])

In [None]:
import transformers
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    default_data_collator,
)

config = LoraConfig(
    r=64,
    lora_alpha=128,
    lora_dropout=0.0,
    target_modules=["lm_head", "q_proj", "v_proj"],
)
embed_pooler = get_peft_model(embed_pooler, config)
print(embed_pooler.print_trainable_parameters())
print(embed_pooler)

trainable params: 18,538,496 || all params: 1,795,626,496 || trainable%: 1.0324
None
PeftModel(
  (base_model): LoraModel(
    (model): Qwen2ModelEmbedPooler(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): Module

In [None]:
# https://github.com/huggingface/trl/blob/main/examples/scripts/sft_video_llm.py

### Use unsloth