In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Checkpoint shared by the tokenizer and the causal LM loaded below.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Loading options: float32 weights, every module mapped to device index 0,
# and the "sdpa" attention implementation.
model_load_kwargs = {
    "torch_dtype": torch.float32,
    "device_map": {"": 0},
    "attn_implementation": "sdpa",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Freeze all model weights: the only tensor optimised later in this notebook
# is the freshly created compression embedding.
model.requires_grad_(False)

from datasets import load_dataset

# Stored model generations for math problems (the dataset id suggests greedy
# decoding with a 4096-token limit on the 500-problem test subset).
# Other dataset ids that were tried in this cell:
#   "dim/hendrycks_math_train_12k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096"
#   "dim/hendrycks_math_train_1k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096_greedy"
generations = load_dataset(
    "dim/hendrycks_math_test_500_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096_greedy"
)

# Deterministic split (seed=42); only the 350-item "test" part is used further.
# Previously tried sizes: 250, 999, 1.
held_out_split = generations["train"].train_test_split(test_size=350, seed=42)["test"]

# Keep only generations that contain exactly one closing "</think>" tag.
dataset = held_out_split.filter(
    lambda row: row["model_answer"].count("</think>") == 1
)

from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)

# Collect the items whose stored model answer matches the reference answer.
correct_dataset = []
# Number of items whose answers could not be parsed/compared at all.
skipped_items = 0

for item in dataset:
    try:
        answer = dataset_answer_filter(item["answer"])
        model_answer = model_answer_filter(item["model_answer"])
        if is_equiv(answer, model_answer):
            correct_dataset.append(item)
    except Exception:
        # Best-effort filtering: an item whose answer cannot be extracted is
        # simply not counted as correct. Catch `Exception` (not a bare
        # `except`) so that KeyboardInterrupt / SystemExit still propagate.
        skipped_items += 1

# Same summary line as before: total items, correct items, share of correct.
# `max(..., 1)` avoids a ZeroDivisionError when the filtered dataset is empty.
print(len(dataset), len(correct_dataset), len(correct_dataset) / max(len(dataset), 1))
if skipped_items:
    print("skipped (answer parsing failed):", skipped_items)

# Only the first 30 correctly solved problems are used in the experiment below.
correct_dataset = correct_dataset[:30]
len(correct_dataset)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
224 202 0.9017857142857143


30

## Обучение по чанкам в цикле

In [None]:
import torch

from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)
from tqdm.notebook import tqdm
from tqdm import tqdm as text_tqdm
from hidden_capacity_reasoning.utils import (
    tokenize_single_turn,
    EOS_TOKEN_ID,
    END_THINK_ID,
)

# Fix the torch RNG so the random initialisation of the compression
# embeddings is reproducible.
torch.manual_seed(0)

# Prompt template; it is filled later via `base_prompt.format(question=...)`.
# A context manager is used so the file handle is closed deterministically.
with open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
) as prompt_file:
    base_prompt = prompt_file.read()

# Tokens generated per chunk before a compression step is attempted.
max_new_tokens = 200
# Number of trainable embedding vectors learned per compression step.
compression_tokens_amount = 4
# Overall generation budget, used to derive the maximum number of chunks.
max_total_tokens = 4096
max_total_steps = max_total_tokens // max_new_tokens + 1

# One dict per evaluated problem (original vs. compressed sequence length).
evaluation_dataset = []
# Number of problems still answered correctly with compression enabled.
correct_items = 0
# Make `generate` use the tokenizer's pad token id.
model.generation_config.pad_token_id = tokenizer.pad_token_id

for dataset_pos in tqdm(range(len(correct_dataset))):
# for dataset_pos in tqdm(range(1, len(correct_dataset))):
    # Tokenize the prompt together with the stored model answer; every field
    # of the result is then converted to a torch tensor in place.
    current_item = correct_dataset[dataset_pos]
    tokenized_turn = tokenize_single_turn(
        question=base_prompt.format(question=current_item["problem"]),
        answer=current_item["model_answer"],
        tokenizer=tokenizer,
    )
    for field_name in tokenized_turn.keys():
        tokenized_turn[field_name] = torch.tensor(tokenized_turn[field_name])

    device = "cuda"

    content_compression_mask = tokenized_turn["content_compression_mask"]

    # The question part ends at the third-from-last position where the mask is 0.
    zero_positions = (content_compression_mask == 0).nonzero()
    input_part_end = zero_positions[-3][0]
    question_token_count = int(input_part_end) + 1
    # Keep only the question tokens, add a batch dimension, move to the GPU.
    question_input_ids = tokenized_turn["input_ids"][:question_token_count]
    question_input_ids = question_input_ids.unsqueeze(0).cuda()

    # ---- state for the chunked generation loop ----
    compression_loop = True
    end_of_think = False
    total_generated_text = ""
    generated_ids_new = None
    generated_ids_new_prev = None
    generated_embeds = None
    generated_embeds_prev = None
    # Placeholder with sequence length 1: it stays shorter than
    # `compression_tokens_amount` until the first compression step has run.
    compression_part = torch.tensor([[0]])
    # Embeddings of the question tokens, reused on every step.
    input_ids_embeds = model.get_input_embeddings()(question_input_ids)

    # Chunked generation with on-the-fly compression. Each iteration:
    #   1. generates up to `max_new_tokens` tokens from
    #      [question, learned compression embeddings, tail of previous chunk];
    #   2. measures the LM loss on the second half of the new chunk given the
    #      full (uncompressed) context of this step;
    #   3. trains `compression_tokens_amount` new embedding vectors so that the
    #      same second half is predicted from the compressed context;
    #   4. appends the trained vectors to `compression_part`.
    # The loop stops as soon as the chunk contains END_THINK_ID.
    for compression_step in text_tqdm(range(max_total_steps)):
        ######## generate new tokens
        ########
        inputs_embeds = None
        with torch.no_grad():

            # `compression_part` starts as a length-1 placeholder, so this
            # condition is False only before the first compression step.
            if compression_part.shape[1] >= compression_tokens_amount:
                # Embeddings of the last `max_new_tokens // 2` tokens of the
                # previous chunk are kept verbatim after the compressed part.
                generated_embeds_prev = generated_embeds[
                    :, -(max_new_tokens // 2) :, :
                ].clone()
                inputs_embeds = torch.cat(
                    [
                        input_ids_embeds,
                        compression_part,
                        generated_embeds_prev,
                    ],
                    dim=1,
                )
            else:
                # first time generation: the context is the question only
                inputs_embeds = torch.cat(
                    [
                        input_ids_embeds,
                    ],
                    dim=1,
                )
            # Greedy decoding of the next chunk.
            # NOTE(review): the KV cache is disabled only on the very first
            # step; the reason is not visible in this notebook - confirm.
            generated_ids_new = model.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=torch.ones(
                    inputs_embeds.shape[:2],
                    device="cuda",
                ).long(),
                max_new_tokens=max_new_tokens,
                do_sample=False,
                use_cache=compression_step > 0,
            )
            # break
        # Accumulate the decoded chunk; the full text is scored after the loop.
        generated_result = tokenizer.decode(generated_ids_new[-1])
        # print(generated_result)
        total_generated_text += generated_result
        print("=" * 50)
        # Embeddings of the new chunk. The code below concatenates them after
        # the context, i.e. it treats `generated_ids_new` as holding only the
        # newly generated tokens.
        generated_embeds = model.get_input_embeddings()(generated_ids_new)
        # Stop compressing once the end-of-thinking token has been produced.
        if END_THINK_ID in generated_ids_new[-1].tolist():
            end_of_think = True
            break

        ########
        ######## get original language loss
        ########
        # Labels aligned with `new_input_embeds` built further below.
        labels = None
        if compression_part.shape[1] >= compression_tokens_amount:
            # Compressed embeddings and the carried-over tail have no token
            # ids, so their label positions are filled with -100.
            labels = torch.cat(
                [
                    question_input_ids.cuda(),
                    ((torch.ones(compression_part.shape[:2]) * -100).long()).cuda(),
                    (
                        (torch.ones(generated_embeds_prev.shape[:2]) * -100).long()
                    ).cuda(),
                    generated_ids_new.cuda(),
                ],
                dim=1,
            )
        else:
            # first time generation
            labels = torch.cat(
                [
                    question_input_ids.cuda(),
                    generated_ids_new.cuda(),
                ],
                dim=1,
            )

        # Mask over the question part with 0 and 1 swapped relative to
        # `content_compression_mask` (4 is only a temporary marker).
        question_content_mask = content_compression_mask[
            : int(input_part_end) + 1
        ].clone()
        question_content_mask[question_content_mask == 0] = 4
        question_content_mask[question_content_mask == 1] = 0
        question_content_mask[question_content_mask == 4] = 1

        # Per-position mask of labels that stay active: the question positions
        # selected above plus the second half of the new chunk.
        # NOTE(review): both halves use `shape[1] // 2`, so the mask is one
        # element shorter than `labels` if the chunk length is odd - confirm
        # that chunks always have an even length here.
        if compression_part.shape[1] >= compression_tokens_amount:
            train_content_mask_new = torch.cat(
                [
                    question_content_mask,
                    torch.zeros(compression_part.shape[1]),
                    torch.zeros(generated_embeds_prev.shape[1]),
                    torch.zeros(generated_ids_new.shape[1] // 2),
                    torch.ones(generated_ids_new.shape[1] // 2),
                ]
            ).long()
        else:
            train_content_mask_new = torch.cat(
                [
                    question_content_mask,
                    torch.ones(generated_ids_new.shape[1] // 2) * 0,
                    torch.ones(generated_ids_new.shape[1] // 2),
                ]
            ).long()

        generated_ids_new_prev = generated_ids_new.clone()
        # generated_embeds = model.get_input_embeddings()(generated_ids_new)

        # Full, uncompressed context of this step followed by the new chunk.
        new_input_embeds = None
        if compression_part.shape[1] >= compression_tokens_amount:
            new_input_embeds = torch.cat(
                [
                    input_ids_embeds.cuda(),
                    compression_part.cuda(),
                    generated_embeds_prev.cuda(),
                    generated_embeds,
                ],
                dim=1,
            )
        else:
            new_input_embeds = torch.cat(
                [
                    input_ids_embeds,
                    generated_embeds,
                ],
                dim=1,
            )

        # Ignore every label position that the mask marks with 0.
        labels[:, train_content_mask_new == 0] = -100

        # Reference loss: the value the compressed context has to reach.
        with torch.no_grad():
            original_loss = model(
                inputs_embeds=new_input_embeds,
                labels=labels,
            ).loss
        print("original_loss", original_loss)
        ########
        ######## generate compress embeddings
        ########
        # Trainable block of `compression_tokens_amount` embedding vectors,
        # initialised with uniform [0, 1) noise scaled by the standard
        # deviation of the model's input embedding matrix.
        compression_tensor = torch.nn.Parameter(
            torch.rand_like(
                new_input_embeds[:, :compression_tokens_amount, :],
            )
            * model.get_input_embeddings().weight.data.std(),
            requires_grad=True,
        )

        # Labels for the compressed sequence: selected question positions,
        # -100 for every compression vector, then the last
        # `max_new_tokens // 2` token ids of the new chunk.
        question_labels = question_input_ids.clone().cuda()
        question_labels[0][question_content_mask == 0] = -100
        compression_tensor_labels = (
            (torch.ones(compression_tensor.shape[:2]) * -100).long().cuda()
        )
        if compression_part.shape[1] >= compression_tokens_amount:
            compression_part_labels = (
                (torch.ones(compression_part.shape[:2]) * -100).long().cuda()
            )
            compressed_labels = torch.cat(
                [
                    question_labels,
                    compression_part_labels,
                    compression_tensor_labels,
                    generated_ids_new[:, -(max_new_tokens // 2) :],
                ],
                dim=-1,
            )
        else:
            compressed_labels = torch.cat(
                [
                    question_labels,
                    compression_tensor_labels,
                    generated_ids_new[:, -(max_new_tokens // 2) :],
                ],
                dim=-1,
            )

        ########
        ######## train
        ########
        # Upper bound on the number of optimisation steps per chunk.
        epoch_amount = 100

        # Only `compression_tensor` is optimised; the model weights are frozen.
        optimizer = torch.optim.Adam([compression_tensor], lr=0.1)
        acclumulation_steps = 1
        for epoch in range(epoch_amount):
            # Compressed context: question, all previously learned compression
            # vectors (if any), the vectors being trained, and the embeddings
            # of the second half of the new chunk (the prediction target).
            if compression_part.shape[1] >= compression_tokens_amount:
                compressed_inputs_embeds = torch.cat(
                    [
                        input_ids_embeds.detach(),
                        compression_part.detach(),
                        compression_tensor,
                        generated_embeds[:, -(max_new_tokens // 2) :, :].detach(),
                    ],
                    dim=1,
                )
            else:
                compressed_inputs_embeds = torch.cat(
                    [
                        input_ids_embeds.detach(),
                        compression_tensor,
                        generated_embeds[:, -(max_new_tokens // 2) :, :].detach(),
                    ],
                    dim=1,
                )
            compression_loss = model(
                inputs_embeds=compressed_inputs_embeds,
                labels=compressed_labels,
            ).loss
            compression_loss.backward()
            if (epoch + 1) % acclumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Early stop once the compressed context is at least as good as the
            # uncompressed one.
            # NOTE(review): `compression_loss` was computed before the
            # optimizer step taken just above, so the tensor that is kept is
            # one update past the one that satisfied this check - confirm
            # this is acceptable.
            if compression_loss.item() <= original_loss.item():
                break
            # if compression_loss.item() <= (original_loss.item() + 0.01):
            #     break
        print("compression_loss", compression_loss)
        # Append the newly trained vectors to the compressed memory; on the
        # first step they replace the length-1 placeholder.
        if compression_part.shape[1] >= compression_tokens_amount:
            compression_part = torch.cat(
                [
                    compression_part,
                    compression_tensor.detach(),
                ],
                dim=1,
            )
        else:
            compression_part = compression_tensor.detach()
    # ---- final, uncompressed continuation ----
    # Context of the last generation step followed by the last generated chunk.
    inputs_embeds = torch.cat(
        [
            inputs_embeds,
            generated_embeds,
        ],
        dim=1,
    )
    # Greedy continuation; the token budget is the number of chunk steps that
    # were not consumed by the loop above, times the chunk size.
    final_response = model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=torch.ones(
            inputs_embeds.shape[:2],
            device="cuda",
        ).long(),
        max_new_tokens=(max_total_steps - compression_step) * max_new_tokens,
        do_sample=False,
    )
    final_answer = tokenizer.decode(final_response[-1])
    total_generated_text += final_answer
    # print("FINAL ANSWER", final_answer)

    # ---- scoring ----
    gold_answer = correct_dataset[dataset_pos]["answer"]
    try:
        answer = dataset_answer_filter(gold_answer)
        # print("GOLD ANSWER", answer)
        model_answer = model_answer_filter(total_generated_text)
        is_correct = is_equiv(answer, model_answer)
    except Exception as parse_error:
        # The first cell guards these same calls with try/except. Without a
        # guard here, one unparsable generation would abort the whole
        # evaluation run, so it is scored as a wrong answer instead.
        print("ANSWER PARSING FAILED", repr(parse_error))
        is_correct = False

    if is_correct:
        correct_items += 1
        print("CORRECT")
    else:
        print("WRONG", gold_answer)
        print(total_generated_text)

    # ---- length statistics ----
    # Length of the final context plus the final continuation.
    compressed_total_len = inputs_embeds.shape[1] + final_response.shape[1]
    # Total generated tokens, counting every chunk as `max_new_tokens` tokens.
    total_generated_tokens = final_response.shape[1] + max_new_tokens * (
        compression_step + 1
    )
    # Token length of the stored, uncompressed model answer for comparison.
    original_total_len = len(
        tokenizer.encode(
            correct_dataset[dataset_pos]["model_answer"],
            add_special_tokens=False,
        )
    )
    print(
        f"вопрос+сжатые+сгенерированные={compressed_total_len}, всего_сгенерированно_токенов={total_generated_tokens} оригинальная_генерация={original_total_len}"
    )
    evaluation_dataset.append(
        {
            "original_total_len": original_total_len,
            "compressed_total_len": compressed_total_len,
        }
    )
    # break

  0%|          | 0/30 [00:00<?, ?it/s]



original_loss tensor(0.4852, device='cuda:0')




compression_loss tensor(0.4828, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3640, device='cuda:0')




compression_loss tensor(0.4036, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3844, device='cuda:0')




compression_loss tensor(0.3832, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4838, device='cuda:0')




compression_loss tensor(0.7059, device='cuda:0', grad_fn=<NllLossBackward0>)


 19%|█▉        | 4/21 [00:42<03:02, 10.74s/it]






CORRECT
вопрос+сжатые+сгенерированные=1457, всего_сгенерированно_токенов=1887 оригинальная_генерация=1959




original_loss tensor(0.6115, device='cuda:0')




compression_loss tensor(0.6092, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4828, device='cuda:0')




compression_loss tensor(0.4773, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4040, device='cuda:0')




compression_loss tensor(0.4039, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3793, device='cuda:0')




compression_loss tensor(0.4500, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3714, device='cuda:0')




compression_loss tensor(0.3710, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3604, device='cuda:0')




compression_loss tensor(0.6290, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3638, device='cuda:0')




compression_loss tensor(0.3669, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5439, device='cuda:0')




compression_loss tensor(0.5417, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5295, device='cuda:0')




compression_loss tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5750, device='cuda:0')




compression_loss tensor(0.5734, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4180, device='cuda:0')




compression_loss tensor(0.4158, device='cuda:0', grad_fn=<NllLossBackward0>)


 52%|█████▏    | 11/21 [01:24<01:16,  7.64s/it]






CORRECT
вопрос+сжатые+сгенерированные=445, всего_сгенерированно_токенов=2432 оригинальная_генерация=1125




original_loss tensor(0.4880, device='cuda:0')




compression_loss tensor(0.4856, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4484, device='cuda:0')




compression_loss tensor(0.4405, device='cuda:0', grad_fn=<NllLossBackward0>)


 10%|▉         | 2/21 [00:14<02:21,  7.44s/it]






CORRECT
вопрос+сжатые+сгенерированные=677, всего_сгенерированно_токенов=913 оригинальная_генерация=1548




original_loss tensor(0.4977, device='cuda:0')




compression_loss tensor(0.4919, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5172, device='cuda:0')




compression_loss tensor(0.5151, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4632, device='cuda:0')




compression_loss tensor(0.4567, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5629, device='cuda:0')




compression_loss tensor(0.5611, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4721, device='cuda:0')




compression_loss tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5359, device='cuda:0')




compression_loss tensor(0.5241, device='cuda:0', grad_fn=<NllLossBackward0>)


 29%|██▊       | 6/21 [00:37<01:34,  6.28s/it]






WRONG 6
Okay, so I have this problem: I need to find the smallest positive integer \( n \) such that all the roots of the equation \( z^4 + z^2 + 1 = 0 \) are \( n^{\text{th}} \) roots of unity. Hmm, okay. Let me think about how to approach this.

First, I remember that the roots of unity are the solutions to the equation \( z^n = 1 \). So, if all the roots of \( z^4 + z^2 + 1 = 0 \) are \( n^{\text{th}} \) roots of unity, that means each root \( z \) satisfies \( z^n = 1 \). Therefore, I need to find the smallest \( n \) such that every root of the given quartic equation is also a root of unity of order \( n \).

Let me write down the equation again: \( z^4 + z^2 + 1 = 0 \). Hmm, this looks similar to some cyclotomic polynomials I've heard about. Cyclotomic polynomials are minimal polynomials over the integers for roots of unity. Maybe this quartic is related to a cyclotomic polynomial?

I recall that cyclotomic polynomials are factors of \( z^n - 1 \) and are irreducible over the int



original_loss tensor(0.6077, device='cuda:0')




compression_loss tensor(0.5980, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5295, device='cuda:0')




compression_loss tensor(0.5241, device='cuda:0', grad_fn=<NllLossBackward0>)


 10%|▉         | 2/21 [00:16<02:37,  8.27s/it]






CORRECT
вопрос+сжатые+сгенерированные=511, всего_сгенерированно_токенов=670 оригинальная_генерация=748




original_loss tensor(0.5476, device='cuda:0')




compression_loss tensor(0.5475, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4552, device='cuda:0')




compression_loss tensor(0.4532, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5352, device='cuda:0')




compression_loss tensor(0.5253, device='cuda:0', grad_fn=<NllLossBackward0>)


 14%|█▍        | 3/21 [00:19<01:56,  6.49s/it]






CORRECT
вопрос+сжатые+сгенерированные=400, всего_сгенерированно_токенов=829 оригинальная_генерация=1080




original_loss tensor(0.5603, device='cuda:0')




compression_loss tensor(0.5533, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4948, device='cuda:0')




compression_loss tensor(0.4920, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4773, device='cuda:0')




compression_loss tensor(0.4761, device='cuda:0', grad_fn=<NllLossBackward0>)


 14%|█▍        | 3/21 [00:21<02:06,  7.04s/it]






CORRECT
вопрос+сжатые+сгенерированные=505, всего_сгенерированно_токенов=920 оригинальная_генерация=1639




original_loss tensor(0.5776, device='cuda:0')




compression_loss tensor(0.5737, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4510, device='cuda:0')




compression_loss tensor(0.4483, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3970, device='cuda:0')




compression_loss tensor(0.3960, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3770, device='cuda:0')




compression_loss tensor(0.3752, device='cuda:0', grad_fn=<NllLossBackward0>)


 19%|█▉        | 4/21 [00:28<02:00,  7.07s/it]






CORRECT
вопрос+сжатые+сгенерированные=508, всего_сгенерированно_токенов=1136 оригинальная_генерация=1732




original_loss tensor(0.5864, device='cuda:0')




compression_loss tensor(0.5773, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4754, device='cuda:0')




compression_loss tensor(0.4724, device='cuda:0', grad_fn=<NllLossBackward0>)


 10%|▉         | 2/21 [00:14<02:19,  7.32s/it]






CORRECT
вопрос+сжатые+сгенерированные=717, всего_сгенерированно_токенов=924 оригинальная_генерация=1198




original_loss tensor(0.6308, device='cuda:0')




compression_loss tensor(0.6272, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4757, device='cuda:0')




compression_loss tensor(0.4689, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5037, device='cuda:0')




compression_loss tensor(0.5023, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.5118, device='cuda:0')




compression_loss tensor(0.5116, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4416, device='cuda:0')




compression_loss tensor(0.4333, device='cuda:0', grad_fn=<NllLossBackward0>)


 24%|██▍       | 5/21 [00:32<01:43,  6.44s/it]






In [None]:
# 