In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Checkpoint name shared by the tokenizer and the causal language model.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Loading options: float32 weights, the root module mapped to device 0,
# and the "sdpa" attention implementation.
model_load_options = dict(
    torch_dtype=torch.float32,
    device_map={"": 0},
    attn_implementation="sdpa",
)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

# Freeze every model weight: later cells optimise only separately created
# compression embeddings, never the model itself.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

from datasets import load_dataset

# Hub repository with the model's greedy generations for the 500 MATH test problems.
# Other repositories that were tried during experimentation:
#   "dim/hendrycks_math_train_12k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096"
#   "dim/hendrycks_math_train_1k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096_greedy"
dataset_repo = (
    "dim/hendrycks_math_test_500_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096_greedy"
)
hub_splits = load_dataset(dataset_repo)

# Deterministic hold-out of 350 rows taken from the repository's "train" split
# (other sizes tried: 250, 999, 1).
held_out_rows = hub_splits["train"].train_test_split(test_size=350, seed=42)["test"]

# Keep only generations containing exactly one closing "</think>" tag.
dataset = held_out_rows.filter(
    lambda row: row["model_answer"].count("</think>") == 1
)

from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)

# Keep only the problems whose stored model generation already yields the gold
# answer, so that a later accuracy drop can be attributed to compression.
correct_dataset = []
# Items whose answers could not be extracted or compared (counted, not hidden).
skipped_items = 0
# Size of the evaluation subset used by the following cells.
evaluation_subset_size = 30

for item in dataset:
    try:
        answer = dataset_answer_filter(item["answer"])
        model_answer = model_answer_filter(item["model_answer"])
        answers_match = is_equiv(answer, model_answer)
    except Exception:
        # Best-effort filtering: an item that cannot be parsed is dropped.
        # `Exception` (rather than a bare `except`) keeps KeyboardInterrupt and
        # SystemExit working, so the cell can still be interrupted.
        skipped_items += 1
        continue
    if answers_match:
        correct_dataset.append(item)

print(len(dataset), len(correct_dataset), len(correct_dataset) / len(dataset))
if skipped_items:
    print("skipped (answer extraction failed):", skipped_items)

correct_dataset = correct_dataset[:evaluation_subset_size]
len(correct_dataset)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
224 202 0.9017857142857143


30

## Обучение по чанкам в цикле

In [None]:
import torch

from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)
from tqdm.notebook import tqdm
from tqdm import tqdm as text_tqdm
from hidden_capacity_reasoning.utils import (
    tokenize_single_turn,
    EOS_TOKEN_ID,
    END_THINK_ID,
)

# Fix the torch RNG: generation below samples tokens and the compression
# embeddings are randomly initialised.
torch.manual_seed(0)

# Prompt template with a "{question}" placeholder. The context manager closes
# the file handle, which the previous `open(...).read()` left open.
with open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
) as prompt_file:
    base_prompt = prompt_file.read()

# Tokens generated per chunk before the chunk is compressed.
max_new_tokens = 400
# Number of trainable embedding vectors produced for every compressed chunk.
compression_tokens_amount = 16
# Overall generation budget and the resulting maximum number of chunks.
max_total_tokens = 4096
max_total_steps = max_total_tokens // max_new_tokens + 1

# Per-problem length statistics and the count of correctly solved problems.
evaluation_dataset = []
correct_items = 0
# Give `generate` an explicit pad token id taken from the tokenizer.
model.generation_config.pad_token_id = tokenizer.pad_token_id

# Settings for optimising the compression embeddings of a single chunk.
epoch_amount = 100
acclumulation_steps = 1
compression_lr = 0.1
# Number of trailing tokens of a chunk that stay in the context verbatim.
kept_tail_len = max_new_tokens // 2


def _ignore_index_labels(batch_and_len):
    """Return a CUDA long tensor of shape (batch, length) filled with -100 (ignored by the loss)."""
    return torch.full(tuple(batch_and_len), -100, dtype=torch.long, device="cuda")


# For every problem: generate the reasoning chunk by chunk; after each chunk,
# train `compression_tokens_amount` embeddings so that
# [question, compressed history, new compression embeddings] predicts the second
# half of the chunk at least as well as the uncompressed context does; then
# continue generating from the compressed context. Finally generate the answer
# greedily and compare it with the gold answer.
for dataset_pos in tqdm(range(len(correct_dataset))):
    tokenized_turn = tokenize_single_turn(
        question=base_prompt.format(question=correct_dataset[dataset_pos]["problem"]),
        answer=correct_dataset[dataset_pos]["model_answer"],
        tokenizer=tokenizer,
    )
    for key in tokenized_turn.keys():
        tokenized_turn[key] = torch.tensor(tokenized_turn[key])

    device = "cuda"

    content_compression_mask = tokenized_turn["content_compression_mask"]

    # NOTE(review): the third-from-last position where the mask is 0 is taken as
    # the end of the question prefix; this depends on the layout produced by
    # `tokenize_single_turn` - TODO confirm.
    input_part_end = (content_compression_mask == 0).nonzero()[-3][0]
    # get only question part
    question_input_ids = (
        tokenized_turn["input_ids"][: int(input_part_end) + 1].unsqueeze(0).to(device)
    )

    # Mask over the question prefix with 0 and 1 swapped (4 is a temporary
    # marker). It does not change between chunks, so it is built once here.
    question_content_mask = content_compression_mask[
        : int(input_part_end) + 1
    ].clone()
    question_content_mask[question_content_mask == 0] = 4
    question_content_mask[question_content_mask == 1] = 0
    question_content_mask[question_content_mask == 4] = 1

    # Question labels used while training the compression embeddings; positions
    # where the swapped mask is 0 are excluded from the loss.
    question_labels = question_input_ids.clone()
    question_labels[0][question_content_mask == 0] = -100

    ######## start loop generation
    ########
    input_ids_embeds = model.get_input_embeddings()(question_input_ids)
    # Compression embeddings accumulated over all finished chunks. `None` until
    # the first chunk has been compressed (an explicit flag instead of comparing
    # a placeholder tensor's length with `compression_tokens_amount`, which
    # misfired for compression_tokens_amount == 1).
    compression_part = None
    generated_ids_new = None
    generated_embeds = None
    generated_embeds_prev = None
    end_of_think = False
    total_generated_text = ""
    # Real number of tokens produced by the chunk-wise generation.
    generated_tokens_count = 0

    for compression_step in text_tqdm(range(max_total_steps)):
        has_compression = compression_part is not None

        ######## generate new tokens
        ########
        with torch.no_grad():
            if has_compression:
                # Context = question + all compression embeddings + the verbatim
                # tail of the previous chunk.
                generated_embeds_prev = generated_embeds[
                    :, -kept_tail_len:, :
                ].clone()
                inputs_embeds = torch.cat(
                    [
                        input_ids_embeds,
                        compression_part,
                        generated_embeds_prev,
                    ],
                    dim=1,
                )
            else:
                # first time generation
                inputs_embeds = torch.cat(
                    [
                        input_ids_embeds,
                    ],
                    dim=1,
                )
            generated_ids_new = model.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=torch.ones(
                    inputs_embeds.shape[:2],
                    device=device,
                ).long(),
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=0.95,
                top_k=30,
                temperature=0.6,
            )
        generated_result = tokenizer.decode(generated_ids_new[-1])
        total_generated_text += generated_result
        generated_tokens_count += generated_ids_new.shape[1]
        print("=" * 50)
        generated_embeds = model.get_input_embeddings()(generated_ids_new)
        if END_THINK_ID in generated_ids_new[-1].tolist():
            # Reasoning is finished; the answer is generated after the loop.
            end_of_think = True
            break

        ########
        ######## get original language loss
        ########
        # Loss of the uncompressed context on the second half of the new chunk;
        # this is the target the compression embeddings have to reach.
        labels_parts = [question_input_ids]
        mask_parts = [question_content_mask]
        if has_compression:
            labels_parts += [
                _ignore_index_labels(compression_part.shape[:2]),
                _ignore_index_labels(generated_embeds_prev.shape[:2]),
            ]
            mask_parts += [
                torch.zeros(compression_part.shape[1]),
                torch.zeros(generated_embeds_prev.shape[1]),
            ]
        # First half of the chunk is context (0), the rest is scored (1). The
        # second part is sized as the remainder so the mask also matches the
        # labels when the chunk has an odd length (generation stopped early);
        # `n // 2` twice left the mask one element short and broke the indexing.
        generated_len = generated_ids_new.shape[1]
        context_len = generated_len // 2
        mask_parts += [
            torch.zeros(context_len),
            torch.ones(generated_len - context_len),
        ]
        labels = torch.cat(labels_parts + [generated_ids_new], dim=1)
        train_content_mask_new = torch.cat(mask_parts).long()
        labels[:, train_content_mask_new == 0] = -100

        # Same context that was used for generation, followed by the new chunk.
        new_input_embeds = torch.cat([inputs_embeds, generated_embeds], dim=1)

        with torch.no_grad():
            original_loss = model(
                inputs_embeds=new_input_embeds,
                labels=labels,
            ).loss
        print("original_loss", original_loss)
        original_loss_value = original_loss.item()

        ########
        ######## generate compress embeddings
        ########
        # Trainable embeddings, initialised with uniform noise scaled by the
        # standard deviation of the embedding matrix.
        compression_tensor = torch.nn.Parameter(
            torch.rand_like(
                new_input_embeds[:, :compression_tokens_amount, :],
            )
            * model.get_input_embeddings().weight.data.std(),
            requires_grad=True,
        )

        # Labels for the compressed sequence: only the kept tail of the chunk
        # (and the unmasked question positions) contribute to the loss.
        compressed_label_parts = [question_labels]
        if has_compression:
            compressed_label_parts.append(
                _ignore_index_labels(compression_part.shape[:2])
            )
        compressed_label_parts += [
            _ignore_index_labels(compression_tensor.shape[:2]),
            generated_ids_new[:, -kept_tail_len:],
        ]
        compressed_labels = torch.cat(compressed_label_parts, dim=-1)

        ########
        ######## train
        ########
        optimizer = torch.optim.Adam([compression_tensor], lr=compression_lr)
        # Constant parts of the training input, built once per chunk.
        compressed_prefix = [input_ids_embeds.detach()]
        if has_compression:
            compressed_prefix.append(compression_part.detach())
        kept_tail_embeds = generated_embeds[:, -kept_tail_len:, :].detach()

        for epoch in range(epoch_amount):
            compressed_inputs_embeds = torch.cat(
                compressed_prefix + [compression_tensor, kept_tail_embeds],
                dim=1,
            )
            compression_loss = model(
                inputs_embeds=compressed_inputs_embeds,
                labels=compressed_labels,
            ).loss
            compression_loss.backward()
            if (epoch + 1) % acclumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Stop as soon as the compressed context is at least as good as the
            # uncompressed one. The compared loss was measured before the
            # optimizer step taken just above.
            if compression_loss.item() <= original_loss_value:
                break
        print("compression_loss", compression_loss)

        # Append the newly trained embeddings to the compressed history.
        if has_compression:
            compression_part = torch.cat(
                [
                    compression_part,
                    compression_tensor.detach(),
                ],
                dim=1,
            )
        else:
            compression_part = compression_tensor.detach()

    # Final answer: continue greedily from the last context plus the last chunk.
    inputs_embeds = torch.cat(
        [
            inputs_embeds,
            generated_embeds,
        ],
        dim=1,
    )
    final_response = model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=torch.ones(
            inputs_embeds.shape[:2],
            device=device,
        ).long(),
        max_new_tokens=(max_total_steps - compression_step) * max_new_tokens,
        do_sample=False,
    )
    final_answer = tokenizer.decode(final_response[-1])
    total_generated_text += final_answer

    gold_answer = correct_dataset[dataset_pos]["answer"]
    answer = dataset_answer_filter(gold_answer)
    try:
        model_answer = model_answer_filter(total_generated_text)
        answers_match = is_equiv(answer, model_answer)
    except Exception as error:
        # The filtering cell guards these same calls with try/except; a
        # generation without an extractable answer counts as wrong instead of
        # aborting the whole evaluation run.
        print("answer extraction failed:", error)
        answers_match = False
    if answers_match:
        correct_items += 1
        print("CORRECT")
    else:
        print("WRONG", gold_answer)
        print(total_generated_text)

    compressed_total_len = inputs_embeds.shape[1] + final_response.shape[1]
    # Uses the real chunk lengths; `max_new_tokens * steps` overcounted whenever
    # a chunk was shorter than `max_new_tokens`.
    total_generated_tokens = final_response.shape[1] + generated_tokens_count
    original_total_len = len(
        tokenizer.encode(
            correct_dataset[dataset_pos]["model_answer"],
            add_special_tokens=False,
        )
    )
    print(
        f"вопрос+сжатые+сгенерированные={compressed_total_len}, всего_сгенерированно_токенов={total_generated_tokens} оригинальная_генерация={original_total_len}"
    )
    evaluation_dataset.append(
        {
            "original_total_len": original_total_len,
            "compressed_total_len": compressed_total_len,
        }
    )

  0%|          | 0/30 [00:00<?, ?it/s]



original_loss tensor(0.3264, device='cuda:0')




compression_loss tensor(0.3235, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3353, device='cuda:0')




compression_loss tensor(0.3136, device='cuda:0', grad_fn=<NllLossBackward0>)


 18%|█▊        | 2/11 [00:26<01:59, 13.26s/it]






CORRECT
вопрос+сжатые+сгенерированные=1193, всего_сгенерированно_токенов=1507 оригинальная_генерация=1959




original_loss tensor(0.3798, device='cuda:0')




compression_loss tensor(0.3741, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.2249, device='cuda:0')




compression_loss tensor(0.2209, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.1980, device='cuda:0')




compression_loss tensor(0.1965, device='cuda:0', grad_fn=<NllLossBackward0>)


 27%|██▋       | 3/11 [00:31<01:23, 10.45s/it]






CORRECT
вопрос+сжатые+сгенерированные=852, всего_сгенерированно_токенов=1735 оригинальная_генерация=1125




original_loss tensor(0.4056, device='cuda:0')




compression_loss tensor(0.4024, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.3287, device='cuda:0')




compression_loss tensor(0.3126, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.4619, device='cuda:0')




compression_loss tensor(0.4382, device='cuda:0', grad_fn=<NllLossBackward0>)


 27%|██▋       | 3/11 [00:30<01:21, 10.15s/it]






CORRECT
вопрос+сжатые+сгенерированные=863, всего_сгенерированно_токенов=1759 оригинальная_генерация=1548




original_loss tensor(0.4035, device='cuda:0')




compression_loss tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.2993, device='cuda:0')




compression_loss tensor(0.2966, device='cuda:0', grad_fn=<NllLossBackward0>)


 18%|█▊        | 2/11 [00:24<01:48, 12.03s/it]






WRONG 6
Okay, so I need to find the smallest positive integer \( n \) such that all the roots of the equation \( z^4 + z^2 + 1 = 0 \) are \( n \)-th roots of unity. Hmm, let me think about how to approach this.

First, I remember that roots of unity are complex numbers that satisfy \( z^n = 1 \) for some integer \( n \). So, I need to find the smallest \( n \) where all the roots of this quartic equation are such roots.

Let me start by analyzing the given equation: \( z^4 + z^2 + 1 = 0 \). It looks a bit like a quadratic in terms of \( z^2 \). Maybe I can factor it or find its roots by substitution.

Let me set \( w = z^2 \). Then the equation becomes \( w^2 + w + 1 = 0 \). That's a quadratic equation, so I can solve for \( w \) using the quadratic formula:

\( w = \frac{ -1 \pm \sqrt{1 - 4} }{2} = \frac{ -1 \pm \sqrt{ -3 } }{2} \).

So, \( w = \frac{ -1 \pm i\sqrt{3} }{2} \). These are complex numbers, specifically the primitive 3rd roots of unity. I remember that the roots of \( w^2

  0%|          | 0/11 [00:06<?, ?it/s]

original_loss tensor(0.3759, device='cuda:0')





KeyboardInterrupt: 

In [9]:
# Displayed tuple:
#   1) share of the filtered dataset that was evaluated,
#   2) solved problems relative to the whole filtered dataset,
#   3) accuracy on the evaluated subset.
# NOTE(review): `correct_items` is counted only over `correct_dataset`, so the
# second value mixes two different denominators.
filtered_size = len(dataset)
evaluated_size = len(correct_dataset)
(
    evaluated_size / filtered_size,
    correct_items / filtered_size,
    correct_items / evaluated_size,
)

(0.13392857142857142, 0.12946428571428573, 0.9666666666666667)

In [11]:
# Total token counts over all evaluated problems: the stored original
# generations versus the compressed runs, followed by their ratio.
original_total_len = sum(
    record["original_total_len"] for record in evaluation_dataset
)
compressed_total_len = sum(
    record["compressed_total_len"] for record in evaluation_dataset
)
original_total_len, compressed_total_len, compressed_total_len / original_total_len

(56056, 29497, 0.5262059369202227)

In [None]:
# Results log. Format of every line:
# (original_total_len, compressed_total_len, ratio) - accuracy, tokens per chunk, compression tokens
# test sample of 30 items
# (56056, 21924, 0.39110889110889113) - 0.8333333333333334, 200 tokens, 4 compression tokens
# (56056, 29497, 0.5262059369202227) - 0.9666666666666667, 400 tokens, 16 compression tokens
# (56056, 27994, 0.499393463679178) - 0.9, 400 tokens, 8 compression tokens
# (56056, 23062, 0.4114100185528757) - 0.9, 200 tokens, 16 compression tokens
# (56056, 30087, 0.5367311260168403) - 0.8666666666666667, 400, 32
# (56056, 19406, 0.3461895247609533) - 0.7333333333333333, 100, 2
# (56056, 28068, 0.5007135721421436) - 0.9333333333333333 - 400 tokens, 16 compression tokens, but they were not concatenated, just re-created at every step
# (56056, 28098, 0.5012487512487512) - 0.9333333333333333 - 400 tokens, 32 compression tokens, but they were not concatenated, just re-created at every step

# test sample of 202 items
# (408262, 192057, 0.4704258539859208) - 0.8564356435643564, 400 tokens, 16 compression tokens