In [1]:
import os

os.environ["WANDB_PROJECT"] = "hidden_capacity_reasoning"
from transformers import Qwen2ForCausalLM, Qwen2Model, AutoTokenizer, BitsAndBytesConfig
import torch
from trl import (
    ModelConfig,
    ScriptArguments,
    SFTConfig,
    SFTTrainer,
    TrlParser,
    get_kbit_device_map,
)

from datasets import load_dataset
from tqdm import tqdm
from hidden_capacity_reasoning.utils import (
    generate_train_examples,
    pad_train_examples,
    tokenize_single_turn,
)
from datasets import Dataset
import gc
import types

# need for auto SFTTrainer patch(possible increase speed)
from unsloth import is_bfloat16_supported
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from hidden_capacity_reasoning.utils import (
    EOS_TOKEN_ID,
    TEXT_TOKEN_ID,
    WINDOW_SIZE,
    VISION_START,
    VISION_END,
    find_all_linear_names_v3,
)

import time
from datetime import datetime


from hidden_capacity_reasoning.models import (
    Qwen2ForCausalLMCompressionV1,
    Qwen2ModelEmbedPoolerV1,
    Qwen2ForCausalLMCompressionV2,
    Qwen2ModelEmbedPoolerV2,
    Qwen2ForCausalLMCompressionV3,
    Qwen2ModelEmbedPoolerV3,
)

# model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model_name = "my_r1_model_v3"
model = Qwen2ForCausalLMCompressionV3.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    # torch_dtype=torch.float32,
    device_map={"": 0},
    attn_implementation="flash_attention_2",
)
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.model.requires_grad_(False)
# model = model.to(torch.bfloat16)

# temp_model = Qwen2ModelEmbedPoolerV3.from_pretrained(
#     model_name,
#     attn_implementation="flash_attention_2",
#     torch_dtype=torch.bfloat16,
#     device_map={"": 0},
#     # quantization_config=BitsAndBytesConfig(load_in_4bit=True),
# )
# print(
#     model.embed_pooler.load_state_dict(
#         temp_model.state_dict(),
#         strict=False,
#     ),
# )
# temp_model = temp_model.cpu()
# del temp_model
# gc.collect()
# torch.cuda.empty_cache()

dataset = load_dataset("dim/open_orca_905_DeepSeek-R1-Distill-Qwen-1.5B")
dataset = dataset["train"]
dataset = dataset.train_test_split(test_size=10, seed=42)

# test pass
tokenize_single_turn(
    question=dataset["train"][0]["question"],
    answer=dataset["train"][0]["answer"],
    tokenizer=tokenizer,
)
train_examples = [
    tokenize_single_turn(tokenizer=tokenizer, **item)
    for item in tqdm(dataset["train"].to_list()[:3])
]

prepared_train_examples = []
for item in tqdm(train_examples):
    for example in generate_train_examples(
        dataset_batch=[item],
        window_size=WINDOW_SIZE,
    ):
        prepared_train_examples.append(example)

print(
    "max_len",
    max([len(item["original_tokens"]) for item in prepared_train_examples]),
)

new_dataset = Dataset.from_list(prepared_train_examples)
print(dataset)


def collate_fn(batch):
    padded_batch = pad_train_examples(
        train_examples=batch,
        tokenizer=tokenizer,
    )
    padded_batch = {
        "replaced_original_tokens": padded_batch["replaced_original_tokens"][
            "input_ids"
        ],
        "compressed_input_ids": padded_batch["compressed_input_ids"]["input_ids"],
        "original_tokens": padded_batch["original_tokens"]["input_ids"],
        "attention_mask": padded_batch["compressed_input_ids"]["attention_mask"],
        "labels": padded_batch["compressed_input_ids"]["input_ids"],
        "content_compression_mask": padded_batch["content_compression_mask"][
            "input_ids"
        ],
    }
    for key in padded_batch.keys():
        padded_batch[key] = torch.tensor(padded_batch[key])
    skip_ids = [
        TEXT_TOKEN_ID,
        EOS_TOKEN_ID,
        VISION_START,
        VISION_END,
    ]
    for skip_id in skip_ids:
        padded_batch["labels"][padded_batch["labels"] == skip_id] = -100
    # часть инпута от пользователя
    last_index = (padded_batch["content_compression_mask"] == 1).long().nonzero()[-1][1]
    padded_batch["labels"][:, :last_index][
        padded_batch["content_compression_mask"][:, :last_index] == 1
    ] = -100
    # print(padded_batch)
    return padded_batch


peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    target_modules=find_all_linear_names_v3(model=model),
    modules_to_save=[
        "embed_pooler.model.embed_tokens",
        "embed_pooler.weight_pooler",
    ],
)

formatted_date = datetime.fromtimestamp(time.time()).strftime("%Y_%m_%d_%H_%M_%S_%f")
model.embed_pooler = prepare_model_for_kbit_training(model.embed_pooler)
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=new_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        warmup_steps=5,
        num_train_epochs=1,  # 90,  # Set this for 1 full training run.
        # num_train_epochs=90,  # Set this for 1 full training run.
        # max_steps=10000,
        learning_rate=1e-4,
        bf16=True,
        # fp16=model.dtype == torch.float16,
        logging_steps=8,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=f"outputs/{formatted_date}",
        # report_to="wandb",
        report_to="none",
        remove_unused_columns=False,
        dataset_kwargs={"skip_prepare_dataset": True},
        # gradient_checkpointing=True,
        save_steps=10000,
        run_name=formatted_date,
    ),
)
trainer.train()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00, 956.22it/s]
100%|██████████| 3/3 [00:00<00:00, 38.69it/s]


max_len 580
DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 895
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 10
    })
})
trainable params: 254,197,760 || all params: 3,577,359,360 || trainable%: 7.1057


  trainer = SFTTrainer(
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
8,0.0
16,0.0
24,0.0
32,0.0
40,0.0
48,0.0
56,0.0
64,0.0
72,0.0
80,0.0


TrainOutput(global_step=169, training_loss=0.0, metrics={'train_runtime': 61.8415, 'train_samples_per_second': 10.931, 'train_steps_per_second': 2.733, 'total_flos': 0.0, 'train_loss': 0.0})

In [4]:
model.save_pretrained("my_r1_model_v3")

In [5]:
tokenizer.save_pretrained("my_r1_model_v3")

('my_r1_model_v3/tokenizer_config.json',
 'my_r1_model_v3/special_tokens_map.json',
 'my_r1_model_v3/tokenizer.json')

In [None]:
trainer.model

In [None]:
trainer.model.base_model.embed_pooler

In [3]:
model.to(torch.bfloat16)
model.embed_pooler(
    torch.randn(
        2,
        2,
        1536,
        device="cuda",
        dtype=torch.bfloat16,
    )
).shape

torch.Size([2, 1, 1536])

In [3]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Layer: {name}, Requires Gradient: {param.requires_grad}")

Layer: embed_pooler.model.embed_tokens.modules_to_save.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.self_attn.q_proj.lora_A.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.self_attn.q_proj.lora_B.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.self_attn.k_proj.lora_A.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.self_attn.k_proj.lora_B.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.self_attn.v_proj.lora_A.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.self_attn.v_proj.lora_B.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.self_attn.o_proj.lora_A.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.self_attn.o_proj.lora_B.default.weight, Requires Gradient: True
Layer: embed_pooler.model.layers.0.mlp.gate_proj.lora_A.default.weight, Requires Gradient: True
Layer: embed_pool

In [2]:
for name, param in trainer.model.named_parameters():
    if param.requires_grad:
        print(f"Layer: {name}, Requires Gradient: {param.requires_grad}")

Layer: base_model.model.embed_pooler.model.embed_tokens.modules_to_save.default.weight, Requires Gradient: True
Layer: base_model.model.embed_pooler.model.layers.0.self_attn.q_proj.lora_A.default.weight, Requires Gradient: True
Layer: base_model.model.embed_pooler.model.layers.0.self_attn.q_proj.lora_B.default.weight, Requires Gradient: True
Layer: base_model.model.embed_pooler.model.layers.0.self_attn.k_proj.lora_A.default.weight, Requires Gradient: True
Layer: base_model.model.embed_pooler.model.layers.0.self_attn.k_proj.lora_B.default.weight, Requires Gradient: True
Layer: base_model.model.embed_pooler.model.layers.0.self_attn.v_proj.lora_A.default.weight, Requires Gradient: True
Layer: base_model.model.embed_pooler.model.layers.0.self_attn.v_proj.lora_B.default.weight, Requires Gradient: True
Layer: base_model.model.embed_pooler.model.layers.0.self_attn.o_proj.lora_A.default.weight, Requires Gradient: True
Layer: base_model.model.embed_pooler.model.layers.0.self_attn.o_proj.lora_B.

In [3]:
model = trainer.model

In [None]:
dataset

In [1]:
from peft import PeftModel
from hidden_capacity_reasoning.models import (
    Qwen2ForCausalLMCompressionV1,
    Qwen2ModelEmbedPoolerV1,
    Qwen2ForCausalLMCompressionV2,
    Qwen2ModelEmbedPoolerV2,
)
import torch
from transformers import AutoTokenizer

# model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model_name = "r1_compressor_v2"
model = Qwen2ForCausalLMCompressionV2.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
    attn_implementation="flash_attention_2",
)
model = PeftModel.from_pretrained(
    model,
    # "outputs/2025_04_19_17_17_34_493839/checkpoint-210000",
    # "outputs/2025_04_19_17_17_34_493839/checkpoint-50000",
    # "outputs/2025_04_21_19_23_11_642509/checkpoint-10000",
    "outputs/2025_04_22_01_43_43_347583/checkpoint-90000",
    # "outputs/2025_04_22_01_43_43_347583/checkpoint-10000",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-23 00:08:33 __init__.py:190] Automatically detected platform cuda.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from datasets import load_dataset

dataset = load_dataset(
    "dim/hendrycks_math_train_12k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096"
    # "dim/hendrycks_math_test_500_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096"
)

dataset = dataset["train"].train_test_split(
    test_size=250,
    seed=42,
)
dataset = dataset["test"].filter(lambda x: x["model_answer"].count("</think>") == 1)

from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)

correct_dataset = []

for pos, item in enumerate(dataset):
    try:
        answer = dataset_answer_filter(item["answer"])
        model_answer = model_answer_filter(item["model_answer"])
        # print(answer, model_answer)
        # break
        if is_equiv(answer, model_answer):
            correct_dataset.append(item)
    except:
        pass

len(dataset), len(correct_dataset), len(correct_dataset) / len(dataset)

'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'


(155, 136, 0.8774193548387097)

In [3]:
correct_dataset[0]

{'problem': 'Below is a magic square, meaning that the sum of the numbers in each row, in each column, and in each of the $2$ main diagonals are equal. What is the value of $n$?\n\n[asy]size(125);\nfor(int i = 0; i<4; ++i)\n{\n\ndraw((0,i)--(3,i),linewidth(1));\n}\n\nfor(int j = 0; j<4; ++j)\n{\n\ndraw((j,0)--(j,3),linewidth(1));\n}\n\nlabel("$n-3$",(.5,.5));\nlabel("3",(.5,1.5));\nlabel("$n+1$",(.5,2.5));\n\nlabel("$n+2$",(1.5,.5));\nlabel("$2n-9$",(1.5,1.5));\nlabel("$1$",(1.5,2.5));\n\nlabel("$2$",(2.5,.5));\nlabel("$n$",(2.5,1.5));\nlabel("$n-1$",(2.5,2.5));\n[/asy]',
 'solution': 'First, we can evaluate the sum across the first row, which gives $(n+1)+1+(n-1)=2n+1$.  Evaluate the sum of the entries across the second row, $3+(2n-9)+n=3n-6$. Now, since we have a magic square, these two sums are equal.  So $2n+1=3n-6$. Isolating $n$, we obtain $n = \\boxed{7}$.\n\nThe square will look like: [asy] size(2cm);\ndraw((0,0)--(3,0)--(3,3)--(0,3)--cycle,linewidth(1));\ndraw((1,0)--(1,3),lin

In [None]:
model = trainer.model

In [29]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cuda"
# prompt = "how many wings has a bird?"
prompt = correct_dataset[:5][0]["problem"]

base_prompt = open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
).read()

messages = [
    {"role": "user", "content": base_prompt.format(question=prompt)},
]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(text)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

with torch.no_grad():
    # generated_ids = model.generate(
    #     model_inputs.input_ids,
    #     max_new_tokens=1,
    #     do_sample=False,
    # )
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=4096,
        do_sample=False,
        # do_sample=not False,
        # temperature=0.6,
        # top_p=0.95,
    )
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

###

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<｜begin▁of▁sentence｜><｜User｜>Problem: An elephant and a lion are currently 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs directly towards the elephant at 24 miles per hour.  How many minutes will it take for the lion to catch the elephant?

Please reason step by step, and put your final answer within \boxed{}.<｜Assistant｜><think>



"Okay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards it at 24 mph. So, their relative speed is the difference between their speeds because they're moving towards each other in terms of closing the distance. Wait, no, actually, the elephant is moving away, so the lion is closing the distance at the sum of their speeds? Or is it the difference?\n\nLet me clarify. If two objects are moving towards each other, their relative speed is the sum of their individual speeds. But in this case, the elephant is moving away, so the lion is mov

In [None]:
correct_dataset[0]

### Greedy Generation

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cuda"
prompt = correct_dataset[:5][0]["problem"]

base_prompt = open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
).read()

generated_tokens = tokenizer.apply_chat_template(
    [
        {"role": "user", "content": base_prompt.format(question=prompt)},
    ],
    tokenize=True,
    add_generation_prompt=True,
)
# initial_len = generated_tokens.shape
with torch.no_grad():
    generated_tokens = torch.tensor(generated_tokens).unsqueeze(0).cuda()
    generated_embeds = model.get_input_embeddings()(generated_tokens)
    max_steps = 1200
    past_key_values = None
    for step in range(max_steps):
        if step == 0:
            logits = model(
                inputs_embeds=generated_embeds,
                attention_mask=torch.ones(1, generated_embeds.shape[1]).long().cuda(),
                position_ids=torch.arange(generated_embeds.shape[1])
                .cuda()
                .unsqueeze(0),
                use_cache=True,
                past_key_values=None,
            )
            past_key_values = logits.past_key_values
            logits = logits.logits[:, -1, :].clone().float()
        else:
            logits = model(
                # input_ids=generated_tokens[-1][-1:].unsqueeze(0),
                inputs_embeds=generated_embeds[:, -1:, :],
                attention_mask=torch.ones(1, generated_embeds.shape[1]).long().cuda(),
                position_ids=torch.tensor(generated_embeds.shape[1] - 1)
                .reshape(1, 1)
                .cuda(),
                use_cache=True,
                past_key_values=past_key_values,
            )
            past_key_values = logits.past_key_values

            logits = logits.logits[:, -1, :].clone().float()

        top_token = logits.argmax(-1)[-1]
        top_token_embed = model.get_input_embeddings()(top_token)
        # print(top)
        generated_tokens = torch.cat([generated_tokens, top_token.reshape(1, 1)], dim=1)
        generated_embeds = torch.cat(
            [generated_embeds, top_token_embed.reshape(1, 1, -1)],
            dim=1,
        )
        # print(step, tokenizer.decode(generated_tokens[-1]))
    # break
print(tokenizer.decode(generated_tokens[-1]))
# break
embeds_generation_tokens = generated_tokens[-1]
###

In [106]:
tokenizer.decode(generated_tokens[-1])

"<｜begin▁of▁sentence｜><｜User｜>Problem: An elephant and a lion are currently 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs directly towards the elephant at 24 miles per hour.  How many minutes will it take for the lion to catch the elephant?\n\nPlease reason step by step, and put your final answer within \\boxed{}.<｜Assistant｜><think>\nOkay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards it at 24 mph. So, their relative speed is the difference between their speeds because they

In [14]:
from transformers.generation import utils
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cuda"
dataset_pos = 10
prompt = correct_dataset[dataset_pos]["problem"]

base_prompt = open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
).read()

generated_tokens = tokenizer.apply_chat_template(
    [
        {
            "role": "user",
            "content": base_prompt.format(question=prompt),
        },
    ],
    tokenize=True,
    add_generation_prompt=True,
)
generated_tokens = torch.tensor(generated_tokens).unsqueeze(0).cuda()
generated_embeds = model.get_input_embeddings()(generated_tokens)

generated_ids = model.generate(
    inputs_embeds=generated_embeds,
    max_new_tokens=1800,
    # max_new_tokens=5,
    do_sample=False,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [34]:
correct_dataset[dataset_pos]

{'problem': "Alicia's average on her five tests is 88 points. The score range for each test is 0 points to 100 points, inclusive. What is the lowest possible score that Alicia could have earned on one of the five tests?",
 'solution': "If Alicia's average score on her five tests is 88 points, then the sum of her scores must be $88 \\times 5 = 440$ points. If she earned   100 points on four of the tests, then she could have earned a score as low as $\\boxed{40\\text{ points}}$ on the other test.",
 'answer': '40\\text{ points}',
 'subject': 'Prealgebra',
 'level': 4,
 'unique_id': 'test/prealgebra/1697.json',
 'model_answer': "Okay, so I need to figure out the lowest possible score Alicia could have earned on one of her five tests. She has an average of 88 points, and each test is scored between 0 and 100, inclusive. Hmm, let me think about this step by step.\n\nFirst, I know that an average is calculated by taking the sum of all the test scores and dividing it by the number of tests. I

In [16]:
import json

example_id = correct_dataset[dataset_pos]["unique_id"].replace("/", "_")
with open(f"./temp/{example_id}", "w") as f:
    temp = {
        "generated_ids": generated_ids.tolist(),
        "input_ids": generated_tokens.tolist(),
    }
    json.dump(temp, f)

In [None]:
tokenizer.encode(
    correct_dataset[dataset_pos]["model_answer"],
    add_special_tokens=False,
)

In [None]:
tokenizer.decode(generated_ids[-1])

In [None]:
tokenizer.decode(generated_tokens[-1])

In [None]:
# <｜begin▁of▁sentence｜><｜User｜>Problem: An elephant and a lion are currently 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs directly towards the elephant at 24 miles per hour.  How many minutes will it take for the lion to catch the elephant?\n\nPlease reason step by step, and put your final answer within \\boxed{}.<｜Assistant｜><think>\nOkay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards it at 24 mph. So, their relative speed is the difference between their speeds because they're moving towards each other. Wait, no, actually, since the elephant is moving away, the lion has to cover the distance that the elephant is moving away plus the distance the elephant covers while the lion is moving towards it.\n\nLet me clarify. The lion is moving towards the elephant at 24 mph, and the elephant is moving away at 19 mph. So, the lion is closing the gap at a rate of 24 mph minus 19 mph, which is 5 mph. That makes sense because if two objects are moving towards each other, their relative speed is the sum of their speeds, but in this case, one is moving away and the other is moving towards, so it's the difference.\n\nSo, the initial distance between them is 1 mile. The lion is closing the gap at 5 mph. To find the time it takes to catch up, I can use the formula:\n\nTime = Distance / Speed\n\nSo, plugging in the numbers, the time should be 1 mile divided by 5 mph. That gives me 0.2 hours. But the question asks for the time in minutes, so I need to convert 0.2 hours to minutes. Since 1 hour is 60 minutes, 0.2 hours is 0.2 * 60 = 12 minutes.\n\nWait, let me double-check that. If the lion is moving at 24 mph and the elephant is moving away at 19 mph, the relative speed is 24 - 19 = 5 mph. So, yes, the lion is gaining on the elephant at 5 mph. So, 1 mile divided by 5 mph is indeed 0.2 hours, which is 12 minutes. That seems right.\n\nBut just to make sure I didn't make a mistake, let me think about it another way. Maybe set up an equation for their positions as functions of time and see if I get the same result.\n\nLet's denote t as the time in hours it takes for the lion to catch the elephant. In that time, the elephant will have moved a distance of 19t miles away from the starting point, and the lion will have moved 24t miles towards the elephant. Since they start 1 mile apart, the distance between them when the lion catches the elephant will be zero.\n\nSo, the distance the lion covers plus the distance the elephant covers should equal the initial distance between them. Wait, no, actually, the lion is moving towards the elephant, so the distance the lion covers is 24t, and the distance the elephant covers is 19t. But since the elephant is moving away, the total distance between them when the lion catches up is 24t - 19t = 5t. This should equal the initial distance, which is 1 mile.\n\nSo, 5t = 1 mile. Solving for t, we get t = 1/5 hours, which is 0.2 hours. Converting that to minutes, 0.2 * 60 = 12 minutes. Yep, same result. So, that seems consistent.\n\nAlternatively, maybe I can think about it in terms of how much distance the lion needs to cover relative to the elephant. Since the lion is moving faster, it's gaining on the elephant at 5 mph. So, the lion needs to cover the 1 mile gap at a relative speed of 5 mph. So, time = distance / speed = 1 / 5 hours, which is 12 minutes. Yep, same answer.\n\nI think that's solid. So, the lion will catch the elephant in 12 minutes.\n\n**Final Answer**\nThe lion will catch the elephant in \\boxed{12} minutes.\n</think>\n\nThe elephant and the lion are initially 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs directly towards the elephant at 24 miles per hour. \n\nTo find the time it takes for the lion to catch the elephant, we first determine their relative speed. Since the lion is moving towards the elephant and the elephant is moving away, their relative speed is the difference between their speeds:\n\n\\[\n24 \\text{ mph} - 19 \\text{ mph} = 5 \\text{ mph}\n\\]\n\nThe initial distance between them is 1 mile. Using the formula for time, which is distance divided by speed, we get:\n\n\\[\n\\text{Time} = \\frac{1 \\text{ mile}}{5 \\text{ mph}} = 0.2 \\text{ hours}\n\\]\n\nConverting 0.2 hours to minutes:\n\n\\[\n0.2 \\text{ hours} \\times 60 \\text{ minutes per hour} = 12 \\text{ minutes}\n\\]\n\nThus, the lion will catch the elephant in \\boxed{12} minutes.<｜end▁of▁sentence｜><｜begin▁of▁sentence｜>\n\nTo determine how long it will take for the lion to catch the elephant, we need to consider their relative speed. The elephant is running away at
# Okay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards it at 24 mph. So, their relative speed is the difference between their speeds because they're moving towards each other. Wait, no, actually, since the elephant is moving away, the lion has to cover the distance that the elephant is moving away plus the distance the elephant covers while the lion is moving towards it.\n\nLet me clarify. The lion is moving towards the elephant at 24 mph, and the elephant is moving away at 19 mph. So, the lion is closing the gap at a rate of 24 mph minus 19 mph, which is 5 mph. That makes sense because if two objects are moving towards each other, their relative speed is the sum of their speeds, but in this case, one is moving away and the other is moving towards, so it's the difference.\n\nSo, the initial distance between them is 1 mile. The lion is closing the gap at 5 mph. To find the time it takes to catch up, I can use the formula:\n\nTime = Distance / Speed\n\nSo, plugging in the numbers, the time should be 1 mile divided by 5 mph. That gives me 0.2 hours. But the question asks for the time in minutes, so I need to convert 0.2 hours to minutes. Since 1 hour is 60 minutes, 0.2 hours is 0.2 * 60 = 12 minutes.\n\nWait, let me double-check that. If the lion is moving at 24 mph and the elephant is moving away at 19 mph, the relative speed is 24 - 19 = 5 mph. So, yes, the lion is gaining on the elephant at 5 mph. So, 1 mile divided by 5 mph is indeed 0.2 hours, which is 12 minutes. That seems right.\n\nBut just to make sure I didn't make a mistake, let me think about it another way. Maybe set up an equation for their positions as functions of time and see if I get the same result.\n\nLet's denote t as the time in hours it takes for the lion to catch the elephant. In that time, the elephant will have moved a distance of 19t miles away from the starting point, and the lion will have moved 24t miles towards the elephant. Since they start 1 mile apart, the distance between them when the lion catches the elephant will be zero.\n\nSo, the distance the lion covers plus the distance the elephant covers should equal the initial distance between them. Wait, no, actually, the lion is moving towards the elephant, so the distance the lion covers is 24t, and the distance the elephant covers is 19t. But since the elephant is moving away, the total distance between them when the lion catches up is 24t - 19t = 5t. This should equal the initial distance, which is 1 mile.\n\nSo, 5t = 1 mile. Solving for t, we get t = 1/5 hours, which is 0.2 hours. Converting that to minutes, 0.2 * 60 = 12 minutes. Yep, same result. So, that seems consistent.\n\nAlternatively, maybe I can think about it in terms of how much distance the lion needs to cover relative to the elephant. Since the lion is moving faster, it's gaining on the elephant at 5 mph. So, the lion needs to cover the 1 mile gap at a relative speed of 5 mph. So, time = distance / speed = 1 / 5 hours, which is 12 minutes. Yep, same answer.\n\nI think that's solid. So, the lion will catch the elephant in 12 minutes.\n\n**Final Answer**\nThe lion will catch the elephant in \\boxed{12} minutes.\n</think>\n\nThe elephant and the lion are initially 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs directly towards the elephant at 24 miles per hour. \n\nTo find the time it takes for the lion to catch the elephant, we first determine their relative speed. Since the lion is moving towards the elephant and the elephant is moving away, their relative speed is the difference between their speeds:\n\n\\[\n24 \\text{ mph} - 19 \\text{ mph} = 5 \\text{ mph}\n\\]\n\nThe initial distance between them is 1 mile. Using the formula for time, which is distance divided by speed, we get:\n\n\\[\n\\text{Time} = \\frac{1 \\text{ mile}}{5 \\text{ mph}} = 0.2 \\text{ hours}\n\\]\n\nConverting 0.2 hours to minutes:\n\n\\[\n0.2 \\text{ hours} \\times 60 \\text{ minutes per hour} = 12 \\text{ minutes}\n\\]\n\nThus, the lion will catch the elephant in \\boxed{12} minutes.<｜end▁of▁sentence｜>

In [None]:
print(tokenizer.decode(generated_ids[-1]))

In [None]:
print(tokenizer.decode(generated_tokens[-1]))

In [23]:
tokenizer.decode(generated_tokens[-1])

"<｜begin▁of▁sentence｜><｜User｜>An elephant and a lion are currently 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs directly towards the elephant at 24 miles per hour.  How many minutes will it take for the lion to catch the elephant?<｜Assistant｜><think>\nOkay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards the elephant at 24 mph. So, their speeds are different, which means the distance between them will be changing over time. I need to find the time it takes for the lion to clo

In [None]:
# Okay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards the elephant at 24 mph. So, their speeds are different, which means the distance between them is changing over time. I need to find the time it takes for the lion to close that 1 mile gap.\n\nI remember that when two objects are moving towards each other, their relative speed is the sum of their individual speeds. But in this case, the elephant is moving away, so it's like the lion is chasing the elephant, but the elephant is moving away. So, maybe I should consider the relative speed between the lion and the elephant.\n\nWait, actually, since the elephant is moving away, the lion has to cover the initial distance plus any additional distance the elephant might cover while the lion is chasing. But since the lion is faster, it should eventually catch up. So, maybe I can model this as a relative speed problem.\n\nLet me denote the speed of the lion as \\( v_l = 24 \\) mph and the speed of the elephant as \\( v_e = 19 \\) mph. The initial distance between them is \\( d = 1 \\) mile.\n\nSince the lion is moving towards the elephant and the elephant is moving away, the relative speed at which the distance between them is decreasing is the difference between the lion's speed and the elephant's speed. So, the relative speed \\( v_{relative} = v_l - v_e = 24 - 19 = 5 \\) mph.\n\nWait, is that right? If the lion is moving towards the elephant and the elephant is moving away, then the lion is effectively closing the gap at a rate of 5 mph. So, the time it takes to close the 1 mile gap would be the initial distance divided by the relative speed.\n\nSo, time \\( t = \\frac{d}{v_{relative}} = \\frac{1}{5} \\) hours. But the question asks for the time in minutes, so I need to convert that.\n\nSince 1 hour is 60 minutes, \\( \\frac{1}{5} \\) hours is \\( \\frac{1}{5} \\times 60 = 12 \\) minutes. So, it should take 12 minutes for the lion to catch the elephant.\n\nWait, let me double-check that. If the lion is moving at 24 mph and the elephant is moving away at 19 mph, then the lion is gaining on the elephant at a rate of 5 mph. So, to cover the 1 mile gap, it would take \\( \\frac{1}{5} \\) hours, which is indeed 12 minutes. That seems correct.\n\nAlternatively, I can model this with equations. Let me set up a coordinate system where at time \\( t \\) hours, the position of the lion is \\( 24t \\) miles from its starting point, and the position of the elephant is \\( 1 + 19t \\) miles from the starting point of the lion. Wait, actually, hold on. If the elephant is moving away from the lion, then the distance between them is increasing. So, the position of the elephant is \\( 1 + 19t \\) miles from the starting point of the lion, and the position of the lion is \\( 24t \\) miles from the same starting point.\n\nWait, but initially, they are 1 mile apart. So, if the lion starts at position 0 and the elephant starts at position 1 mile, then the distance between them at time \\( t \\) is \\( |24t - (1 + 19t)| \\). We want this distance to be 0 when the lion catches the elephant.\n\nSo, setting up the equation:\n\n\\( 24t - (1 + 19t) = 0 \\)\n\nSimplify:\n\n\\( 24t - 19t - 1 = 0 \\)\n\n\\( 5t - 1 = 0 \\)\n\n\\( 5t = 1 \\)\n\n\\( t = \\frac{1}{5} \\) hours, which is 12 minutes. So, that confirms my earlier result.\n\nAlternatively, I can think about it in terms of distance covered. The lion needs to cover the initial 1 mile plus whatever distance the elephant covers in that time. But since the lion is faster, the time is determined by the relative speed.\n\nWait, another way to think about it is using the concept of relative velocity. The lion is moving towards the elephant at 24 mph, and the elephant is moving away at 19 mph. So, the relative speed is 24 - 19 = 5 mph. So, the lion is closing the gap at 5 mph. Therefore, the time to close 1 mile is 1/5 hours, which is 12 minutes.\n\nI think that's solid. I can't see any mistakes in this reasoning. So, I think the answer is 12 minutes.\n\n**Final Answer**\nThe lion will catch the elephant in \\boxed{12} minutes.\n</think>\n\nThe elephant and the lion are initially 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs directly towards the elephant at 24 miles per hour. To determine how long it will take for the lion to catch the elephant, we need to consider their relative speed.\n\nThe relative speed at which the lion is closing the gap is the difference between their speeds:\n\\[ v_{\\text{relative}} = v_l - v_e = 24 \\text{ mph} - 19 \\text{ mph} = 5 \\text{ mph} \\]\n\nThe time it takes to close the 1 mile gap is calculated by dividing the initial distance by the relative speed:\n\\[ t = \\frac{1 \\text{ mile}}{5 \\text{ mph}} = \\frac{1}{5} \\text{ hours} \\]\n\nConverting this time into minutes:\n\\[ \\frac{1}{5} \\text{ hours} \\times 60 \\text{ minutes per hour} = 12 \\text{ minutes} \\]\n\nThus, the lion will catch the elephant in \\boxed{12} minutes.

### Generate with compression

In [None]:
from transformers import AutoTokenizer
from hidden_capacity_reasoning.utils import (
    WINDOW_SIZE,
    VISION_START,
    VISION_END,
    EOS_TOKEN_ID,
)
import torch
import json

torch.manual_seed(0)

dataset_pos = 11
input_ids = correct_dataset[dataset_pos]["problem"]
print(input_ids)
print("===")
print("===")
base_prompt = open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
).read()

input_ids = [
    tokenizer.apply_chat_template(
        [
            {
                "role": "user",
                "content": base_prompt.format(question=input_ids),
            },
        ],
        tokenize=True,
        add_generation_prompt=True,
    )
]

# input_ids = dataset_item["input_ids"]
# generated_ids = dataset_item["generated_ids"]
generated_ids = [
    tokenizer.encode(
        correct_dataset[dataset_pos]["model_answer"],
        add_special_tokens=False,
    )
]

tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cuda"

# generated_tokens = tokenizer.apply_chat_template(
#     [
#         # {"role": "user", "content": "how many wings has a bird?"},
#         {"role": "user", "content": example["question"]},
#     ],
#     tokenize=True,
#     add_generation_prompt=True,
# )
generated_tokens = input_ids

with torch.no_grad():
    start_embed = model.base_model.embed_pooler.model.get_input_embeddings()(
        # start_embed = model.base_model.embed_pooler.model.embed_tokens.modules_to_save.default(
        torch.tensor([[VISION_START]], device="cuda")
    )
    end_embed = model.base_model.embed_pooler.model.get_input_embeddings()(
        # end_embed = model.base_model.embed_pooler.model.embed_tokens.modules_to_save.default(
        torch.tensor([[VISION_END]], device="cuda")
    )
    input_ids = torch.tensor(input_ids).cuda()
    input_ids_embeds = model.get_input_embeddings()(input_ids)
    windows_amount = 100
    # windows_amount = 200
    # windows_amount = 300
    # windows_amount = 400
    # windows_amount = 500
    # windows_amount = 2
    next_true_tokens = torch.tensor(generated_ids, device="cuda")[
        :, : WINDOW_SIZE * windows_amount
    ]

    # next_true_tokens = torch.tensor(next_true_tokens, device="cuda")

    original_embeds = (
        # model.base_model.embed_pooler.model.get_input_embeddings()(next_true_tokens)
        # model.base_model.model.get_input_embeddings()(next_true_tokens)
        model.get_input_embeddings()(next_true_tokens)
    ).to(torch.bfloat16)

    # compressed_part = model.base_model.embed_pooler(new_embeds_for_compression)
    new_embeds_for_compression = original_embeds.reshape(
        windows_amount, WINDOW_SIZE, -1
    )
    compressed_part = model.base_model.embed_pooler(new_embeds_for_compression)
    compressed_part = compressed_part.reshape(1, windows_amount, -1)
    start_embed = torch.rand_like(start_embed)
    compressed_part = torch.rand_like(compressed_part)
    end_embed = torch.rand_like(end_embed)
    generated_embeds = torch.cat(
        [
            input_ids_embeds,
            start_embed,
            compressed_part,
            end_embed,
        ],
        dim=1,
    )
    print("COMPRESSED PART", tokenizer.decode(next_true_tokens[-1]))
    print("===")
    generated_ids_compressed = model.generate(
        inputs_embeds=generated_embeds,
        max_new_tokens=2800,
        # max_new_tokens=5,
        do_sample=False,
        # do_sample=True,
        # temperature=0.6,
        # top_p=0.95,
    )
    # break
print(tokenizer.decode(generated_ids_compressed[-1]))
# break
# embeds_generation_tokens = generated_tokens[-1]

Suppose that $f$ is a function and $f^{-1}$ is the inverse of $f$.  If $f(1)=2$, $f(2) = 6$, and $f(3)=5$, then what is $f^{-1}(f^{-1}(6))$?
===
===


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


COMPRESSED PART Okay, so I have this problem here: There's a function f, and its inverse function is f inverse. I know that f(1) = 2, f(2) = 6, and f(3) = 5. I need to find f inverse of f inverse of 6, which is written as f^{-1}(f^{-1}(6)). Hmm, okay. Let me think about how to approach this.

First, let me recall what an inverse function does. If f is a function that maps an input to an output, then the inverse function f^{-1} maps the output back to the input. So, basically, if f(a) = b, then f^{-1}(b) = a. That makes sense. So, if I have f(1)=2, that means f^{-1}(2)=1. Similarly, f(2)=6 implies f^{-1}(6)=2, and f(3)=5 means
===
。)

首先，我需要明确题目中的信息。题目给出了函数f的三个点：f(1)=2，f(2)=6，f(3)=5。然后要求计算f^{-1}(f^{-1}(6))。

首先，我需要理解f^{-1}是什么。f^{-1}是f的反函数，也就是说，如果f(a)=b，那么f^{-1}(b)=a。因此，我需要先找到f^{-1}的值，然后应用两次f^{-1}。

接下来，我来逐步计算。首先，计算f^{-1}(6)。根据f的定义，f(2)=6，所以f^{-1}(6)=2。接下来，我需要计算f^{-1}(2)。这里，我需要找到一个x，使得f(x)=2。根据f的定义，f(1)=2，所以f^{-1}(2)=1。因此，f^{-1}(f^{-1}(6))=f^{-1}(2)=1。

不过，为了确保我的计算是正确的，我再仔细检查一遍。首先，f(1)=2

In [71]:
correct_dataset[dataset_pos]

{'problem': 'Suppose that $f$ is a function and $f^{-1}$ is the inverse of $f$.  If $f(1)=2$, $f(2) = 6$, and $f(3)=5$, then what is $f^{-1}(f^{-1}(6))$?',
 'solution': 'Since $f(2) = 6$, we have $f^{-1}(6)=2$. (Note that the hypothesis that $f$ has an inverse implies that there are no other values of $x$ with $f(x) = 6$.)  Similarly, $f(1) =2$ implies $f^{-1}(2)=1$.   So $f^{-1}(f^{-1}(6))=f^{-1}(2)=\\boxed{1}$.',
 'answer': '1',
 'subject': 'Algebra',
 'level': 4,
 'unique_id': 'test/algebra/1199.json',
 'model_answer': "Okay, so I have this problem here: There's a function f, and its inverse function is f inverse. I know that f(1) = 2, f(2) = 6, and f(3) = 5. I need to find f inverse of f inverse of 6, which is written as f^{-1}(f^{-1}(6)). Hmm, okay. Let me think about how to approach this.\n\nFirst, let me recall what an inverse function does. If f is a function that maps an input to an output, then the inverse function f^{-1} maps the output back to the input. So, basically, if f

In [73]:
next_true_tokens.shape[1]

200

In [81]:
generated_ids_compressed.shape[1], next_true_tokens.shape[
    1
], generated_ids_compressed.shape[1] + compressed_part.shape[1], torch.tensor(
    generated_ids
).shape

(725, 200, 825, torch.Size([1, 1643]))

In [None]:
print(tokenizer.decode(generated_ids[-1][:925]))

In [19]:
end_think = tokenizer.encode("</think>", add_special_tokens=False)[0]
generated_ids[-1].index(end_think)

1512

In [None]:
print(tokenizer.decode(generated_ids_compressed[-1]))

In [11]:
tokenizer.decode(next_true_tokens[-1])

"Okay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards it at 24 mph. So, their relative speed is the difference between their speeds because they're moving towards each other. Wait, no, actually, since the elephant is moving away, the lion has to cover the distance that the elephant is moving away plus the distance the elephant covers while the lion is moving towards it.\n\nLet me clarify. The lion is moving towards the elephant at 24 mph, and the elephant is moving away at 19 mph. So, the lion is closing the gap at a rate of 24 mp

In [12]:
tokenizer.decode(generated_ids_compressed[-1])

" think about it in terms of relative velocity. The lion is moving at 24 mph towards the elephant, which is moving away at 19 mph. So, the lion's speed relative to the elephant is 24 - 19 = 5 mph. So, the lion needs to cover 1 mile at a relative speed of 5 mph, which is 1/5 hours, or 12 minutes. Yep, same answer.\n\nWait, another way to think about it is to set up equations for their positions over time. Let's assume the lion starts at position 0, and the elephant starts at position 1 mile. The lion is moving towards the elephant at 24 mph, so its position at time t is 24t. The elephant is moving away from the lion at 19 mph, so its position at time t is 1 + 19t. The lion catches up when their positions are equal, so:\n\n24t = 1 + 19t\n\nSubtract 19t from both sides:\n\n5t = 1\n\nSo, t = 1/5 hours, which is 12 minutes. Yep, same result.\n\nI think I've approached this problem from multiple angles now, and each time I get the same answer: 12 minutes. So, I feel confident that 12 minutes

In [50]:
compressed_part.shape

torch.Size([1, 200, 1536])

In [55]:
print(tokenizer.decode(torch.tensor(generated_ids[-1])))

Okay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.

First, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards it at 24 mph. So, their relative speed is the difference between their speeds because they're moving towards each other. Wait, no, actually, since the elephant is moving away, the lion has to cover the distance that the elephant is moving away plus the distance the elephant covers while the lion is moving towards it.

Let me clarify. The lion is moving towards the elephant at 24 mph, and the elephant is moving away at 19 mph. So, the lion is closing the gap at a rate of 24 mph min

In [8]:
tokenizer.decode(torch.tensor(generated_ids[-1]))

"Okay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards it at 24 mph. So, their relative speed is the difference between their speeds because they're moving towards each other. Wait, no, actually, since the elephant is moving away, the lion has to cover the distance that the elephant is moving away plus the distance the elephant covers while the lion is moving towards it.\n\nLet me clarify. The lion is moving towards the elephant at 24 mph, and the elephant is moving away at 19 mph. So, the lion is closing the gap at a rate of 24 mp

In [None]:
# " think about it in terms of relative velocity. The lion is moving at 24 mph towards the elephant, which is moving away at 19 mph. So, the lion's speed relative to the elephant is 24 - 19 = 5 mph. So, the lion needs to cover 1 mile at a relative speed of 5 mph, which is 1/5 hours, or 12 minutes. Yep, same answer.\n\nWait, another way to think about it is to set up equations for their positions over time. Let's assume the lion starts at position 0, and the elephant starts at position 1 mile. The lion is moving towards the elephant at 24 mph, so its position at time t is 24t. The elephant is moving away from the lion at 19 mph, so its position at time t is 1 + 19t. The lion catches up when their positions are equal, so:\n\n24t = 1 + 19t\n\nSubtract 19t from both sides:\n\n5t = 1\n\nSo, t = 1/5 hours, which is 12 minutes. Yep, same result.\n\nI think I've approached this problem from multiple angles now, and each time I get the same answer: 12 minutes. So, I feel confident that 12 minutes is the correct answer.\n\nJust to make sure, let me think about the units. The speeds are in miles per hour, and the distance is in miles. So, when I divide 1 mile by 5 mph, the units are hours, which is correct. Then, converting hours to minutes by multiplying by 60 gives me 12 minutes. Perfect.\n\nI don't think I made any unit conversion errors here. The speeds are in mph, the distance is in miles, so the time comes out in hours, which I can convert to minutes. So, that seems right.\n\nAnother way to check is to think about how much distance the lion covers in 12 minutes. 12 minutes is 0.2 hours. So, 24 mph * 0.2 hours = 4.8 miles. The elephant is moving away at 19 mph, so in 12 minutes, it moves 19 * 0.2 = 3.8 miles. So, the elephant is 3.8 miles away from the starting point, and the lion is 4.8 miles from the starting point. The distance between them is 4.8 - 3.8 = 1 mile, which is the initial distance. So, that checks out.\n\nSimilarly, if I think about the elephant's speed relative to the lion, it's 19 mph, and the lion is closing the gap at 5 mph. So, in 12 minutes, the lion closes 1 mile, which is exactly the initial distance. So, that also makes sense.\n\nI think I've covered all the angles here. I can't think of another way to approach this problem, and all the methods lead to the same conclusion. So, I'm pretty sure that the answer is 12 minutes.\n\n**Final Answer**\nThe lion will catch the elephant in \\boxed{12} minutes.\n</think>\n\nThe elephant and the lion are initially 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs towards the elephant at 24 miles per hour. To determine how long it will take for the lion to catch the elephant, we need to consider their relative speed.\n\nThe relative speed at which the lion is closing the gap is the difference between their speeds:\n\\[ 24 \\text{ mph} - 19 \\text{ mph} = 5 \\text{ mph} \\]\n\nThe time it takes for the lion to catch the elephant can be calculated by dividing the initial distance by the relative speed:\n\\[ \\text{Time} = \\frac{\\text{Distance}}{\\text{Relative Speed}} = \\frac{1 \\text{ mile}}{5 \\text{ mph}} = 0.2 \\text{ hours} \\]\n\nConverting 0.2 hours to minutes:\n\\[ 0.2 \\text{ hours} \\times 60 \\text{ minutes per hour} = 12 \\text{ minutes} \\]\n\nThus, the lion will catch the elephant in \\boxed{12} minutes.<｜end▁of▁sentence｜>"

# "Okay, so I have this problem where an elephant and a lion are 1 mile apart. The elephant is running directly away from the lion at 19 miles per hour, and the lion is running towards the elephant at 24 miles per hour. I need to figure out how many minutes it will take for the lion to catch the elephant. Hmm, let me think about how to approach this.\n\nFirst, I know that both the elephant and the lion are moving towards or away from each other. The elephant is moving away at 19 mph, and the lion is moving towards it at 24 mph. So, their relative speed is the difference between their speeds because they're moving towards each other. Wait, no, actually, since the elephant is moving away, the lion has to cover the distance that the elephant is moving away plus the distance the elephant covers while the lion is moving towards it.\n\nLet me clarify. The lion is moving towards the elephant at 24 mph, and the elephant is moving away at 19 mph. So, the lion is closing the gap at a rate of 24 mph minus 19 mph, which is 5 mph. That makes sense because if two objects are moving towards each other, their relative speed is the sum of their speeds, but in this case, one is moving away and the other is moving towards, so it's the difference.\n\nSo, the initial distance between them is 1 mile. The lion is closing the gap at 5 mph. To find the time it takes to catch up, I can use the formula:\n\nTime = Distance / Speed\n\nSo, plugging in the numbers, the time should be 1 mile divided by 5 mph. That gives me 0.2 hours. But the question asks for the time in minutes, so I need to convert 0.2 hours to minutes. Since 1 hour is 60 minutes, 0.2 hours is 0.2 * 60 = 12 minutes.\n\nWait, let me double-check that. If the lion is moving at 24 mph and the elephant is moving away at 19 mph, the relative speed is 24 - 19 = 5 mph. So, yes, the lion is gaining on the elephant at 5 mph. So, 1 mile divided by 5 mph is indeed 0.2 hours, which is 12 minutes. That seems right.\n\nBut just to make sure I didn't make a mistake, let me think about it another way. Maybe set up an equation for their positions as functions of time and see if I get the same result.\n\nLet's denote t as the time in hours it takes for the lion to catch the elephant. In that time, the elephant will have moved a distance of 19t miles away from the starting point, and the lion will have moved 24t miles towards the elephant. Since they start 1 mile apart, the distance between them when the lion catches the elephant will be zero.\n\nSo, the distance the lion covers plus the distance the elephant covers should equal the initial distance between them. Wait, no, actually, the lion is moving towards the elephant, so the distance the lion covers is 24t, and the distance the elephant covers is 19t. But since the elephant is moving away, the total distance between them when the lion catches up is 24t - 19t = 5t. This should equal the initial distance, which is 1 mile.\n\nSo, 5t = 1 mile. Solving for t, we get t = 1/5 hours, which is 0.2 hours. Converting that to minutes, 0.2 * 60 = 12 minutes. Yep, same result. So, that seems consistent.\n\nAlternatively, maybe I can think about it in terms of how much distance the lion needs to cover relative to the elephant. Since the lion is moving faster, it's gaining on the elephant at 5 mph. So, the lion needs to cover the 1 mile gap at a relative speed of 5 mph. So, time = distance / speed = 1 / 5 hours, which is 12 minutes. Yep, same answer.\n\nI think that's solid. So, the lion will catch the elephant in 12 minutes.\n\n**Final Answer**\nThe lion will catch the elephant in \\boxed{12} minutes.\n</think>\n\nThe elephant and the lion are initially 1 mile apart. The elephant runs directly away from the lion at 19 miles per hour, while the lion runs directly towards the elephant at 24 miles per hour. \n\nTo find the time it takes for the lion to catch the elephant, we first determine their relative speed. Since the lion is moving towards the elephant and the elephant is moving away, their relative speed is the difference between their speeds:\n\n\\[\n24 \\text{ mph} - 19 \\text{ mph} = 5 \\text{ mph}\n\\]\n\nThe initial distance between them is 1 mile. Using the formula for time, which is distance divided by speed, we get:\n\n\\[\n\\text{Time} = \\frac{1 \\text{ mile}}{5 \\text{ mph}} = 0.2 \\text{ hours}\n\\]\n\nConverting 0.2 hours to minutes:\n\n\\[\n0.2 \\text{ hours} \\times 60 \\text{ minutes per hour} = 12 \\text{ minutes}\n\\]\n\nThus, the lion will catch the elephant in \\boxed{12} minutes.<｜end▁of▁sentence｜>"

In [9]:
next_true_tokens.shape

torch.Size([1, 400])

In [39]:
start_embed.shape

torch.Size([1, 1, 1536])

In [38]:
new_embeds_for_compression.shape

torch.Size([4, 2, 224256])

In [None]:
from hidden_capacity_reasoning.utils import WINDOW_SIZE, VISION_START, VISION_END
from transformers.cache_utils import DynamicCache


def _crop_past_key_values(model, past_key_values, max_length):
    """Crops the past key values up to a certain maximum length."""
    new_past = []
    if model.config.is_encoder_decoder:
        for idx in range(len(past_key_values)):
            new_past.append(
                (
                    past_key_values[idx][0][:, :, :max_length, :],
                    past_key_values[idx][1][:, :, :max_length, :],
                    past_key_values[idx][2],
                    past_key_values[idx][3],
                )
            )
        past_key_values = tuple(new_past)
    # gptbigcode is special and stores kv in shape (batch_size, seq_len, dim), if it's a multi_query model
    elif "gptbigcode" in model.__class__.__name__.lower() or (
        model.config.architectures is not None
        and "gptbigcode" in model.config.architectures[0].lower()
    ):
        if model.config.multi_query:
            for idx in range(len(past_key_values)):
                past_key_values[idx] = past_key_values[idx][:, :max_length, :]
        else:
            for idx in range(len(past_key_values)):
                past_key_values[idx] = past_key_values[idx][:, :, :max_length, :]
    elif isinstance(past_key_values, DynamicCache):
        past_key_values.crop(max_length)
    elif past_key_values is not None:
        for idx in range(len(past_key_values)):
            if past_key_values[idx] != ([], []):
                new_past.append(
                    (
                        past_key_values[idx][0][:, :, :max_length, :],
                        past_key_values[idx][1][:, :, :max_length, :],
                    )
                )
            else:
                new_past.append((past_key_values[idx][0], past_key_values[idx][1]))
        past_key_values = tuple(new_past)
    return past_key_values


# model = trainer.model
generated_tokens = tokenizer.apply_chat_template(
    [
        # {"role": "user", "content": "how many wings has a bird?"},
        {"role": "user", "content": dataset["test"].to_list()[:5][0]["question"]},
    ],
    tokenize=True,
    add_generation_prompt=True,
)

with torch.no_grad(), torch.autocast(device_type="cuda"):
    start_embed = model.base_model.embed_pooler.model.get_input_embeddings()(
        torch.tensor([[VISION_START]], device="cuda")
    )
    end_embed = model.base_model.embed_pooler.model.get_input_embeddings()(
        torch.tensor([[VISION_END]], device="cuda")
    )
    generated_tokens = torch.tensor(generated_tokens).unsqueeze(0).cuda()
    generated_embeds = model.get_input_embeddings()(generated_tokens)
    temp_gen_size = 0
    window_size = WINDOW_SIZE  # + 1
    # new_tokens = 4
    new_tokens = 1
    generation_started = False
    max_steps = (new_tokens + window_size) * 15
    past_key_values_big = None
    print("generated_embeds", generated_embeds.shape)
    for step in range(max_steps):
        if temp_gen_size == window_size + new_tokens:
            # print(
            #     "TOKENS FOR EMDED",
            #     tokenizer.decode(
            #         generated_tokens[:, -(window_size + new_tokens) :][:, :WINDOW_SIZE]
            #         .cpu()
            #         .tolist()[0]
            #     ),
            # )
            # tokenizer.decode(generated_tokens[:, : -window_size ].cpu().tolist()[0])
            if hasattr(model.base_model, "embed_pooler"):
                new_embeds_for_compression = (
                    model.base_model.embed_pooler.model.get_input_embeddings()(
                        generated_tokens[:, -(window_size + new_tokens) :][
                            :, :WINDOW_SIZE
                        ]
                    )
                ).to(torch.bfloat16)
                compressed_part = model.base_model.embed_pooler(
                    new_embeds_for_compression
                )
            else:
                compressed_part = model.embed_pooler(new_embeds_for_compression)
            # gen_embeds_prev = generated_tokens.shape[1]
            if generation_started:
                # past_key_values_big = _crop_past_key_values(
                #     model=model,
                #     past_key_values=past_key_values_big,
                #     max_length=generated_embeds.shape[1] - new_tokens - 2,
                # )
                generated_embeds = torch.cat(
                    [
                        generated_embeds[:, : -(window_size + new_tokens + 1)],
                        # generated_embeds[:, : -(window_size + new_tokens)],
                        compressed_part,
                        # torch.randn(1, 1, 1536, device="cuda"),
                        end_embed,
                        generated_embeds[:, -new_tokens:],
                    ],
                    dim=1,
                )
            else:
                # past_key_values_big = _crop_past_key_values(
                #     model=model,
                #     past_key_values=past_key_values_big,
                #     max_length=generated_embeds.shape[1] - new_tokens - 3,
                # )
                generated_embeds = torch.cat(
                    [
                        generated_embeds[:, : -(window_size + new_tokens)],
                        start_embed,
                        # torch.randn(1, 1, 1536, device="cuda"),
                        compressed_part,
                        end_embed,
                        generated_embeds[:, -new_tokens:],
                    ],
                    dim=1,
                )
                generation_started = True
            past_key_values_big = _crop_past_key_values(
                model=model,
                past_key_values=past_key_values_big,
                max_length=generated_embeds.shape[1] - new_tokens - 2,
            )
            temp_gen_size = 1

        outputs = model(
            inputs_embeds=generated_embeds,
            past_key_values=past_key_values_big,
            # use_cache=False,
        )
        logits = outputs.logits
        past_key_values_big = outputs.past_key_values
        top_token = logits.argmax(-1)[-1][-1]
        top_token_embed = model.get_input_embeddings()(top_token)
        # print(top)
        generated_tokens = torch.cat([generated_tokens, top_token.reshape(1, 1)], dim=1)

        generated_embeds = torch.cat(
            [generated_embeds, top_token_embed.reshape(1, 1, -1)], dim=1
        )
        # print(temp_gen_size, tokenizer.decode(generated_tokens[-1]))

        temp_gen_size += 1

print(tokenizer.decode(generated_tokens[-1]))

# break

generated_embeds torch.Size([1, 62, 1536])
<｜begin▁of▁sentence｜><｜User｜>Here's a question: What do many people believe happens after you die?  Here are possible answers to this question: - stop moving - nothing - go to heaven - stop living - stop breathing  I believe the correct choice is "go to heaven", here's why:
Answer:<｜Assistant｜><think>
Okay, so I'm trying to figure out the answer is correct. Let me think about it again. I think the user is trying to figure out the correct answer to the question they're asking. They provided a list of answers, and I need to heaven, but I'm not sure if I'm sure if I'm on the right track. Let me break it down step by step. The question is about what people believe happens after you die. I know that when someone dies, they believe that people often believe in something called the afterlife, which is the belief that after you die, you don't need to move or anything else. So, the correct answer is "go to heaven", which is the correct answer. The othe

# я не знаю, может быть я неправильно управляюсь с KV-cache

In [18]:
generated_tokens.shape

torch.Size([1, 227])

In [35]:
new_embeds_for_compression.shape

torch.Size([1, 10, 1536])

In [34]:
[1, 2, 3, 4, 5, 6, 7, 8][:-2]

[1, 2, 3, 4, 5, 6]

In [None]:
print(dataset["test"][0]["answer"])

In [29]:
generated_embeds.shape

torch.Size([1, 100, 1536])

In [28]:
generated_tokens.shape

torch.Size([1, 227])

### Тестируем на MATH-500

In [82]:
len(correct_dataset)

155

In [None]:
from transformers import AutoTokenizer
from hidden_capacity_reasoning.utils import (
    WINDOW_SIZE,
    VISION_START,
    VISION_END,
    EOS_TOKEN_ID,
)
import torch
import json
from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)
from tqdm.notebook import tqdm

correct_items = 0
torch.manual_seed(0)
base_prompt = open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
).read()
tokenizer = AutoTokenizer.from_pretrained(model_name)

evaluation_dataset = []

for dataset_pos in tqdm(range(len(correct_dataset))):
    # dataset_pos = 11
    input_ids = correct_dataset[dataset_pos]["problem"]
    # print(input_ids)
    # print("===")
    # print("===")

    input_ids = [
        tokenizer.apply_chat_template(
            [
                {
                    "role": "user",
                    "content": base_prompt.format(question=input_ids),
                },
            ],
            tokenize=True,
            add_generation_prompt=True,
        )
    ]

    # input_ids = dataset_item["input_ids"]
    # generated_ids = dataset_item["generated_ids"]
    generated_ids = [
        tokenizer.encode(
            correct_dataset[dataset_pos]["model_answer"],
            add_special_tokens=False,
        )
    ]

    device = "cuda"

    with torch.no_grad():
        start_embed = model.base_model.embed_pooler.model.get_input_embeddings()(
            torch.tensor([[VISION_START]], device="cuda")
        )
        end_embed = model.base_model.embed_pooler.model.get_input_embeddings()(
            torch.tensor([[VISION_END]], device="cuda")
        )
        input_ids = torch.tensor(input_ids).cuda()
        input_ids_embeds = model.get_input_embeddings()(input_ids)
        # windows_amount = 100
        # windows_amount = 200
        # windows_amount = 300
        windows_amount = 400
        # windows_amount = 500
        # windows_amount = 2
        generated_tokens_amount = WINDOW_SIZE * windows_amount
        original_total_len = torch.tensor(generated_ids).shape[1]
        if generated_tokens_amount > original_total_len:
            windows_amount = original_total_len // WINDOW_SIZE
            generated_tokens_amount = WINDOW_SIZE * windows_amount

        next_true_tokens = torch.tensor(generated_ids, device="cuda")[
            :, :generated_tokens_amount
        ]

        # next_true_tokens = torch.tensor(next_true_tokens, device="cuda")

        original_embeds = (model.get_input_embeddings()(next_true_tokens)).to(
            torch.bfloat16
        )

        # compressed_part = model.base_model.embed_pooler(new_embeds_for_compression)
        new_embeds_for_compression = original_embeds.reshape(
            windows_amount, WINDOW_SIZE, -1
        )
        compressed_part = model.base_model.embed_pooler(new_embeds_for_compression)
        compressed_part = compressed_part.reshape(1, windows_amount, -1)
        # start_embed = torch.rand_like(start_embed)
        # compressed_part = torch.rand_like(compressed_part)
        # end_embed = torch.rand_like(end_embed)
        generated_embeds = torch.cat(
            [
                input_ids_embeds,
                start_embed,
                compressed_part,
                end_embed,
            ],
            dim=1,
        )
        # print("COMPRESSED PART", tokenizer.decode(next_true_tokens[-1]))
        # print("===")
        generated_ids_compressed = model.generate(
            inputs_embeds=generated_embeds,
            # attention_mask=torch.ones_like(generated_embeds),
            attention_mask=torch.ones(
                generated_embeds.shape[:2],
                device="cuda",
            ).long(),
            max_new_tokens=4096,
            # max_new_tokens=5,
            do_sample=False,
            # do_sample=True,
            # temperature=0.6,
            # top_p=0.95,
        )
        # break
    generated_result = tokenizer.decode(generated_ids_compressed[-1])
    # print()
    gold_answer = correct_dataset[dataset_pos]["answer"]
    answer = dataset_answer_filter(gold_answer)
    model_answer = model_answer_filter(generated_result)
    if is_equiv(answer, model_answer):
        correct_items += 1
        print("CORRECT")
    else:
        print("WRONG", gold_answer)
        print(generated_result)
    compressed_total_len = generated_ids_compressed.shape[1] + compressed_part.shape[1]
    print(
        generated_ids_compressed.shape[1],
        next_true_tokens.shape[1],
        compressed_total_len,
        torch.tensor(generated_ids).shape[1],
    )
    evaluation_dataset.append(
        {
            **correct_dataset[dataset_pos],
            "compressed_input_part": tokenizer.decode(next_true_tokens[-1]),
            "compressed_output_generation": generated_result,
            "compressed_compression_size": generated_tokens_amount,
            "original_total_len": torch.tensor(generated_ids).shape[1],
            "compressed_total_len": compressed_total_len,
        }
    )
    print("===")
    print("===")
    print("===")
    # break
    # embeds_generation_tokens = generated_tokens[-1]

  0%|          | 0/136 [00:00<?, ?it/s]

WRONG 12
 1 mile.

Let me define the position of the lion as a function of time: \( P_{lion}(t) = 0 + 24t \).

The position of the elephant, moving away from the lion, is \( P_{elephant}(t) = 1 - 19t \).

They will meet when \( P_{lion}(t) = P_{elephant}(t) \).

So, setting them equal:

\( 24t = 1 - 19t \)

Combine like terms:

\( 24t + 19t = 1 \)

\( 43t = 1 \)

\( t = 1 / 43 \) hours.

Wait, hold on, this is different from before. Earlier, I thought it was 1/5 hours, which is 12 minutes, but now I'm getting 1/43 hours. Which one is correct?

Wait, let me check my equations.

Position of lion: starting at 0, moving towards positive direction at 24 mph. So, \( P_{lion}(t) = 24t \).

Position of elephant: starting at 1, moving away from lion towards negative direction at 19 mph. So, \( P_{elephant}(t) = 1 - 19t \).

Setting them equal: \( 24t = 1 - 19t \).

So, \( 24t + 19t = 1 \)

\( 43t = 1 \)

\( t = 1 / 43 \) hours.

Wait, so this is 1 divided by 43 hours. Let me convert that to min

In [8]:
generated_embeds.shape[:2]

torch.Size([1, 656])

In [12]:
torch.ones(
    generated_embeds.shape[:2],
    device="cuda",
).long()

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1

In [None]:
original_embeds.shape

torch.Size([1, 719, 1536])

In [None]:
print(tokenizer.decode(next_true_tokens[-1]))

In [4]:
len(correct_dataset) / len(dataset), correct_items / len(dataset)

(0.8908045977011494, 0.7816091954022989)

In [None]:
# model.base_model_prefix

'model'

In [18]:
evaluation_dataset[0]

{'problem': 'Below is a magic square, meaning that the sum of the numbers in each row, in each column, and in each of the $2$ main diagonals are equal. What is the value of $n$?\n\n[asy]size(125);\nfor(int i = 0; i<4; ++i)\n{\n\ndraw((0,i)--(3,i),linewidth(1));\n}\n\nfor(int j = 0; j<4; ++j)\n{\n\ndraw((j,0)--(j,3),linewidth(1));\n}\n\nlabel("$n-3$",(.5,.5));\nlabel("3",(.5,1.5));\nlabel("$n+1$",(.5,2.5));\n\nlabel("$n+2$",(1.5,.5));\nlabel("$2n-9$",(1.5,1.5));\nlabel("$1$",(1.5,2.5));\n\nlabel("$2$",(2.5,.5));\nlabel("$n$",(2.5,1.5));\nlabel("$n-1$",(2.5,2.5));\n[/asy]',
 'solution': 'First, we can evaluate the sum across the first row, which gives $(n+1)+1+(n-1)=2n+1$.  Evaluate the sum of the entries across the second row, $3+(2n-9)+n=3n-6$. Now, since we have a magic square, these two sums are equal.  So $2n+1=3n-6$. Isolating $n$, we obtain $n = \\boxed{7}$.\n\nThe square will look like: [asy] size(2cm);\ndraw((0,0)--(3,0)--(3,3)--(0,3)--cycle,linewidth(1));\ndraw((1,0)--(1,3),lin

In [5]:
import json

base_path = "hidden_capacity_reasoning/evaluation/math_500/evals/compression_tests"
save_id = f"test_correct_dataset_{len(correct_dataset)}_compressed_window_amount=400,window=2__2025_04_22_01_43_43_347583__10000"
with open(f"{base_path}/{save_id}.json", "w") as f:
    json.dump(evaluation_dataset, f)

In [27]:
import json

with open(
    f"hidden_capacity_reasoning/evaluation/math_500/evals/compression_tests/correct_dataset_155_compressed_window_amount=400,window=2__2025_04_22_01_43_43_347583__90000.json",
    "r",
) as f:
    evaluation_dataset = json.load(f)

In [6]:
original_total_len = 0
compressed_total_len = 0
for item in evaluation_dataset:
    original_total_len += item["original_total_len"]
    compressed_total_len += item["compressed_total_len"]
original_total_len, compressed_total_len, compressed_total_len / original_total_len

(350015, 338450, 0.9669585589189035)