In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map={"": 0},
    attn_implementation="sdpa",
)
# model = model.eval()
model.requires_grad_(False)

from datasets import load_dataset

dataset = load_dataset(
    # "dim/hendrycks_math_train_12k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096"
    # "dim/hendrycks_math_test_500_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096_greedy"
    # "dim/hendrycks_math_train_1k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096_greedy"
    "dim/hendrycks_math_test_500_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096_greedy"
)

dataset = dataset["train"].train_test_split(
    # test_size=250,
    test_size=350,
    # test_size=999,
    # test_size=1,
    seed=42,
)
dataset = dataset["test"].filter(lambda x: x["model_answer"].count("</think>") == 1)

from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)

correct_dataset = []

for pos, item in enumerate(dataset):
    try:
        answer = dataset_answer_filter(item["answer"])
        model_answer = model_answer_filter(item["model_answer"])
        # print(answer, model_answer)
        # break
        if is_equiv(answer, model_answer):
            correct_dataset.append(item)
    except:
        pass

print(len(dataset), len(correct_dataset), len(correct_dataset) / len(dataset))

correct_dataset = correct_dataset[:30]
len(correct_dataset)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
224 202 0.9017857142857143


30

## Обучение по чанкам в цикле

In [5]:
import torch

from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)
from tqdm.notebook import tqdm
from tqdm import tqdm as text_tqdm
from hidden_capacity_reasoning.utils import (
    tokenize_single_turn,
    EOS_TOKEN_ID,
    END_THINK_ID,
)

torch.manual_seed(0)
base_prompt = open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
).read()

max_new_tokens = 100
compression_tokens_amount = 2
max_total_tokens = 4096
max_total_steps = max_total_tokens // max_new_tokens + 1

evaluation_dataset = []
correct_items = 0
model.generation_config.pad_token_id = tokenizer.pad_token_id

for dataset_pos in tqdm(range(len(correct_dataset))):
    # for dataset_pos in tqdm(range(1, len(correct_dataset))):
    tokenized_turn = tokenize_single_turn(
        question=base_prompt.format(question=correct_dataset[dataset_pos]["problem"]),
        answer=correct_dataset[dataset_pos]["model_answer"],
        tokenizer=tokenizer,
    )
    for key in tokenized_turn.keys():
        tokenized_turn[key] = torch.tensor(tokenized_turn[key])

    device = "cuda"

    content_compression_mask = tokenized_turn["content_compression_mask"]

    input_part_end = (content_compression_mask == 0).nonzero()[-3][0]
    # get only question part
    question_input_ids = (
        tokenized_turn["input_ids"][: int(input_part_end) + 1].unsqueeze(0).cuda()
    )
    # print(tokenizer.decode(question_input_ids[-1]))

    ######## start loop generation
    ########
    compression_loop = True
    input_ids_embeds = model.get_input_embeddings()(question_input_ids)
    compression_part = torch.tensor([[0]])
    generated_ids_new = None
    generated_embeds = None
    generated_embeds_prev = None
    generated_ids_new_prev = None
    end_of_think = False
    total_generated_text = ""

    for compression_step in text_tqdm(range(max_total_steps)):
        ######## generate new tokens
        ########
        inputs_embeds = None
        with torch.no_grad():

            if compression_part.shape[1] >= compression_tokens_amount:
                generated_embeds_prev = generated_embeds[
                    :, -(max_new_tokens // 2) :, :
                ].clone()
                inputs_embeds = torch.cat(
                    [
                        input_ids_embeds,
                        compression_part,
                        generated_embeds_prev,
                    ],
                    dim=1,
                )
            else:
                # first time generation
                inputs_embeds = torch.cat(
                    [
                        input_ids_embeds,
                    ],
                    dim=1,
                )
            generated_ids_new = model.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=torch.ones(
                    inputs_embeds.shape[:2],
                    device="cuda",
                ).long(),
                max_new_tokens=max_new_tokens,
                do_sample=False,
                use_cache=compression_step > 0,
            )
            # break
        generated_result = tokenizer.decode(generated_ids_new[-1])
        # print(generated_result)
        total_generated_text += generated_result
        print("=" * 50)
        generated_embeds = model.get_input_embeddings()(generated_ids_new)
        if END_THINK_ID in generated_ids_new[-1].tolist():
            end_of_think = True
            break

        ########
        ######## get original language loss
        ########
        labels = None
        if compression_part.shape[1] >= compression_tokens_amount:
            labels = torch.cat(
                [
                    question_input_ids.cuda(),
                    ((torch.ones(compression_part.shape[:2]) * -100).long()).cuda(),
                    (
                        (torch.ones(generated_embeds_prev.shape[:2]) * -100).long()
                    ).cuda(),
                    generated_ids_new.cuda(),
                ],
                dim=1,
            )
        else:
            # first time generation
            labels = torch.cat(
                [
                    question_input_ids.cuda(),
                    generated_ids_new.cuda(),
                ],
                dim=1,
            )

        question_content_mask = content_compression_mask[
            : int(input_part_end) + 1
        ].clone()
        question_content_mask[question_content_mask == 0] = 4
        question_content_mask[question_content_mask == 1] = 0
        question_content_mask[question_content_mask == 4] = 1

        if compression_part.shape[1] >= compression_tokens_amount:
            train_content_mask_new = torch.cat(
                [
                    question_content_mask,
                    torch.zeros(compression_part.shape[1]),
                    torch.zeros(generated_embeds_prev.shape[1]),
                    torch.zeros(generated_ids_new.shape[1] // 2),
                    torch.ones(generated_ids_new.shape[1] // 2),
                ]
            ).long()
        else:
            train_content_mask_new = torch.cat(
                [
                    question_content_mask,
                    torch.ones(generated_ids_new.shape[1] // 2) * 0,
                    torch.ones(generated_ids_new.shape[1] // 2),
                ]
            ).long()

        generated_ids_new_prev = generated_ids_new.clone()
        # generated_embeds = model.get_input_embeddings()(generated_ids_new)

        new_input_embeds = None
        if compression_part.shape[1] >= compression_tokens_amount:
            new_input_embeds = torch.cat(
                [
                    input_ids_embeds.cuda(),
                    compression_part.cuda(),
                    generated_embeds_prev.cuda(),
                    generated_embeds,
                ],
                dim=1,
            )
        else:
            new_input_embeds = torch.cat(
                [
                    input_ids_embeds,
                    generated_embeds,
                ],
                dim=1,
            )

        labels[:, train_content_mask_new == 0] = -100

        with torch.no_grad():
            original_loss = model(
                inputs_embeds=new_input_embeds,
                labels=labels,
            ).loss
        print("original_loss", original_loss)
        ########
        ######## generate compress embeddings
        ########
        compression_tensor = torch.nn.Parameter(
            torch.rand_like(
                new_input_embeds[:, :compression_tokens_amount, :],
            )
            * model.get_input_embeddings().weight.data.std(),
            requires_grad=True,
        )

        question_labels = question_input_ids.clone().cuda()
        question_labels[0][question_content_mask == 0] = -100
        compression_tensor_labels = (
            (torch.ones(compression_tensor.shape[:2]) * -100).long().cuda()
        )
        if compression_part.shape[1] >= compression_tokens_amount:
            compression_part_labels = (
                (torch.ones(compression_part.shape[:2]) * -100).long().cuda()
            )
            compressed_labels = torch.cat(
                [
                    question_labels,
                    compression_part_labels,
                    compression_tensor_labels,
                    generated_ids_new[:, -(max_new_tokens // 2) :],
                ],
                dim=-1,
            )
        else:
            compressed_labels = torch.cat(
                [
                    question_labels,
                    compression_tensor_labels,
                    generated_ids_new[:, -(max_new_tokens // 2) :],
                ],
                dim=-1,
            )

        ########
        ######## train
        ########
        epoch_amount = 100

        optimizer = torch.optim.Adam([compression_tensor], lr=0.1)
        acclumulation_steps = 1
        for epoch in range(epoch_amount):
            if compression_part.shape[1] >= compression_tokens_amount:
                compressed_inputs_embeds = torch.cat(
                    [
                        input_ids_embeds.detach(),
                        compression_part.detach(),
                        compression_tensor,
                        generated_embeds[:, -(max_new_tokens // 2) :, :].detach(),
                    ],
                    dim=1,
                )
            else:
                compressed_inputs_embeds = torch.cat(
                    [
                        input_ids_embeds.detach(),
                        compression_tensor,
                        generated_embeds[:, -(max_new_tokens // 2) :, :].detach(),
                    ],
                    dim=1,
                )
            compression_loss = model(
                inputs_embeds=compressed_inputs_embeds,
                labels=compressed_labels,
            ).loss
            compression_loss.backward()
            if (epoch + 1) % acclumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            if compression_loss.item() <= original_loss.item():
                break
            # if compression_loss.item() <= (original_loss.item() + 0.01):
            #     break
            # if (compression_loss.item() + 0.05) <= original_loss.item():
            #     break
        print("compression_loss", compression_loss)
        if compression_part.shape[1] >= compression_tokens_amount:
            compression_part = torch.cat(
                [
                    compression_part,
                    compression_tensor.detach(),
                ],
                dim=1,
            )
        else:
            compression_part = compression_tensor.detach()
    # if end_of_think:
    inputs_embeds = torch.cat(
        [
            inputs_embeds,
            generated_embeds,
        ],
        dim=1,
    )
    final_response = model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=torch.ones(
            inputs_embeds.shape[:2],
            device="cuda",
        ).long(),
        max_new_tokens=(max_total_steps - compression_step) * max_new_tokens,
        do_sample=False,
    )
    final_answer = tokenizer.decode(final_response[-1])
    total_generated_text += final_answer
    # print("FINAL ANSWER", final_answer)

    gold_answer = correct_dataset[dataset_pos]["answer"]
    answer = dataset_answer_filter(gold_answer)
    # print("GOLD ANSWER", answer)
    model_answer = model_answer_filter(total_generated_text)
    if is_equiv(answer, model_answer):
        correct_items += 1
        print("CORRECT")
    else:
        print("WRONG", gold_answer)
        print(total_generated_text)

    compressed_total_len = inputs_embeds.shape[1] + final_response.shape[1]
    total_generated_tokens = final_response.shape[1] + max_new_tokens * (
        compression_step + 1
    )
    original_total_len = len(
        tokenizer.encode(
            correct_dataset[dataset_pos]["model_answer"],
            add_special_tokens=False,
        )
    )
    print(
        f"вопрос+сжатые+сгенерированные={compressed_total_len}, всего_сгенерированно_токенов={total_generated_tokens} оригинальная_генерация={original_total_len}"
    )
    evaluation_dataset.append(
        {
            "original_total_len": original_total_len,
            "compressed_total_len": compressed_total_len,
        }
    )
    # break

  0%|          | 0/30 [00:00<?, ?it/s]



original_loss tensor(0.8431, device='cuda:0')




compression_loss tensor(0.8421, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7182, device='cuda:0')




compression_loss tensor(0.8334, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7118, device='cuda:0')




compression_loss tensor(0.7108, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7083, device='cuda:0')




compression_loss tensor(0.7577, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7250, device='cuda:0')




compression_loss tensor(0.8283, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7804, device='cuda:0')




compression_loss tensor(0.7789, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7775, device='cuda:0')




compression_loss tensor(0.7762, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7640, device='cuda:0')




compression_loss tensor(0.8074, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8841, device='cuda:0')




compression_loss tensor(0.8821, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7429, device='cuda:0')




compression_loss tensor(0.7387, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7621, device='cuda:0')




compression_loss tensor(1.0413, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9197, device='cuda:0')




compression_loss tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7764, device='cuda:0')




compression_loss tensor(0.8189, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7202, device='cuda:0')




compression_loss tensor(0.7426, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7072, device='cuda:0')




compression_loss tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7070, device='cuda:0')




compression_loss tensor(0.7423, device='cuda:0', grad_fn=<NllLossBackward0>)


 39%|███▉      | 16/41 [02:08<03:20,  8.04s/it]






WRONG 7
Okay, so I have this magic square problem here, and I need to find the value of \( n \). Let me try to figure this out step by step. 

First, I remember that a magic square is a grid where the sums of numbers in each row, each column, and both main diagonals are equal. That common sum is called the magic constant. So, my goal is to find \( n \) such that all these sums are equal.

Looking at the Asymptote code, it seems like the magic square is a 3x3 grid. Let me visualize it based on the labels provided:

- The top row has three cells: \( n-3 \), \( 3 \), and \( n+1 \).
- The middle row has: \( n+2 \), \( 2n-9 \), and \( 1 \).
- The bottom row has: \( 2 \), \( n \), and \( n-1 \).

Now, I'll calculate the sums for each row:

1. **First Row:**
   - Sum = \( (n - 3) + 3 + (n + 1) \)
   - Simplify: \( n - 3 + 3 + n + 1 = 2n + 1 \)

2. **Second Row:**
   - Sum = \( (n + 2) + (2n - 9) + 1 \)
   - \( = 3n - 6 \)

3. **Third Row:**
   - Sum = \( 2 + n + (n - 1) \)
   - \( = 2n + 1 \)



original_loss tensor(0.8121, device='cuda:0')




compression_loss tensor(0.8090, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7928, device='cuda:0')




compression_loss tensor(0.7922, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7880, device='cuda:0')




compression_loss tensor(0.7847, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8396, device='cuda:0')




compression_loss tensor(0.9120, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8257, device='cuda:0')




compression_loss tensor(0.8240, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8081, device='cuda:0')




compression_loss tensor(0.8047, device='cuda:0', grad_fn=<NllLossBackward0>)


 15%|█▍        | 6/41 [00:25<02:26,  4.19s/it]






CORRECT
вопрос+сжатые+сгенерированные=366, всего_сгенерированно_токенов=835 оригинальная_генерация=1125




original_loss tensor(0.8157, device='cuda:0')




compression_loss tensor(0.8120, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8411, device='cuda:0')




compression_loss tensor(0.8384, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8371, device='cuda:0')




compression_loss tensor(0.8334, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8494, device='cuda:0')




compression_loss tensor(0.9680, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8392, device='cuda:0')




compression_loss tensor(0.8378, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9259, device='cuda:0')




compression_loss tensor(0.9237, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8127, device='cuda:0')




compression_loss tensor(0.8100, device='cuda:0', grad_fn=<NllLossBackward0>)


 17%|█▋        | 7/41 [00:26<02:07,  3.75s/it]






CORRECT
вопрос+сжатые+сгенерированные=455, всего_сгенерированно_токенов=1035 оригинальная_генерация=1548




original_loss tensor(0.8621, device='cuda:0')




compression_loss tensor(0.8610, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8977, device='cuda:0')




compression_loss tensor(0.8957, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7744, device='cuda:0')




compression_loss tensor(0.7720, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8085, device='cuda:0')




compression_loss tensor(0.8011, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8441, device='cuda:0')




compression_loss tensor(0.8380, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8294, device='cuda:0')




compression_loss tensor(0.8220, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8461, device='cuda:0')




compression_loss tensor(0.8424, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8185, device='cuda:0')




compression_loss tensor(0.8152, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8214, device='cuda:0')




compression_loss tensor(0.8192, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8659, device='cuda:0')




compression_loss tensor(0.8617, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8761, device='cuda:0')




compression_loss tensor(0.8648, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7963, device='cuda:0')




compression_loss tensor(0.7863, device='cuda:0', grad_fn=<NllLossBackward0>)


 29%|██▉       | 12/41 [00:43<01:45,  3.65s/it]






CORRECT
вопрос+сжатые+сгенерированные=662, всего_сгенерированно_токенов=1722 оригинальная_генерация=3960




original_loss tensor(0.9370, device='cuda:0')




compression_loss tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9021, device='cuda:0')




compression_loss tensor(0.8934, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9136, device='cuda:0')




compression_loss tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward0>)


  7%|▋         | 3/41 [00:12<02:42,  4.28s/it]






WRONG 12
Okay, so I have this problem here where a regular octagon has the same perimeter as a regular hexagon that's shown with a side length of 16 cm. I need to find out how long each side of the octagon is. Hmm, let me think about how to approach this.

First, I remember that both a regular hexagon and a regular octagon have all sides equal and all angles equal. So, their perimeters will be just the number of sides multiplied by the length of each side.

The problem says the regular octagon has the same perimeter as the regular hexagon shown, which has a side length of 16 cm. So, I need to find the side length of the octagon.

Let me write down what I know:

- Regular hexagon: 6 sides, each 16 cm.
- Regular octagon: 8 sides, each unknown length, let's call it 's'.

Since their perimeters are equal, I can set up an equation:

Perimeter of hexagon = Perimeter of octagon

Calculating the perimeter of the hexagon: 6 sides * 16 cm = 96 cm.

So, 96 cm = 8 sides * s.

To find 's', I'll div



original_loss tensor(0.8188, device='cuda:0')




compression_loss tensor(0.8172, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8177, device='cuda:0')




compression_loss tensor(0.8136, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9033, device='cuda:0')




compression_loss tensor(0.9006, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8467, device='cuda:0')




compression_loss tensor(0.9670, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8054, device='cuda:0')




compression_loss tensor(0.8038, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(1.0131, device='cuda:0')




compression_loss tensor(1.0108, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9015, device='cuda:0')




compression_loss tensor(0.9003, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8194, device='cuda:0')




compression_loss tensor(0.8147, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8039, device='cuda:0')




compression_loss tensor(0.8017, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7173, device='cuda:0')




compression_loss tensor(0.7168, device='cuda:0', grad_fn=<NllLossBackward0>)


 24%|██▍       | 10/41 [00:34<01:47,  3.47s/it]






CORRECT
вопрос+сжатые+сгенерированные=571, всего_сгенерированно_токенов=1442 оригинальная_генерация=1080




original_loss tensor(0.8423, device='cuda:0')




compression_loss tensor(0.8338, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8926, device='cuda:0')




compression_loss tensor(0.8906, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8459, device='cuda:0')




compression_loss tensor(0.9518, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7256, device='cuda:0')




compression_loss tensor(0.7248, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7343, device='cuda:0')




compression_loss tensor(0.7325, device='cuda:0', grad_fn=<NllLossBackward0>)


 12%|█▏        | 5/41 [00:22<02:44,  4.58s/it]






CORRECT
вопрос+сжатые+сгенерированные=436, всего_сгенерированно_токенов=803 оригинальная_генерация=1639




original_loss tensor(0.9346, device='cuda:0')




compression_loss tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7371, device='cuda:0')




compression_loss tensor(0.7354, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8521, device='cuda:0')




compression_loss tensor(0.8421, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8284, device='cuda:0')




compression_loss tensor(0.8237, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7658, device='cuda:0')




compression_loss tensor(0.7649, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7951, device='cuda:0')




compression_loss tensor(0.7943, device='cuda:0', grad_fn=<NllLossBackward0>)


 15%|█▍        | 6/41 [00:20<02:00,  3.45s/it]






CORRECT
вопрос+сжатые+сгенерированные=742, всего_сгенерированно_токенов=1224 оригинальная_генерация=1732




original_loss tensor(0.9463, device='cuda:0')




compression_loss tensor(0.9435, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8920, device='cuda:0')




compression_loss tensor(0.8873, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9196, device='cuda:0')




compression_loss tensor(0.9124, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8193, device='cuda:0')




compression_loss tensor(1.8780, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9878, device='cuda:0')




compression_loss tensor(0.9832, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9555, device='cuda:0')




compression_loss tensor(0.9489, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7867, device='cuda:0')




compression_loss tensor(0.7827, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9423, device='cuda:0')




compression_loss tensor(0.9409, device='cuda:0', grad_fn=<NllLossBackward0>)


 20%|█▉        | 8/41 [00:33<02:18,  4.21s/it]






CORRECT
вопрос+сжатые+сгенерированные=647, всего_сгенерированно_токенов=1296 оригинальная_генерация=1198




original_loss tensor(0.8693, device='cuda:0')




compression_loss tensor(0.8668, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(1.1428, device='cuda:0')




compression_loss tensor(1.1369, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9562, device='cuda:0')




compression_loss tensor(0.9515, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7353, device='cuda:0')




compression_loss tensor(0.7341, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9439, device='cuda:0')




compression_loss tensor(0.9414, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8374, device='cuda:0')




compression_loss tensor(0.8343, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8576, device='cuda:0')




compression_loss tensor(0.8543, device='cuda:0', grad_fn=<NllLossBackward0>)


 17%|█▋        | 7/41 [00:22<01:48,  3.20s/it]






CORRECT
вопрос+сжатые+сгенерированные=647, всего_сгенерированно_токенов=1229 оригинальная_генерация=2314




original_loss tensor(0.7873, device='cuda:0')




compression_loss tensor(0.7839, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8106, device='cuda:0')




compression_loss tensor(0.8015, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8398, device='cuda:0')




compression_loss tensor(0.8323, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8304, device='cuda:0')




compression_loss tensor(0.8292, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7839, device='cuda:0')




compression_loss tensor(0.7819, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8163, device='cuda:0')




compression_loss tensor(0.8139, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8129, device='cuda:0')




compression_loss tensor(0.8127, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8723, device='cuda:0')




compression_loss tensor(0.8718, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9567, device='cuda:0')




compression_loss tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8121, device='cuda:0')




compression_loss tensor(0.8113, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8880, device='cuda:0')




compression_loss tensor(0.9090, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7892, device='cuda:0')




compression_loss tensor(0.8143, device='cuda:0', grad_fn=<NllLossBackward0>)


 29%|██▉       | 12/41 [00:56<02:17,  4.73s/it]






CORRECT
вопрос+сжатые+сгенерированные=420, всего_сгенерированно_токенов=1461 оригинальная_генерация=1262




original_loss tensor(0.8776, device='cuda:0')




compression_loss tensor(0.8769, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7426, device='cuda:0')




compression_loss tensor(0.7417, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8488, device='cuda:0')




compression_loss tensor(0.8481, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7872, device='cuda:0')




compression_loss tensor(0.7836, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9293, device='cuda:0')




compression_loss tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7949, device='cuda:0')




compression_loss tensor(0.7929, device='cuda:0', grad_fn=<NllLossBackward0>)


 15%|█▍        | 6/41 [00:18<01:47,  3.07s/it]






WRONG 1260
Okay, so I need to figure out how many ways there are to arrange the letters of the word ELLIPSE. Hmm, let me think. I remember that when dealing with permutations of letters in a word, if all the letters are unique, the number of arrangements is just factorial of the number of letters. But wait, in this case, I notice that some letters are repeated. Let me check the word ELLIPSE.

Breaking it down: E, L, L, I, P, S, E. Hmm, so E appears twice, L appears twice, and the rest are unique. So, the formula for permutations of multiset comes into play here.

The general formula is:

\[
\frac{n!}{n_1! \times n_2! \times \dots \times n_k!}
\]

Where \( n \) is the total number of letters, and \( n_1, n_2, \dots, n_k \) are the counts of each repeating letter.

First, I need to figure out how many times each letter appears in "ELLIPSE." Let me write it out: E, L, L, I, P, S, E, E.

So, counting each letter:

- E appears 4 times.
- L appears 2 times.
- I appears 1 time.
- P appears 1 



original_loss tensor(0.8977, device='cuda:0')




compression_loss tensor(0.8867, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8568, device='cuda:0')




compression_loss tensor(0.8491, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8132, device='cuda:0')




compression_loss tensor(0.8131, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8340, device='cuda:0')




compression_loss tensor(0.8308, device='cuda:0', grad_fn=<NllLossBackward0>)


 10%|▉         | 4/41 [00:11<01:48,  2.92s/it]






CORRECT
вопрос+сжатые+сгенерированные=351, всего_сгенерированно_токенов=653 оригинальная_генерация=1482




original_loss tensor(0.8668, device='cuda:0')




compression_loss tensor(1.1339, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(1.0320, device='cuda:0')




compression_loss tensor(1.0253, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(1.1533, device='cuda:0')




compression_loss tensor(1.1393, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9394, device='cuda:0')




compression_loss tensor(0.9359, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(1.1989, device='cuda:0')




compression_loss tensor(1.1752, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8231, device='cuda:0')




compression_loss tensor(0.8130, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9072, device='cuda:0')




compression_loss tensor(0.8972, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8422, device='cuda:0')




compression_loss tensor(0.8401, device='cuda:0', grad_fn=<NllLossBackward0>)


 20%|█▉        | 8/41 [00:25<01:45,  3.21s/it]






CORRECT
вопрос+сжатые+сгенерированные=383, всего_сгенерированно_токенов=1068 оригинальная_генерация=814




original_loss tensor(0.8563, device='cuda:0')




compression_loss tensor(0.8451, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7690, device='cuda:0')




compression_loss tensor(0.7638, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8237, device='cuda:0')




compression_loss tensor(0.8178, device='cuda:0', grad_fn=<NllLossBackward0>)


  7%|▋         | 3/41 [00:10<02:11,  3.46s/it]






CORRECT
вопрос+сжатые+сгенерированные=1303, всего_сгенерированно_токенов=1503 оригинальная_генерация=2318




original_loss tensor(0.8680, device='cuda:0')




compression_loss tensor(0.8648, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8584, device='cuda:0')




compression_loss tensor(0.8558, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7227, device='cuda:0')




compression_loss tensor(0.7219, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7338, device='cuda:0')




compression_loss tensor(0.7261, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7036, device='cuda:0')




compression_loss tensor(0.7030, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7821, device='cuda:0')




compression_loss tensor(0.7810, device='cuda:0', grad_fn=<NllLossBackward0>)


 15%|█▍        | 6/41 [00:26<02:35,  4.43s/it]






CORRECT
вопрос+сжатые+сгенерированные=1033, всего_сгенерированно_токенов=1476 оригинальная_генерация=2469




original_loss tensor(0.8305, device='cuda:0')




compression_loss tensor(1.0887, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8674, device='cuda:0')




compression_loss tensor(1.1047, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7665, device='cuda:0')




compression_loss tensor(0.7664, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7449, device='cuda:0')




compression_loss tensor(0.7430, device='cuda:0', grad_fn=<NllLossBackward0>)


 10%|▉         | 4/41 [00:20<03:11,  5.17s/it]






CORRECT
вопрос+сжатые+сгенерированные=769, всего_сгенерированно_токенов=1026 оригинальная_генерация=1907




original_loss tensor(0.9633, device='cuda:0')




compression_loss tensor(0.9577, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8914, device='cuda:0')




compression_loss tensor(0.8797, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8811, device='cuda:0')




compression_loss tensor(0.8738, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7228, device='cuda:0')




compression_loss tensor(0.7219, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.6864, device='cuda:0')




compression_loss tensor(0.6866, device='cuda:0', grad_fn=<NllLossBackward0>)


 12%|█▏        | 5/41 [00:23<02:51,  4.75s/it]






WRONG -13x+3
Okay, so I have this polynomial f(x) and I need to find the remainder when it's divided by x² - 1. Hmm, I remember that when you divide a polynomial by a quadratic, the remainder should be a linear polynomial, right? So, it should look like ax + b, where a and b are constants that I need to find.

But the problem says I can't use long division, which would be really messy because the polynomial is degree 10. I need another method. Hmm, maybe I can use the Remainder Theorem or something related to polynomial division.

Wait, the Remainder Theorem says that the remainder of a polynomial f(x) divided by (x - c) is just f(c). But here, the divisor is a quadratic, x² - 1, which factors into (x - 1)(x + 1). So, maybe I can use the Remainder Theorem for each root of the divisor.

If I recall correctly, the Remainder Theorem states that the remainder of a polynomial f(x) divided by (x - a) is f(a). So, if I can find the remainders when f(x) is divided by (x - 1) and (x + 1), then 



original_loss tensor(0.9066, device='cuda:0')




compression_loss tensor(0.9027, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8702, device='cuda:0')




compression_loss tensor(0.9115, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8215, device='cuda:0')




compression_loss tensor(0.8200, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8231, device='cuda:0')




compression_loss tensor(0.8221, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9539, device='cuda:0')




compression_loss tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8496, device='cuda:0')




compression_loss tensor(0.8456, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9386, device='cuda:0')




compression_loss tensor(0.9385, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9340, device='cuda:0')




compression_loss tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9738, device='cuda:0')




compression_loss tensor(0.9663, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9546, device='cuda:0')




compression_loss tensor(0.9545, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7251, device='cuda:0')




compression_loss tensor(0.7164, device='cuda:0', grad_fn=<NllLossBackward0>)


 27%|██▋       | 11/41 [00:40<01:50,  3.68s/it]






CORRECT
вопрос+сжатые+сгенерированные=567, всего_сгенерированно_токенов=1537 оригинальная_генерация=2664




original_loss tensor(0.8891, device='cuda:0')




compression_loss tensor(0.8864, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9535, device='cuda:0')




compression_loss tensor(0.9487, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8809, device='cuda:0')




compression_loss tensor(0.8803, device='cuda:0', grad_fn=<NllLossBackward0>)


  7%|▋         | 3/41 [00:10<02:06,  3.34s/it]






CORRECT
вопрос+сжатые+сгенерированные=428, всего_сгенерированно_токенов=602 оригинальная_генерация=1143




original_loss tensor(0.9330, device='cuda:0')




compression_loss tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8332, device='cuda:0')




compression_loss tensor(0.8289, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8765, device='cuda:0')




compression_loss tensor(0.8719, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8904, device='cuda:0')




compression_loss tensor(0.8841, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8653, device='cuda:0')




compression_loss tensor(0.8578, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8713, device='cuda:0')




compression_loss tensor(0.8707, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8107, device='cuda:0')




compression_loss tensor(0.8025, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.6991, device='cuda:0')




compression_loss tensor(0.7239, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(1.0452, device='cuda:0')




compression_loss tensor(1.0401, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8179, device='cuda:0')




compression_loss tensor(0.8100, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7184, device='cuda:0')




compression_loss tensor(0.7161, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7551, device='cuda:0')




compression_loss tensor(0.8852, device='cuda:0', grad_fn=<NllLossBackward0>)


 29%|██▉       | 12/41 [00:45<01:49,  3.78s/it]






WRONG 17
Okay, so I have this problem here: I need to find the unique odd integer t such that 0 < t < 23, and t + 2 is the inverse of t modulo 23. Hmm, let me try to understand what this means.

First, let me recall what an inverse modulo n is. If I have an integer t, its inverse modulo 23 is another integer, let's call it s, such that t * s ≡ 1 mod 23. So, in this problem, we're told that t + 2 is the inverse of t modulo 23. That means t * (t + 2) ≡ 1 mod 23.

So, the equation we need to solve is t(t + 2) ≡ 1 mod 23. Let's write that out:

t(t + 2) ≡ 1 mod 23

Expanding the left side:

t² + 2t ≡ 1 mod 23

Now, let's rearrange the equation to bring all terms to one side:

t² + 2t - 1 ≡ 0 mod 23

This is a quadratic congruence. To solve for t, we can use the quadratic formula, but since we're working modulo a prime number (23 is prime), we can find the discriminant and check if it's a quadratic residue modulo 23.

The discriminant D is given by:
D = (2)^2 - 4*1*2 = 4 - 8 = -4

So, we ne



original_loss tensor(0.7613, device='cuda:0')




compression_loss tensor(0.7583, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7735, device='cuda:0')




compression_loss tensor(0.7688, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7323, device='cuda:0')




compression_loss tensor(0.7314, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7650, device='cuda:0')




compression_loss tensor(0.7641, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7680, device='cuda:0')




compression_loss tensor(0.7653, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7544, device='cuda:0')




compression_loss tensor(0.7537, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7146, device='cuda:0')




compression_loss tensor(0.7264, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7364, device='cuda:0')




compression_loss tensor(0.7341, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7513, device='cuda:0')




compression_loss tensor(0.7478, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8214, device='cuda:0')




compression_loss tensor(0.8093, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9916, device='cuda:0')




compression_loss tensor(0.9725, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7990, device='cuda:0')




compression_loss tensor(0.7894, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7147, device='cuda:0')




compression_loss tensor(0.7141, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7597, device='cuda:0')




compression_loss tensor(0.7577, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7419, device='cuda:0')




compression_loss tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7672, device='cuda:0')




compression_loss tensor(0.7579, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.6973, device='cuda:0')




compression_loss tensor(0.6970, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7388, device='cuda:0')




compression_loss tensor(0.7383, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8716, device='cuda:0')




compression_loss tensor(0.8657, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7403, device='cuda:0')




compression_loss tensor(0.7370, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7829, device='cuda:0')




compression_loss tensor(1.0377, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7664, device='cuda:0')




compression_loss tensor(0.7637, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7445, device='cuda:0')




compression_loss tensor(0.7433, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7362, device='cuda:0')




compression_loss tensor(0.8536, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8047, device='cuda:0')




compression_loss tensor(0.7962, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7188, device='cuda:0')




compression_loss tensor(0.7179, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8658, device='cuda:0')




compression_loss tensor(0.8625, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7244, device='cuda:0')




compression_loss tensor(0.7232, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 28/41 [02:12<01:01,  4.74s/it]






WRONG 12
Okay, so I have this problem here: I need to find the smallest distance between the origin and a point on the graph of \( y = \frac{1}{\sqrt{2}}(x^2 - 3) \). The answer should be expressed as \( \sqrt{a}/b \), where \( a \) and \( b \) are positive integers, and \( a \) isn't divisible by the square of any integer greater than one. Then, I have to find \( a + b \). Okay, let's try to figure this out step by step.

First, I need to find the smallest distance from the origin to a point on the graph of the given function. The function is \( y = \frac{1}{\sqrt{2}}(x^2 - 3) \). So, any point on this graph can be represented as \( (x, y) \), where \( y \) is given in terms of \( x \). 

The problem is asking for the smallest distance between the origin (which is the point \( (0, 0) \)) and any point on this graph. To find the smallest distance, I remember that the distance \( D \) between two points \( (x_1, y_1) \) and \( (x_2, y_2) \) is given by the formula:

\[
D = \sqrt{(x_2 - 



original_loss tensor(0.8439, device='cuda:0')




compression_loss tensor(0.8392, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8366, device='cuda:0')




compression_loss tensor(0.8340, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7683, device='cuda:0')




compression_loss tensor(0.7664, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8261, device='cuda:0')




compression_loss tensor(0.8228, device='cuda:0', grad_fn=<NllLossBackward0>)


 10%|▉         | 4/41 [00:13<02:04,  3.36s/it]






CORRECT
вопрос+сжатые+сгенерированные=324, всего_сгенерированно_токенов=602 оригинальная_генерация=1554




original_loss tensor(0.8863, device='cuda:0')




compression_loss tensor(0.8782, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8202, device='cuda:0')




compression_loss tensor(0.8190, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7342, device='cuda:0')




compression_loss tensor(0.7259, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8520, device='cuda:0')




compression_loss tensor(0.8662, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7585, device='cuda:0')




compression_loss tensor(0.7544, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8113, device='cuda:0')




compression_loss tensor(0.8090, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7483, device='cuda:0')




compression_loss tensor(0.7370, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8367, device='cuda:0')




compression_loss tensor(0.8359, device='cuda:0', grad_fn=<NllLossBackward0>)


 20%|█▉        | 8/41 [00:30<02:05,  3.79s/it]






WRONG 9
Okay, so I have this problem here: 

If \(\log_6 (x - y) + \log_6 (x + y) = 2\) and \(\log_y 5x = 2\), then find \(x\).

Hmm, let me try to figure this out step by step. I remember that logarithms can be tricky, but maybe I can use some logarithm properties to simplify these equations.

First, looking at the first equation: \(\log_6 (x - y) + \log_6 (x + y) = 2\). I recall that the sum of two logarithms with the same base can be written as the logarithm of the product of their arguments. So, applying that property here:

\[
\log_6 [(x - y)(x + y)] = 2
\]

Simplifying the expression inside the logarithm, I see that \((x - y)(x + y)\) is a difference of squares, which equals \(x^2 - y^2\). So now the equation becomes:

\[
\log_6 (x^2 - y^2) = 2
\]

To eliminate the logarithm, I can rewrite this in exponential form. Remember that \(\log_b a = c\) is equivalent to \(b^c = a\). Applying that here:

\[
6^2 = x^2 - y^2
\]

Simplifying:

\[
36 = x^2 - y^2
\]

Alright, so that's my firs



original_loss tensor(0.7706, device='cuda:0')




compression_loss tensor(0.7699, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8147, device='cuda:0')




compression_loss tensor(0.8101, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7940, device='cuda:0')




compression_loss tensor(0.7882, device='cuda:0', grad_fn=<NllLossBackward0>)


  7%|▋         | 3/41 [00:11<02:28,  3.90s/it]






CORRECT
вопрос+сжатые+сгенерированные=668, всего_сгенерированно_токенов=838 оригинальная_генерация=1478




original_loss tensor(0.8005, device='cuda:0')




compression_loss tensor(0.7980, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8129, device='cuda:0')




compression_loss tensor(0.8121, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8034, device='cuda:0')




compression_loss tensor(0.7989, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7630, device='cuda:0')




compression_loss tensor(0.7927, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7491, device='cuda:0')




compression_loss tensor(0.7440, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9315, device='cuda:0')




compression_loss tensor(0.9310, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8797, device='cuda:0')




compression_loss tensor(0.8765, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8289, device='cuda:0')




compression_loss tensor(0.8783, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7691, device='cuda:0')




compression_loss tensor(0.7690, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7170, device='cuda:0')




compression_loss tensor(0.7162, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8329, device='cuda:0')




compression_loss tensor(0.8314, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9726, device='cuda:0')




compression_loss tensor(0.9642, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9270, device='cuda:0')




compression_loss tensor(0.9120, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8900, device='cuda:0')




compression_loss tensor(0.9382, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8788, device='cuda:0')




compression_loss tensor(0.8776, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8078, device='cuda:0')




compression_loss tensor(0.8074, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7915, device='cuda:0')




compression_loss tensor(0.7889, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8624, device='cuda:0')




compression_loss tensor(0.8541, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7681, device='cuda:0')




compression_loss tensor(0.7651, device='cuda:0', grad_fn=<NllLossBackward0>)


 46%|████▋     | 19/41 [01:14<01:25,  3.90s/it]






WRONG 32
Okay, so I have this problem here: In a certain isosceles right triangle, the altitude to the hypotenuse has length \(4\sqrt{2}\). I need to find the area of the triangle. Hmm, let me think about how to approach this.

First, let me recall what an isosceles right triangle is. It's a triangle with two legs of equal length and a hypotenuse. The two non-right angles are each 45 degrees. So, it's a 45-45-90 triangle.

In such a triangle, the legs are equal, and the hypotenuse is leg * sqrt(2). That's a key property. So, if I let the length of each leg be 'a', then the hypotenuse will be a * sqrt(2).

Now, the problem mentions the altitude to the hypotenuse. In a right triangle, the altitude to the hypotenuse has a special relationship with the legs. I remember that the length of the altitude (let's call it 'h') can be found using the formula:

h = (a * b) / c

where 'a' and 'b' are the legs, and 'c' is the hypotenuse. But since it's an isosceles right triangle, both legs are equal



original_loss tensor(0.8273, device='cuda:0')




compression_loss tensor(0.8247, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8844, device='cuda:0')




compression_loss tensor(0.8759, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8165, device='cuda:0')




compression_loss tensor(0.8158, device='cuda:0', grad_fn=<NllLossBackward0>)


  7%|▋         | 3/41 [00:09<02:02,  3.24s/it]






CORRECT
вопрос+сжатые+сгенерированные=342, всего_сгенерированно_токенов=547 оригинальная_генерация=1269




original_loss tensor(0.9076, device='cuda:0')




compression_loss tensor(0.9013, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7842, device='cuda:0')




compression_loss tensor(0.7754, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7360, device='cuda:0')




compression_loss tensor(0.7347, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7473, device='cuda:0')




compression_loss tensor(0.8339, device='cuda:0', grad_fn=<NllLossBackward0>)


 10%|▉         | 4/41 [00:13<02:04,  3.36s/it]






CORRECT
вопрос+сжатые+сгенерированные=626, всего_сгенерированно_токенов=929 оригинальная_генерация=2693




original_loss tensor(0.9211, device='cuda:0')




compression_loss tensor(0.9181, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8010, device='cuda:0')




compression_loss tensor(0.7997, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7733, device='cuda:0')




compression_loss tensor(0.7687, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8153, device='cuda:0')




compression_loss tensor(0.8126, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.7876, device='cuda:0')




compression_loss tensor(0.7872, device='cuda:0', grad_fn=<NllLossBackward0>)


 12%|█▏        | 5/41 [00:17<02:05,  3.49s/it]






CORRECT
вопрос+сжатые+сгенерированные=475, всего_сгенерированно_токенов=844 оригинальная_генерация=1346




original_loss tensor(0.8948, device='cuda:0')




compression_loss tensor(0.8942, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.8541, device='cuda:0')




compression_loss tensor(0.8485, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(0.9512, device='cuda:0')




compression_loss tensor(0.9501, device='cuda:0', grad_fn=<NllLossBackward0>)
original_loss tensor(1.0513, device='cuda:0')




compression_loss tensor(1.0352, device='cuda:0', grad_fn=<NllLossBackward0>)


 10%|▉         | 4/41 [00:14<02:17,  3.72s/it]






CORRECT
вопрос+сжатые+сгенерированные=333, всего_сгенерированно_токенов=640 оригинальная_генерация=2159


In [6]:
len(correct_dataset) / len(dataset), correct_items / len(dataset), correct_items / len(
    correct_dataset
)

(0.13392857142857142, 0.09821428571428571, 0.7333333333333333)

In [7]:
original_total_len = 0
compressed_total_len = 0
for item in evaluation_dataset:
    original_total_len += item["original_total_len"]
    compressed_total_len += item["compressed_total_len"]
original_total_len, compressed_total_len, compressed_total_len / original_total_len

(56056, 19406, 0.3461895247609533)

In [None]:
# (56056, 21924, 0.39110889110889113) - 0.8333333333333334, 200 токенов, 4 сжимающих
# (56056, 29497, 0.5262059369202227) - 0.9666666666666667, 400 токенов, 16 сжимающих
# (56056, 27994, 0.499393463679178) - 0.9, 400 токенов, 8 сжимающих
# (56056, 23062, 0.4114100185528757) - 0.9, 200 токенов, 16 сжимающих
# (56056, 30087, 0.5367311260168403) - 0.8666666666666667, 400, 32
# (56056, 19406, 0.3461895247609533) - 0.7333333333333333, 100, 2