In [2]:
# https://pytorch.org/torchtune/stable/generated/torchtune.models.qwen2_5.qwen2_5_1_5b_instruct.html#torchtune.models.qwen2_5.qwen2_5_1_5b_instruct

In [None]:
# https://pytorch.org/torchtune/0.6/deep_dives/checkpointer.html

In [4]:
!tune download deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --output-dir models/DeepSeek-R1-Distill-Qwen-1.5B

Ignoring files matching the following patterns: None
.gitattributes: 100%|██████████████████████| 1.52k/1.52k [00:00<00:00, 22.0MB/s]
LICENSE: 100%|█████████████████████████████| 1.06k/1.06k [00:00<00:00, 5.47MB/s]
README.md: 100%|███████████████████████████| 16.0k/16.0k [00:00<00:00, 4.43MB/s]
benchmark.jpg: 100%|█████████████████████████| 777k/777k [00:00<00:00, 3.90MB/s]
Successfully downloaded model repo and wrote to the following locations:
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/generation_config.json
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/README.md
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/config.json
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/model.safetensors
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/tokenizer_config.json
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/.gitattributes
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/.cache
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/original_repo_id.json
/code/models/DeepSeek-R1-Distill-Qwen-1.5B/LICENSE
/code/models/DeepSeek-R1-D

In [1]:
import torch
from torchtune.training import FullModelHFCheckpointer, ModelType

checkpoint_dir = "models/DeepSeek-R1-Distill-Qwen-1.5B"
output_dir = "models/DeepSeek-R1-Distill-Qwen-1.5B-torchtune"
pytorch_files = [
    "model.safetensors",
]

# Set up the checkpointer and load state dict
checkpointer = FullModelHFCheckpointer(
    checkpoint_dir=checkpoint_dir,
    checkpoint_files=pytorch_files,
    output_dir=output_dir,
    model_type=ModelType.QWEN2,
)
torchtune_sd = checkpointer.load_checkpoint()

CONVERT QWEN2


In [3]:
from torchtune.models.qwen2 import qwen2_1_5b, qwen2

# model = qwen2_1_5b()
model = qwen2(
    vocab_size=151936,
    num_layers=28,
    num_heads=12,
    num_kv_heads=2,
    embed_dim=1536,
    intermediate_dim=8960,
    max_seq_len=32768,
    attn_dropout=0.0,
    norm_eps=1e-06,
    rope_base=1000000.0,
    # tie_word_embeddings=True,
    tie_word_embeddings=False,
)
model.load_state_dict(
    torchtune_sd["model"],
    # strict=False,
)
model = model.eval()
model = model.cuda()

In [4]:
model

TransformerDecoder(
  (tok_embeddings): Embedding(151936, 1536)
  (layers): ModuleList(
    (0-27): 28 x TransformerSelfAttentionLayer(
      (attn): MultiHeadAttention(
        (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
        (k_proj): Linear(in_features=1536, out_features=256, bias=True)
        (v_proj): Linear(in_features=1536, out_features=256, bias=True)
        (output_proj): Linear(in_features=1536, out_features=1536, bias=False)
        (pos_embeddings): Qwen2RotaryPositionalEmbeddings()
      )
      (mlp): FeedForward(
        (w1): Linear(in_features=1536, out_features=8960, bias=False)
        (w2): Linear(in_features=8960, out_features=1536, bias=False)
        (w3): Linear(in_features=1536, out_features=8960, bias=False)
        (activation): SiLU()
      )
      (sa_norm): RMSNorm()
      (mlp_norm): RMSNorm()
      (sa_scale): Identity()
      (mlp_scale): Identity()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=1536, out_features

In [17]:
from torchtune.generation import generate, generate_next_token as sample_next
from torchtune import generation

# output, logits = generate(
#     model, torch.tensor(prompt), max_generated_tokens=100, pad_id=0
# )


def generate_next_token(
    model,
    input_pos,
    x,
    q=None,
    *,
    mask=None,
    temperature: float = 1.0,
    top_k=None,
):
    """
    Generates the next tokens given a prompt, and also returns the corresponding logits.

    Args:
        model (TransformerDecoder): model used for generation
        input_pos (torch.Tensor): tensor with the positional encodings associated with the given prompt,
            with shape [bsz x seq_length].
        x (torch.Tensor): tensor with the token IDs associated with the given prompt,
            with shape [bsz x seq_length].
        q (Optional[torch.Tensor]): randomly sampled tensor for softmax sampling trick.
            See https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/generate.py#L40
        mask (Optional[torch.Tensor]): attention mask with shape [bsz x seq_length x seq_length],
            default None.
        temperature (float): value to scale the predicted logits by, default 1.0.
        top_k (Optional[int]): Top-k value to use for sampling, default None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: tuple of two tensors:
            - tokens (torch.Tensor): tensor with the generated tokens,
                with shape [bsz x 1].
            - logits (torch.Tensor): tensor with the logits associated with the generated tokens,
                with shape [bsz x 1 x vocab_size].

    """
    # model produces logits in [bsz, seq_length, vocab_size]
    # we want to take the last token's logits as the input to the next model call
    if temperature != 0:
        return sample_next(
            model,
            input_pos,
            x,
            q,
            mask,
            temperature,
            top_k,
        )
    else:

        logits = model(x, input_pos=input_pos, mask=mask)[:, -1]
        # logits = torch.nn.functional.softmax(logits, dim=-1)
        return (
            torch.argmax(logits, dim=-1, keepdim=True).to(dtype=torch.int),
            logits.unsqueeze(1),
        )


generation.generate_next_token = generate_next_token

In [6]:
from datasets import load_dataset

dataset = load_dataset(
    # "dim/hendrycks_math_train_12k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096"
    "dim/hendrycks_math_test_500_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096_greedy"
)

dataset = dataset["train"].train_test_split(
    # test_size=250,
    # test_size=1,
    seed=42,
)
dataset = dataset["test"].filter(lambda x: x["model_answer"].count("</think>") == 1)

from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)

correct_dataset = []

for pos, item in enumerate(dataset):
    try:
        answer = dataset_answer_filter(item["answer"])
        model_answer = model_answer_filter(item["model_answer"])
        # print(answer, model_answer)
        # break
        if is_equiv(answer, model_answer):
            correct_dataset.append(item)
    except:
        pass

len(dataset), len(correct_dataset), len(correct_dataset) / len(dataset)

'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'


(77, 73, 0.948051948051948)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import random
import numpy as np

torch.manual_seed(0)
random.seed(0)
np.random.seed(0)
torch.set_grad_enabled(False)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
model = model.to(torch.float32)

In [20]:
from tqdm.notebook import tqdm
from more_itertools import chunked

batch_size = 1
base_prompt = open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
).read()
batches = []
for batch in chunked(correct_dataset, batch_size):
    batch = [item["problem"] for item in batch]
    batch = [
        tokenizer.apply_chat_template(
            [
                {"role": "user", "content": base_prompt.format(question=item)},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
        for item in batch
    ]
    batches.append(batch)

    generation_results = []

    device = "cuda"
with torch.no_grad():
    for batch in tqdm(batches):
        model_inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding="longest",
            truncation=False,
            add_special_tokens=False,
        ).to(device)
        # generated_ids = model.generate(
        #     **model_inputs,
        #     max_new_tokens=4096,
        #     do_sample=False,
        #     temperature=None,
        #     top_p=None,
        # )
        generated_ids, logits = generate(
            model,
            model_inputs["input_ids"],
            max_generated_tokens=4096,
            # max_generated_tokens=120,
            pad_id=tokenizer.eos_token_id,
            temperature=0.0,
        )
        generated_ids = [
            output_ids[len(input_ids) :]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        generation_results.extend(responses)
        break

  0%|          | 0/73 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
generation_results[0]

"Okay, so I have this magic square problem here, and I need to find the value of n. Let me try to figure it out step by step. First, I should probably visualize the square based on the Asymptote code provided. It's a 3x3 grid, right? So, it's a 3x3 grid with some numbers filled in and some variables. Let me try to sketch it out in my mind. The Asymptote code draws a 3x3 grid with some numbers and expressions. Let me see: the center is labeled n-"

In [None]:
# hf sdpa batch 2 float32
# "Okay, so I have this problem here where Rick is thinking of a positive factor of 14, and Steve is thinking of a positive factor of 42. They both are thinking of the same number, and I need to figure out how many possible numbers they could be thinking of. Hmm, let me break this down step by step.\n\nFirst, let me recall what a factor is. A factor of a number is another number that divides into it without leaving a remainder. So, for example, the factors of 14 are numbers that can multiply together to give 14. Similarly, factors of 42 are numbers that can multiply together to give 42.\n\nAlright, so let me list out all the positive factors of 14 and 42. That should help me see which numbers are common to both lists.\n\nStarting with 14. The factors of 14 are the numbers that divide 14 exactly. Let's see:\n\n1 × 14 = 14, so 1 and 14 are factors.\n\n2 × 7 = 14, so 2 and 7 are factors.\n\nIs there any more? 3 doesn't divide 14 evenly because 14 divided by 3 is about 4.666, which isn't a whole number. Similarly, 4 doesn't divide 14 evenly because 14 divided by 4 is 3.5, which isn't a whole number. 5 doesn't divide 14 either, and 6 is already covered by 2 and 3. So, the factors of 14 are 1, 2, 7, and 14.\n\nNow, moving on to 42. Let's list all the factors of 42.\n\n1 × 42 = 42, so 1 and 42 are factors.\n\n2 × 21 = 42, so 2 and 21 are factors.\n\n3 × 14 = 42, so 3 and 14 are factors.\n\n6 × 7 = 42, so 6 and 7 are factors.\n\nWait, let me make sure I haven't missed any. After 1, 2, 3, 6, 7, 14, 21, 42. Yeah, that seems right. So the factors of 42 are 1, 2, 3, 6, 7, 14, 21, and 42.\n\nNow, the problem says that Rick is thinking of a factor of 14, and Steve is thinking of a factor of 42, and they are thinking of the same number. So, we need to find the numbers that are common to both lists of factors.\n\nLooking at the factors of 14: 1, 2, 7, 14.\n\nLooking at the factors of 42: 1, 2, 3, 6, 7, 14, 21, 42.\n\nSo, the common numbers between these two lists are 1, 2, 7, and 14. Let me count them: 1, 2, 7, 14. That's four numbers.\n\nTherefore, there are four possible numbers that Rick and Steve could be thinking of.\n\nWait, let me double-check to make sure I didn't miss any. For 14, the factors are definitely 1, 2, 7, 14. For 42, the factors are 1, 2, 3, 6, 7, 14, 21, 42. So, the intersection is indeed 1, 2, 7, 14. So, four numbers.\n\nIs there a possibility that I missed any factors? Let me think. For 14, 1, 2, 7, 14. For 42, 1, 2, 3, 6, 7, 14, 21, 42. So, yeah, the overlapping numbers are 1, 2, 7, 14. So, four numbers.\n\nTherefore, the number of possible numbers they could be thinking of is 4.\n\n**Final Answer**\nThe number of possible numbers they could be thinking of is \\boxed{4}.\n</think>\n\nRick is thinking of a positive factor of 14, and Steve is thinking of a positive factor of 42. We need to determine how many possible numbers they could be thinking of if they are the same.\n\nFirst, we list the factors of 14:\n- The factors of 14 are 1, 2, 7, and 14.\n\nNext, we list the factors of 42:\n- The factors of 42 are 1, 2, 3, 6, 7, 14, 21, and 42.\n\nWe then find the common factors of both lists:\n- The common factors of 14 and 42 are 1, 2, 7, and 14.\n\nThus, there are four possible numbers that Rick and Steve could be thinking of.\n\nThe number of possible numbers they could be thinking of is \\boxed{4}."


# torchtune
# "Okay, so I have this magic square problem here, and I need to find the value of n. Let me try to figure it out step by step. First, I should probably visualize the square based on the Asymptote code provided. It's a 3x3 grid, right? So, it's a 3x3 grid with some numbers filled in and some variables. Let me try to sketch it out in my mind. The Asymptote code draws a 3x3 grid with some numbers and expressions. Let me see: the center is labeled n-"

# "Okay, so I have this magic square problem here, and I need to find the value of n. Let me try to figure it out step by step. First, I should probably visualize the square based on the Asymptote code provided. It's a 3x3 grid, right? So, it's a 3x3 grid with some numbers filled in and some variables. Let me try to sketch it out in my mind. The Asymptote code draws a 3x3 grid with some numbers and expressions. Let me see: the center is labeled n-"

# "Okay, so I have this magic square problem here, and I need to find the value of n. Let me try to figure it out step by step. First, I should probably visualize the square based on the Asymptote code provided. It's a 3x3 grid, right? So, it's a 3x3 grid with some numbers filled in and some variables. Let me try to sketch it out in my mind. The Asymptote code draws a 3x3 grid with some numbers and expressions. Let me see: the center is labeled n-"