In [1]:
import torch

In [14]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import matplotlib.pyplot as plt


In [15]:
print("Loading veomni_first_sample.pt...")
data = torch.load("veomni_first_sample.pt")

print(f"Data keys: {data.keys()}")
print(f"Tokens length: {len(data['tokens'])}")
print(f"Log probs length: {len(data['log_probs'])}")

tokens = data["tokens"]
reference_log_probs = data["log_probs"]

# Convert tensors to lists if needed
# if isinstance(tokens, torch.Tensor):
#     tokens = tokens.tolist()
# if isinstance(reference_log_probs, torch.Tensor):
#     reference_log_probs = reference_log_probs.tolist()

# print(f"Tokens type: {type(tokens[0])}")
# print(f"First few tokens: {tokens[:10]}")

# Load Qwen model and tokenizer
print("Loading Qwen/Qwen3-0.6B model and tokenizer...")
model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="flash_attention_2", dtype="bfloat16", device_map="auto")

# Convert tokens to text
# text = tokenizer.decode(tokens)
# print(f"Decoded text length: {len(text)} characters")

# Re-tokenize to get proper tokenization
# tokenized = tokenizer(text, return_tensors="pt")
input_ids = torch.tensor(tokens).reshape(1, -1).to(model.device, dtype=torch.int64)
print(f"Input ids shape: {input_ids.shape}")
# Get logits from model
with torch.no_grad():
    outputs = model(input_ids)
    logits = outputs.logits

Loading veomni_first_sample.pt...
Data keys: dict_keys(['tokens', 'log_probs'])
Tokens length: 8391
Log probs length: 8390
Loading Qwen/Qwen3-0.6B model and tokenizer...
Input ids shape: torch.Size([1, 8391])


  input_ids = torch.tensor(tokens).reshape(1, -1).to(model.device, dtype=torch.int64)


In [16]:
log_probs = torch.log_softmax(logits, dim=-1)
print(f"Log probs shape: {log_probs.shape}")

# Extract log probabilities for the actual tokens (excluding the last token)
# We need log_probs[i] for token[i+1]
model_log_probs = []
for i in range(len(tokens) - 1):
    token_id = tokens[i + 1]
    log_prob = log_probs[0, i, token_id].item()
    model_log_probs.append(log_prob)

Log probs shape: torch.Size([1, 8391, 151936])


In [34]:
model_log_probs = torch.tensor(model_log_probs, device=torch.device("cuda")).bfloat16()
model_log_probs

  model_log_probs = torch.tensor(model_log_probs, device=torch.device("cuda")).bfloat16()


tensor([-1.0750e+01, -4.4922e-02, -7.7188e+00,  ..., -2.1935e-05,
        -1.1921e-06, -8.9111e-03], device='cuda:0', dtype=torch.bfloat16)

In [18]:
# l2 norm between model_log_probs and reference_log_probs

torch.norm(model_log_probs - reference_log_probs) / len(model_log_probs)


tensor(0.0004, device='cuda:0', dtype=torch.bfloat16)

In [20]:
samples = torch.load("./test/debug_rollout_data")


In [22]:
megatron_data = torch.load(open("./megatron_rollout_data_0.pt", "rb"))
veomni_data = torch.load(open("./veomni_padded_batches_0.pt", "rb"))
fsdp_data = torch.load(open("./fsdp_padded_batches_0.pt", "rb"))

# megatron_data["abs_advantages"] = [k.abs() for k in megatron_data["advantages"]]
# megatron_data["abs_advantages"] = [k.abs() for k in megatron_data["advantages"]]
# print(megatron_data, veomni_data)


In [None]:
# len(veomni_data)
print(veomni_data[0])
print(len(veomni_data[0]["tokens"][0]))
print(len(veomni_data[0]["log_probs"][0]))
print(len(fsdp_data[0]["log_probs"][0]))
print(veomni_data[0]["loss_masks"].sum())


{'tokens': tensor([[151644,    872,    198,  ...,  13038,     25,  48297]],
       device='cuda:0'), 'loss_masks': tensor([[0, 0, 0,  ..., 1, 1, 1]], device='cuda:0', dtype=torch.int32), 'rewards': tensor([0.], device='cuda:0'), 'raw_reward': [0], 'log_probs': tensor([[-1.0750e+01, -4.4922e-02, -7.7188e+00,  ..., -2.0266e-05,
         -1.0729e-06, -8.9111e-03]], device='cuda:0', dtype=torch.bfloat16), 'advantages': tensor([[0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'), 'returns': tensor([[0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'), 'abs_advantages': tensor([[0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')}
8391
8390
8390
tensor(8192, device='cuda:0')


In [33]:
megatron_first_sample_tokens = megatron_data["tokens"][0]
veomni_first_sample_tokens = veomni_data[0]["tokens"][0]
first_samples_tokens = samples['samples'][0]['tokens']
second_samples_tokens = samples['samples'][1]['tokens']

# Convert all to int32 cuda tensors
megatron_first_sample_tokens = torch.tensor(megatron_first_sample_tokens, device=torch.device("cuda")).int()
veomni_first_sample_tokens = torch.tensor(veomni_first_sample_tokens, device=torch.device("cuda")).int()
first_samples_tokens = torch.tensor(first_samples_tokens, device=torch.device("cuda")).int()
second_samples_tokens = torch.tensor(second_samples_tokens, device=torch.device("cuda")).int()

# Assert shpaes are equal for first samples
assert megatron_first_sample_tokens.shape == veomni_first_sample_tokens.shape == first_samples_tokens.shape, f"{megatron_first_sample_tokens.shape} != {veomni_first_sample_tokens.shape} != {first_samples_tokens.shape}"

print(f"shape: {megatron_first_sample_tokens.shape}")
assert torch.allclose(megatron_first_sample_tokens, veomni_first_sample_tokens)
assert torch.allclose(first_samples_tokens, megatron_first_sample_tokens)
assert not torch.allclose(torch.tensor(second_samples_tokens, device=torch.device("cuda")), torch.tensor(megatron_first_sample_tokens, device=torch.device("cuda")))
# print(veomni_first_sample_tokens[0:20])

print(megatron_first_sample_tokens[-20:])
data = {
  'tokens': veomni_first_sample_tokens,
  'log_probs': veomni_data[0]["log_probs"][0],
  'response_lengths': len(megatron_data["log_probs"][0])
}
print(data)

torch.save(data, "veomni_first_sample.pt")
# megatron_first_sample_tokens[0:20]

shape: torch.Size([8391])
tensor([ 4279,     8,   323,   220,    19,    23,   320,  1291,  4279,   568,
         2806,  6144,    13,  2055,  8318,   382, 15666, 13038,    25, 48297],
       device='cuda:0', dtype=torch.int32)
{'tokens': tensor([151644,    872,    198,  ...,  13038,     25,  48297], device='cuda:0',
       dtype=torch.int32), 'log_probs': tensor([-1.0750e+01, -4.4922e-02, -7.7188e+00,  ..., -2.0266e-05,
        -1.0729e-06, -8.9111e-03], device='cuda:0', dtype=torch.bfloat16), 'response_lengths': 8192}


  megatron_first_sample_tokens = torch.tensor(megatron_first_sample_tokens, device=torch.device("cuda")).int()
  veomni_first_sample_tokens = torch.tensor(veomni_first_sample_tokens, device=torch.device("cuda")).int()
  assert not torch.allclose(torch.tensor(second_samples_tokens, device=torch.device("cuda")), torch.tensor(megatron_first_sample_tokens, device=torch.device("cuda")))


In [28]:
megatron_lprobs = megatron_data["log_probs"][0]
veomni_lprobs = veomni_data[0]["log_probs"][0]
fsdp_lprobs = fsdp_data[0]["log_probs"][0]
sample_lprobs = samples['samples'][0]['rollout_log_probs']

# megatron_lprobs
# Convert all to bf16 cuda tensors  
megatron_lprobs = torch.tensor(megatron_lprobs, device=torch.device("cuda")).bfloat16()
veomni_lprobs = torch.tensor(veomni_lprobs, device=torch.device("cuda")).bfloat16()
sample_lprobs = torch.tensor(sample_lprobs, device=torch.device("cuda")).bfloat16()
fsdp_lprobs = torch.tensor(fsdp_lprobs, device=torch.device("cuda")).bfloat16()
# Print lengths
print(len(megatron_lprobs))
print(len(veomni_lprobs))
print(len(sample_lprobs))
veomni_lprobs = veomni_lprobs[-len(sample_lprobs):]
fsdp_lprobs = fsdp_lprobs[-len(sample_lprobs):]
print(len(veomni_lprobs))
print(veomni_lprobs)
# assert torch.allclose(megatron_lprobs, veomni_lprobs)
# assert torch.allclose(sample_lprobs, megatron_lprobs)



8192
8390
8192
8192
tensor([-1.2054e-03,  0.0000e+00, -7.6599e-03,  ..., -2.0266e-05,
        -1.0729e-06, -8.9111e-03], device='cuda:0', dtype=torch.bfloat16)


  megatron_lprobs = torch.tensor(megatron_lprobs, device=torch.device("cuda")).bfloat16()
  veomni_lprobs = torch.tensor(veomni_lprobs, device=torch.device("cuda")).bfloat16()
  fsdp_lprobs = torch.tensor(fsdp_lprobs, device=torch.device("cuda")).bfloat16()


In [None]:
print(megatron_lprobs[:5])
print(veomni_lprobs[:5])
print(fsdp_lprobs[:5])
print(sample_lprobs[:5])

tensor([-3.0994e-05, -5.4598e-05, -1.1921e-06, -0.0000e+00, -3.0975e-03],
       device='cuda:0', dtype=torch.bfloat16)
tensor([-2.5177e-04, -4.8637e-04, -2.0266e-05, -1.0729e-06, -8.9111e-03],
       device='cuda:0', dtype=torch.bfloat16)
tensor([-2.8419e-04, -4.2152e-04, -2.4676e-05, -1.1921e-06, -1.0071e-02],
       device='cuda:0', dtype=torch.bfloat16)
tensor([-2.3484e-05, -5.4598e-05, -8.3447e-07,  0.0000e+00, -3.6163e-03],
       device='cuda:0', dtype=torch.bfloat16)


In [None]:
print(torch.allclose(megatron_lprobs, sample_lprobs, atol=1e-1))
print(torch.allclose(veomni_lprobs, sample_lprobs, atol=1e-1))
print(torch.allclose(megatron_lprobs, veomni_lprobs, atol=1e-1))



In [None]:
# print l2 norm of the difference between megatron_lprobs and veomni_lprobs
print(torch.norm(megatron_lprobs - veomni_lprobs)/len(megatron_lprobs))
print(torch.norm(megatron_lprobs - sample_lprobs)/len(megatron_lprobs))
print(torch.norm(veomni_lprobs - sample_lprobs)/len(veomni_lprobs))
print(torch.norm(fsdp_lprobs - veomni_lprobs)/len(fsdp_lprobs))

In [None]:
# print max of the difference between megatron_lprobs and veomni_lprobs
print(torch.max(torch.abs(megatron_lprobs - veomni_lprobs)))
print(torch.max(torch.abs(megatron_lprobs - sample_lprobs)))
print(torch.max(torch.abs(veomni_lprobs - sample_lprobs)))
