In [35]:
from load_qwen import *

In [36]:
model, tokenizer = load_qwen_model()

Loading tokenizer for Qwen/Qwen2.5-1.5B-Instruct...
Loading model Qwen/Qwen2.5-1.5B-Instruct...
This is a compact 1.5B parameter model, perfect for MacBook Pro!
Model loaded successfully!
Model device: mps
Model dtype: torch.float16


In [15]:
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotar

In [16]:
# Storage for MLP inputs and outputs
mlp_activations = {
    'input': None,
    'output': None
}

def mlp_hook(module, input, output):
    """
    Hook function to capture MLP layer inputs and outputs
    
    Args:
        module: The MLP layer
        input: Tuple of input tensors to the layer
        output: Output tensor from the layer
    """
    # Input is a tuple, we take the first element
    mlp_activations['input'] = input[0].detach().cpu()
    mlp_activations['output'] = output.detach().cpu()
    print(f"Captured MLP activations - Input shape: {input[0].shape}, Output shape: {output.shape}")

# Register the hook on layer 19's MLP
layer_19_mlp = model.model.layers[19].mlp
hook_handle = layer_19_mlp.register_forward_hook(mlp_hook)

print(f"Hook registered on layer 19 MLP: {layer_19_mlp}")

Hook registered on layer 19 MLP: Qwen2MLP(
  (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
  (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
  (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
  (act_fn): SiLUActivation()
)


In [21]:
# Run a forward pass to trigger the hook
test_prompt = "What is the capital of France?"
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)

print(f"\nForward pass complete!")
print(f"MLP input shape: {mlp_activations['input'].shape}")
print(f"MLP output shape: {mlp_activations['output'].shape}")

Captured MLP activations - Input shape: torch.Size([1, 7, 1536]), Output shape: torch.Size([1, 7, 1536])

Forward pass complete!
MLP input shape: torch.Size([1, 7, 1536])
MLP output shape: torch.Size([1, 7, 1536])


In [22]:
# Now you can analyze the saved activations
print("Analyzing MLP activations...")
print(f"Input shape: {mlp_activations['input'].shape}")  # Expected: [batch_size, seq_len, 1536]
print(f"Output shape: {mlp_activations['output'].shape}")  # Expected: [batch_size, seq_len, 1536]

# Example analysis: compute statistics
print(f"\nInput statistics:")
print(f"  Mean: {mlp_activations['input'].mean():.4f}")
print(f"  Std: {mlp_activations['input'].std():.4f}")
print(f"  Min: {mlp_activations['input'].min():.4f}")
print(f"  Max: {mlp_activations['input'].max():.4f}")

print(f"\nOutput statistics:")
print(f"  Mean: {mlp_activations['output'].mean():.4f}")
print(f"  Std: {mlp_activations['output'].std():.4f}")
print(f"  Min: {mlp_activations['output'].min():.4f}")
print(f"  Max: {mlp_activations['output'].max():.4f}")

# Don't forget to remove the hook when done
# hook_handle.remove()

Analyzing MLP activations...
Input shape: torch.Size([1, 7, 1536])
Output shape: torch.Size([1, 7, 1536])

Input statistics:
  Mean: -0.0097
  Std: 0.6733
  Min: -16.2031
  Max: 19.0781

Output statistics:
  Mean: -0.0106
  Std: 0.8530
  Min: -18.4688
  Max: 15.3438


In [23]:
# Decode the outputs to see the text predictions
predicted_token_ids = outputs.logits.argmax(dim=-1)  # Get most likely token at each position
decoded_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

print(f"Input prompt: {test_prompt}")
print(f"Decoded output: {decoded_text}")

Input prompt: What is the capital of France?
Decoded output:  is the value of the?
 The


In [25]:
hook_handle.remove()

In [34]:
# Use generate() for proper text generation
test_prompt = "Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?\nA. 30\nB. 0\nC. 20\nD. 10\nE. 11\nF. 5\n"
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"Input: {test_prompt}")
print(f"Generated: {generated_text}")

Input: Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?
A. 30
B. 0
C. 20
D. 10
E. 11
F. 5

Generated: Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?
A. 30
B. 0
C. 20
D. 10
E. 11
F. 5
To solve this problem, we need to determine the total number of ice cubes that were added over the three minutes and then fin