# Neuron-Level Analysis of Transformer Attention Patterns

**Interactive Exploration Notebook**

This notebook walks through each section of the analysis interactively:
1. Load the model
2. Extract raw attention weights and visualise per-head attention heatmaps
3. Rank heads by attention entropy (most focused heads)
4. Analyse sentiment-correlated heads
5. Probe gender-bias differential attention
6. Identify instruction-following heads
7. Explore FFN neuron activations
8. Run the TransformerLens logit lens and head ablations
9. Summarise the per-head findings


In [None]:
import sys
sys.path.insert(0, '../src')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from attention_analyzer import (
    load_model, get_attentions, head_entropy,
    top_attended_tokens, sentiment_head_scores,
    bias_differential_attention, instruction_head_scores,
    get_ffn_activations, top_active_neurons,
)
from visualizer import (
    plot_attention_heatmap, plot_layer_heads, plot_entropy_heatmap,
    plot_sentiment_heatmap, plot_bias_heatmap, plot_instruction_heatmap,
    plot_neuron_activations, plot_head_summary,
)
from transformerlens_analysis import (
    load_hooked_model, logit_lens, full_ablation_matrix,
    tl_get_attention_patterns, tl_diagnostic,
)

print('Imports OK')

## 1. Load Model

In [None]:
# Checkpoint to analyse. Alternatives: 'gpt2-medium', 'gpt2-large', or
# 'meta-llama/Llama-2-7b-hf' (Llama-2 requires accepting the HuggingFace
# license and setting HF_TOKEN).
MODEL_NAME = 'gpt2'

tokenizer, model = load_model(MODEL_NAME)

# Read the layer/head counts from the model config; n_heads is reused by later cells.
model_config = model.config
n_layers, n_heads = model_config.num_hidden_layers, model_config.num_attention_heads
print(f'Layers: {n_layers}  |  Heads: {n_heads}')

## 2. Raw Attention Extraction & Heatmaps

In [None]:
# Example sentence whose attention patterns are inspected in this section.
TEXT = 'The quick brown fox jumps over the lazy dog near the river bank.'

# The helper's result unpacks into the token list and the attention weights.
tokens, attn = get_attentions(TEXT, tokenizer, model)
attn_shape = attn.shape  # expected layout: (layers, heads, seq, seq)

print('Tokens:', tokens)
print('Attention shape:', attn_shape)

In [None]:
# --- Single head heatmap ---
# Vary the (layer, head) pair below to compare how individual heads specialise.
LAYER, HEAD = 5, 2

fig = plot_attention_heatmap(attn, tokens, layer=LAYER, head=HEAD, save=False)
plt.show()

# Report the five tokens that receive the most attention from this head.
top = top_attended_tokens(attn, tokens, LAYER, HEAD, top_k=5)
headline = f'Top attended tokens (L{LAYER} H{HEAD}):'
print(headline, top)

In [None]:
# --- All heads in one layer ---
# Reuses LAYER from the single-head cell, so run that cell first.
fig = plot_layer_heads(attn, tokens, save=False, layer=LAYER)
plt.show()

## 3. Head Entropy — Which Heads Are Most Focused?

In [None]:
# Per-head entropy scores; indexed below as entropy[layer, head].
entropy = head_entropy(attn)

# Print 5 most focused heads.
# argsort is ascending, so the first entries are the lowest-entropy heads.
focus_order = np.argsort(entropy.flatten())
print('Most focused heads (lowest entropy):')
for flat_pos in focus_order[:5]:
    # Decode the flat position against the matrix's own shape instead of the
    # global n_heads, so the labels stay correct even if that variable is stale.
    layer_idx, head_idx = np.unravel_index(flat_pos, entropy.shape)
    print(f'  Layer {layer_idx:2d}  Head {head_idx:2d}  entropy={entropy[layer_idx, head_idx]:.4f}')

fig = plot_entropy_heatmap(entropy, save=False)
plt.show()

## 4. Sentiment-Correlated Heads

In [None]:
# Small hand-labelled corpus: (text, label) with 1 = positive, 0 = negative.
SENTIMENT_DATA = [
    ('This movie was absolutely wonderful and I loved every moment.',            1),
    ('The product quality is excellent and delivery was superb.',               1),
    ('I had an amazing experience at this restaurant, highly recommended.',     1),
    ('The customer service was fantastic and resolved my issue instantly.',     1),
    ('What a beautiful day — everything felt positive and joyful.',             1),
    ('This service was terrible and I am deeply disappointed.',                 0),
    ('The food was disgusting and the staff were rude and unhelpful.',          0),
    ('Worst experience ever. I will never return to this horrible place.',      0),
    ('The product broke immediately — awful quality and poor design.',           0),
    ('I had a dreadful time and regret spending money on this.',                0),
]

corpus = [text for text, _ in SENTIMENT_DATA]
labels = [label for _, label in SENTIMENT_DATA]

sent_corr = sentiment_head_scores(corpus, labels, tokenizer, model)

# Top 5 sentiment-correlated heads, ranked by |r|.
# argsort places NaN last, so a plain argsort(...)[::-1] would list heads with
# an undefined correlation FIRST. Map NaN to -1 (below any |r|) so those heads
# sink to the bottom of the ranking instead.
abs_corr = np.abs(sent_corr.flatten())
rankable = np.where(np.isnan(abs_corr), -1.0, abs_corr)
top_idx = np.argsort(rankable)[::-1][:5]
print('Top sentiment-correlated heads:')
for flat_pos in top_idx:
    # Decode against the matrix's own shape rather than the global n_heads.
    layer_idx, head_idx = np.unravel_index(flat_pos, sent_corr.shape)
    print(f'  Layer {layer_idx:2d}  Head {head_idx:2d}  r={sent_corr[layer_idx, head_idx]:+.4f}')

fig = plot_sentiment_heatmap(sent_corr, save=False)
plt.show()

## 5. Gender-Bias Differential Attention

In [None]:
# You can add custom probe pairs here.
# Each pair is the same sentence with only the gendered pronoun swapped (his/her).
CUSTOM_PAIRS = [
    ('The scientist presented his findings at the conference.',
     'The scientist presented her findings at the conference.'),
    ('The pilot landed his aircraft safely.',
     'The pilot landed her aircraft safely.'),
    ('The lawyer argued his case before the judge.',
     'The lawyer argued her case before the judge.'),
]

bias_diff = bias_differential_attention(tokenizer, model, pairs=CUSTOM_PAIRS)

# Largest differential first: ascending argsort, reversed, first five entries.
top_idx = np.argsort(bias_diff.flatten())[::-1][:5]
print('Top bias-sensitive heads:')
for flat_pos in top_idx:
    # Decode the flat position against the matrix's own shape instead of the
    # global n_heads, so the labels stay correct even if that variable is stale.
    layer_idx, head_idx = np.unravel_index(flat_pos, bias_diff.shape)
    print(f'  Layer {layer_idx:2d}  Head {head_idx:2d}  Δ={bias_diff[layer_idx, head_idx]:.5f}')

fig = plot_bias_heatmap(bias_diff, save=False)
plt.show()

## 6. Instruction-Following Heads

In [None]:
# Per-head instruction scores; only tokenizer and model are passed, so the
# helper presumably falls back to its own built-in prompts (verify in attention_analyzer).
inst_scores = instruction_head_scores(tokenizer=tokenizer, model=model)

# Highest score first: ascending argsort, reversed, first five entries.
top_idx = np.argsort(inst_scores.flatten())[::-1][:5]
print('Top instruction-following heads:')
for flat_pos in top_idx:
    # Decode the flat position against the matrix's own shape instead of the
    # global n_heads, so the labels stay correct even if that variable is stale.
    layer_idx, head_idx = np.unravel_index(flat_pos, inst_scores.shape)
    print(f'  Layer {layer_idx:2d}  Head {head_idx:2d}  surplus={inst_scores[layer_idx, head_idx]:+.5f}')

fig = plot_instruction_heatmap(inst_scores, save=False)
plt.show()

## 7. FFN Neuron Activations

In [None]:
# Sentence whose feed-forward activations are captured.
NEURON_TEXT = 'Artificial intelligence is transforming the world of science.'

activations = get_ffn_activations(NEURON_TEXT, tokenizer, model)
captured_layers = sorted(activations.keys())
print(f'Activation layers captured: {captured_layers}')

# Layer to drill into (layer 6 here); change to inspect another captured layer.
INSPECT_LAYER = 6
top_neurons = top_active_neurons(activations, layer=INSPECT_LAYER, top_k=10)

print(f'\nTop 10 neurons in layer {INSPECT_LAYER}:')
for neuron_idx, mean_abs_act in top_neurons:
    print(f'  Neuron {neuron_idx:4d}  mean|act|={mean_abs_act:.5f}')

fig = plot_neuron_activations(
    top_neurons,
    layer=INSPECT_LAYER,
    title_suffix='(AI text)',
    save=False,
)
plt.show()

## 8. TransformerLens — Logit Lens & Ablations

In [None]:
# Check if TransformerLens is available.
# Probe the checkpoint selected in MODEL_NAME rather than a hard-coded 'gpt2',
# so the diagnostic describes the same model as the rest of the notebook.
diag = tl_diagnostic(MODEL_NAME)
print('TL Diagnostic:', diag)

In [None]:
# If TL is available, run the logit lens.
# Load the checkpoint selected in MODEL_NAME rather than a hard-coded 'gpt2',
# so the TransformerLens results describe the same model as the earlier sections.
tl_model = load_hooked_model(MODEL_NAME)

if tl_model is not None:
    preds = logit_lens(tl_model, 'The Eiffel Tower is located in', top_k=5)
    print('Logit Lens predictions by layer:')
    # Distinct loop names avoid clobbering the `top` result from the single-head cell.
    for layer_idx, layer_preds in preds.items():
        rounded = [(tok, round(prob, 3)) for tok, prob in layer_preds]
        print(f'  Layer {layer_idx:2d}: {rounded}')
else:
    # load_hooked_model returned None: fall back to calling logit_lens without a model.
    print('TL not available — mock preds shown')
    preds = logit_lens(None, '', top_k=5)
    print(preds)

In [None]:
# Head ablation — WARNING: this is slow (L x H forward passes)
# Restrict to 3 layers for a quick demo.
# seaborn (sns) is imported in the notebook's import cell.

ABLATION_TEXT = 'The capital of France is'
# First, middle, last layer of a 12-layer model (e.g. GPT-2 small);
# adjust these indices if a deeper or shallower checkpoint is loaded.
ABLATION_LAYERS = [0, 5, 11]

delta_matrix = full_ablation_matrix(
    model=tl_model,
    text=ABLATION_TEXT,
    layers=ABLATION_LAYERS,
)

# Rows are selected by layer number, so delta_matrix is presumably indexed by
# absolute layer (verify in transformerlens_analysis); keep only the ablated rows.
ablated_rows = delta_matrix[ABLATION_LAYERS]

fig, ax = plt.subplots(figsize=(12, 3))
sns.heatmap(
    ablated_rows,
    ax=ax, cmap='Reds', annot=True, fmt='.3f',
    xticklabels=[f'H{h}' for h in range(delta_matrix.shape[1])],
    yticklabels=[f'L{l}' for l in ABLATION_LAYERS],
    cbar_kws={'label': 'Δ Loss (positive = head matters)'}
)
ax.set_title('Head Ablation — ΔLoss', fontweight='bold')
plt.tight_layout()
plt.show()

## 9. Summary Overview

In [None]:
# Gather the four per-head score matrices computed in the earlier sections
# and hand them to the summary plot in one call.
summary_inputs = dict(
    entropy=entropy,
    sentiment=sent_corr,
    bias=bias_diff,
    instruction=inst_scores,
)
fig = plot_head_summary(**summary_inputs, top_n=5, save=False)
plt.show()
print('Done!')