# nnsight walkthrough
https://nnsight.net/notebooks/tutorials/walkthrough/

In [1]:
from collections import OrderedDict
import torch

import nnsight
from nnsight import NNsight

from dotenv import load_dotenv
import os

# Load the .env file into the process environment.
# Later cells read HUGGINGFACE_TOKEN and NDIF_TOKEN via os.getenv, presumably from this file.
load_dotenv()

True

In [2]:
!pip show nnsight

Name: nnsight
Version: 0.4.1
Summary: Package for interpreting and manipulating the internals of deep learning models.
Home-page: https://github.com/ndif-team/nnsight
Author: 
Author-email: Jaden Fiotto-Kaufman <jadenfk@outlook.com>
License: MIT License
Location: /home/ailab/python_venv/feature_circuits/lib/python3.11/site-packages
Requires: accelerate, diffusers, einops, ipython, msgspec, protobuf, pydantic, python-socketio, sentencepiece, tokenizers, toml, torch, torchvision, transformers
Required-by: 


## Tracing Context
Everything within the tracing context operates on the intervention graph.
![intervention_graph](../images/nnsight_intervention_graph.png)

In [3]:
# Sizes of the toy network: input features, hidden width, output features.
input_size, hidden_dims, output_size = 5, 10, 2

# Register each layer under an explicit name so it can be reached as an
# attribute (net.layer1, net.layer2). Insertion order is the forward order.
named_layers = OrderedDict()
named_layers["layer1"] = torch.nn.Linear(input_size, hidden_dims)
named_layers["layer2"] = torch.nn.Linear(hidden_dims, output_size)

# requires_grad_(False) switches off gradient tracking for every parameter
# and returns the module itself.
net = torch.nn.Sequential(named_layers).requires_grad_(False)

In [4]:
# Wrap the plain torch module in NNsight; the wrapper is the object the
# following cells call .trace() on.
tiny_model = NNsight(net)
# Bare last expression: the notebook displays the wrapped model's repr.
tiny_model

Sequential(
  (layer1): Linear(in_features=5, out_features=10, bias=True)
  (layer2): Linear(in_features=10, out_features=2, bias=True)
)

In [5]:
# Random input of shape (1, input_size).
# NOTE(review): `input` shadows the Python builtin and is reassigned by the
# later GPT-2 trace cell, so re-running the tiny_model cells after that point
# would feed them a different tensor. Consider a dedicated name.
input = torch.rand((1, input_size))

# Open a tracing context on the input without registering any intervention.
with tiny_model.trace(input) as tracer:
    pass

In [6]:
# Display the random tensor that the tiny_model cells below are traced with.
input

tensor([[0.9401, 0.0696, 0.9559, 0.5525, 0.3543]])

## Getting

In [7]:
# Capture the output of the model as a whole.
with tiny_model.trace(input) as tracer:

    # .save() keeps the value usable after the tracing context exits.
    output = tiny_model.output.save()

print(output)

tensor([[-0.1516,  0.0935]])


In [8]:
# Access the output of the first layer only.
with tiny_model.trace(input) as tracer:

    # Sub-modules are reached by the names given when the network was built.
    l1_output = tiny_model.layer1.output.save()

print(l1_output)


tensor([[ 0.1288, -0.5436, -0.1997, -0.0901,  0.1611,  0.5188, -0.1081,  0.9020,
         -0.0742,  0.1666]])


In [9]:
# Capture the input of layer2. The printed values match layer1's output from
# the previous cell, since layer2 directly follows layer1 in the Sequential.
with tiny_model.trace(input):

    l2_input = tiny_model.layer2.input.save()

print(l2_input)

tensor([[ 0.1288, -0.5436, -0.1997, -0.0901,  0.1611,  0.5188, -0.1081,  0.9020,
         -0.0742,  0.1666]])


## Functions, Methods, and Operations

In [10]:
# Apply a regular torch function (argmax) to a value inside the tracing context.
with tiny_model.trace(input):

    # Note we don't need to call .save() on the output,
    # as we're only using its value within the tracing context.
    l1_output = tiny_model.layer1.output

    # We do need to save the argmax tensor however,
    # as we're using it outside the tracing context.
    # dim=1 takes the argmax over the hidden dimension, one index per input row.
    l1_amax = torch.argmax(l1_output, dim=1).save()

# Only one row was fed in, so element 0 is the index of the largest activation.
print(l1_amax[0])

tensor(7)


In [11]:
"""Run the model with the given input. 
When the output of tiny_model.layer1 is computed, take its sum. 
Then do the same for tiny_model.layer2. 
Now that both of those are computed, add them and make sure 
not to delete this value as I wish to use it outside of the 
tracing context."""
# Combine values from two different modules in a single expression.
with tiny_model.trace(input):

    # Only the final sum is saved; the two intermediate sums are used solely
    # inside the tracing context.
    value = (tiny_model.layer1.output.sum() + tiny_model.layer2.output.sum()).save()

print(value)

tensor(0.8037)


## Custom Functions

In [12]:
# THE CODE BELOW GIVES AN ERROR, PROBABLY BECAUSE OF THE NNSIGHT VERSION (v 0.2.21).
# NOTE(review): the `pip show` output near the top of this notebook reports
# nnsight 0.4.1, not 0.2.21. Confirm which version actually fails.
# The code is kept inside a string literal so it is not executed; the cell
# only echoes the string.
""" # Take a tensor and return the sum of its elements
def tensor_sum(tensor):
    flat = tensor.flatten()
    total = 0
    for element in flat:
        total += element.item()

    return torch.tensor(total)

with tiny_model.trace(input) as tracer:

    # Specify the function name and its arguments (in a comma-separated form) to add to the intervention graph
    custom_sum = nnsight.apply(tensor_sum, tiny_model.layer1.output).save()
    sum = tiny_model.layer1.output.sum()
    sum.save()


print(custom_sum, sum) """

' # Take a tensor and return the sum of its elements\ndef tensor_sum(tensor):\n    flat = tensor.flatten()\n    total = 0\n    for element in flat:\n        total += element.item()\n\n    return torch.tensor(total)\n\nwith tiny_model.trace(input) as tracer:\n\n    # Specify the function name and its arguments (in a comma-separated form) to add to the intervention graph\n    custom_sum = nnsight.apply(tensor_sum, tiny_model.layer1.output).save()\n    sum = tiny_model.layer1.output.sum()\n    sum.save()\n\n\nprint(custom_sum, sum) '

## Setting

In [13]:
# Set index 0 of the hidden dimension of the first layer's output to 0.
# NNsight lets us do this with a plain '=' assignment on the output.

with tiny_model.trace(input):

    # Save the output before the edit to compare.
    # Notice we apply .clone() before saving as the setting operation is in-place.
    l1_output_before = tiny_model.layer1.output.clone().save()

    # Access the 0th index of the hidden state dimension and set it to 0.
    tiny_model.layer1.output[:, 0] = 0

    # Save the output after to see our edit.
    l1_output_after = tiny_model.layer1.output.save()

print("Before:", l1_output_before)
print("After:", l1_output_after)

Before: tensor([[ 0.1288, -0.5436, -0.1997, -0.0901,  0.1611,  0.5188, -0.1081,  0.9020,
         -0.0742,  0.1666]])
After: tensor([[ 0.0000, -0.5436, -0.1997, -0.0901,  0.1611,  0.5188, -0.1081,  0.9020,
         -0.0742,  0.1666]])


In [14]:
# Same edit as the previous cell, but on the last hidden unit instead of the first.
with tiny_model.trace(input):

    # Save the output before the edit to compare.
    # Notice we apply .clone() before saving as the setting operation is in-place.
    l1_output_before = tiny_model.layer1.output.clone().save()

    # Access the last index of the hidden state dimension and set it to 0.
    # hidden_dims - 1 is the largest valid index along that dimension.
    tiny_model.layer1.output[:, hidden_dims -1] = 0

    # Save the output after to see our edit.
    l1_output_after = tiny_model.layer1.output.save()

print("Before:", l1_output_before)
print("After:", l1_output_after)

Before: tensor([[ 0.1288, -0.5436, -0.1997, -0.0901,  0.1611,  0.5188, -0.1081,  0.9020,
         -0.0742,  0.1666]])
After: tensor([[ 0.1288, -0.5436, -0.1997, -0.0901,  0.1611,  0.5188, -0.1081,  0.9020,
         -0.0742,  0.0000]])


# Scan and Validate
We can enable these features by setting the `scan=True` and `validate=True` flags in the trace method.
+ “Scanning” runs “fake” inputs through the model to collect information like shapes and types (i.e., scanning will populate all called .inputs and .outputs).
+ “Validating” attempts to execute the intervention proxies with “fake” inputs to check if they work (i.e., executes all interventions in your code with fake tensors).  

 **The operations are never executed using tensors with real values so it doesn’t incur any memory costs**.  
“Validating” is dependent on “Scanning” to work correctly, so we need to run the scan of the model at least once to debug with validate. Let’s try it out on our example above.

In [15]:
# Turn on scan and validate.
# The validation pass is never executed using tensors with real values, so
# it doesn't incur any memory costs.
with tiny_model.trace(input, scan=True, validate=True):

    l1_output_before = tiny_model.layer1.output.clone().save()

    # NOTE(review): as written this line does not fail. hidden_dims - 1 is in
    # bounds and the recorded output shows the edit succeeding. An
    # out-of-range index such as `hidden_dims` is presumably the case this
    # cell was meant to demonstrate being caught by validate. Confirm.
    tiny_model.layer1.output[:, hidden_dims -1] = 0

    l1_output_after = tiny_model.layer1.output.save()

print("Before:", l1_output_before)
print("After:", l1_output_after)

Before: tensor([[ 0.1288, -0.5436, -0.1997, -0.0901,  0.1611,  0.5188, -0.1081,  0.9020,
         -0.0742,  0.1666]])
After: tensor([[ 0.1288, -0.5436, -0.1997, -0.0901,  0.1611,  0.5188, -0.1081,  0.9020,
         -0.0742,  0.0000]])


In [16]:
# THE CODE BELOW GIVES AN ERROR, PROBABLY BECAUSE OF THE NNSIGHT VERSION (v 0.2.21).
# NOTE(review): the environment shown in this notebook reports nnsight 0.4.1.
# Confirm which version fails. The code is kept as a string so it is not run.
""" with tiny_model.scan(input):

    dim = tiny_model.layer1.output.shape[-1]

print(dim) """

' with tiny_model.scan(input):\n\n    dim = tiny_model.layer1.output.shape[-1]\n\nprint(dim) '

# Gradients
NNsight also lets us apply backpropagation and access gradients with respect to a loss. Like .input and .output on modules, nnsight exposes .grad on Proxies themselves (assuming they are proxies of tensors):

In [17]:
# Read gradients of intermediate outputs with respect to a scalar loss.
with tiny_model.trace(input):

    # We need to explicitly have the tensor require grad
    # as the model we defined earlier turned off requiring grad.
    tiny_model.layer1.output.requires_grad = True

    # We call .grad on a tensor Proxy to communicate we want to store its gradient.
    # We need to call .save() since .grad is its own Proxy.
    layer1_output_grad = tiny_model.layer1.output.grad.save()
    layer2_output_grad = tiny_model.layer2.output.grad.save()

    # Need a loss to propagate through the later modules in order to have a grad.
    # The loss is the plain sum of the model output, which is why the recorded
    # layer2 gradient is all ones.
    loss = tiny_model.output.sum()
    loss.backward()

print("Layer 1 output gradient:", layer1_output_grad)
print("Layer 2 output gradient:", layer2_output_grad)

Layer 1 output gradient: tensor([[-0.5537, -0.3643, -0.0776,  0.2580,  0.2223, -0.2875,  0.3514,  0.3844,
         -0.1547, -0.3447]])
Layer 2 output gradient: tensor([[1., 1.]])


In [18]:
# We can apply operations to and edit the gradients.
# Let's zero the grad of layer1 and double the grad of layer2.

# THE CODE BELOW GIVES AN ERROR, PROBABLY BECAUSE OF THE NNSIGHT VERSION (v 0.2.21).
# NOTE(review): the environment shown in this notebook reports nnsight 0.4.1.
# Confirm which version fails. The code is kept as a string so it is not run.
""" with tiny_model.trace(input):

    # We need to explicitly have the tensor require grad
    # as the model we defined earlier turned off requiring grad.
    tiny_model.layer1.output.requires_grad = True

    tiny_model.layer1.output.grad[:] = 0
    tiny_model.layer2.output.grad = tiny_model.layer2.output.grad * 2

    layer1_output_grad = tiny_model.layer1.output.grad.save()
    layer2_output_grad = tiny_model.layer2.output.grad.save()

    # Need a loss to propagate through the later modules in order to have a grad.
    loss = tiny_model.output.sum()
    loss.backward()

print("Layer 1 output gradient:", layer1_output_grad)
print("Layer 2 output gradient:", layer2_output_grad) """


' with tiny_model.trace(input):\n\n    # We need to explicitly have the tensor require grad\n    # as the model we defined earlier turned off requiring grad.\n    tiny_model.layer1.output.requires_grad = True\n\n    tiny_model.layer1.output.grad[:] = 0\n    tiny_model.layer2.output.grad = tiny_model.layer2.output.grad * 2\n\n    layer1_output_grad = tiny_model.layer1.output.grad.save()\n    layer2_output_grad = tiny_model.layer2.output.grad.save()\n\n    # Need a loss to propagate through the later modules in order to have a grad.\n    loss = tiny_model.output.sum()\n    loss.backward()\n\nprint("Layer 1 output gradient:", layer1_output_grad)\nprint("Layer 2 output gradient:", layer2_output_grad) '

# Early Stopping
If we are only interested in a model’s intermediate computations, we can halt a forward pass run at any module level, reducing runtime and conserving compute resources.  
One example where this could be particularly useful is when working with SAEs: we can train an SAE on one layer and then stop the execution.

In [19]:
# THE CODE BELOW GIVES AN ERROR, PROBABLY BECAUSE OF THE NNSIGHT VERSION (v 0.2.21).
# NOTE(review): the environment shown in this notebook reports nnsight 0.4.1.
# Confirm which version fails. The code is kept as a string so it is not run.
""" with tiny_model.trace(input):
   l1_out = tiny_model.layer1.output.save()
   tiny_model.layer1.output.stop()

# get the output of the first layer and stop tracing
print("L1 - Output: ", l1_out) """

' with tiny_model.trace(input):\n   l1_out = tiny_model.layer1.output.save()\n   tiny_model.layer1.output.stop()\n\n# get the output of the first layer and stop tracing\nprint("L1 - Output: ", l1_out) '

# Conditional Interventions

In [20]:
# THE CODE BELOW GIVES AN ERROR, PROBABLY BECAUSE OF THE NNSIGHT VERSION (v 0.2.21).
# NOTE(review): the environment shown in this notebook reports nnsight 0.4.1.
# Confirm which version fails. The code is kept as a string so it is not run.
""" with tiny_model.trace(input) as tracer:

  rand_int = torch.randint(low=-10, high=10, size=(1,))

  with tracer.cond(rand_int % 2 == 0):
    tracer.log("Random Integer ", rand_int, " is Even")

  with tracer.cond(rand_int % 2 == 1):
    tracer.log("Random Integer ", rand_int, " is Odd") """

' with tiny_model.trace(input) as tracer:\n\n  rand_int = torch.randint(low=-10, high=10, size=(1,))\n\n  with tracer.cond(rand_int % 2 == 0):\n    tracer.log("Random Integer ", rand_int, " is Even")\n\n  with tracer.cond(rand_int % 2 == 1):\n    tracer.log("Random Integer ", rand_int, " is Odd") '

# Iterative Interventions

In [21]:
# THE CODE BELOW GIVES AN ERROR, PROBABLY BECAUSE OF THE NNSIGHT VERSION (v 0.2.21).
# NOTE(review): the environment shown in this notebook reports nnsight 0.4.1.
# Confirm which version fails. The code is kept as a string so it is not run.
""" with tiny_model.session() as session:

  li = nnsight.list() # an NNsight built-in list object
  [li.append([num]) for num in range(0, 3)] # adding [0], [1], [2] to the list
  li2 = nnsight.list().save()

  # You can create nested Iterator contexts
  with session.iter(li) as item:
    with session.iter(item) as item_2:
      li2.append(item_2)

print("\nList: ", li2) """

' with tiny_model.session() as session:\n\n  li = nnsight.list() # an NNsight built-in list object\n  [li.append([num]) for num in range(0, 3)] # adding [0], [1], [2] to the list\n  li2 = nnsight.list().save()\n\n  # You can create nested Iterator contexts\n  with session.iter(li) as item:\n    with session.iter(item) as item_2:\n      li2.append(item_2)\n\nprint("\nList: ", li2) '

# 2. Bigger
If you’d like to load a Language Model from HuggingFace with its tokenizer, the LanguageModel subclass greatly simplifies this process.  

LanguageModel is a subclass of NNsight. While we could define and create a model to pass in directly, LanguageModel includes special support for Huggingface language models, including automatically loading models from a Huggingface ID, and loading the model together with the appropriate tokenizer. Here is how we can use LanguageModel to load GPT-2:

**dispatch**  
When we initialize LanguageModel, we aren’t yet loading the parameters of the model into memory. We are actually loading a ‘meta’ version of the model which doesn’t take up any memory, but still allows us to view and trace actions on it. After exiting the first tracing context, the model is then fully loaded into memory. To load into memory on initialization, you can pass dispatch=True into LanguageModel like LanguageModel('openai-community/gpt2', device_map="auto", dispatch=True)

In [22]:
from nnsight import LanguageModel

# Model initialization: load GPT-2 by its HuggingFace id.
# dispatch=True is not passed, so (per the note above) the parameters are
# only loaded into memory after the first tracing context.
llm = LanguageModel("openai-community/gpt2", device_map="auto")

# Printing the wrapper shows the module tree used to address sub-modules below.
print(llm)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  (generator): Generator(
    (streamer): Streamer()
  )
)


Unlike NNsight, LanguageModel does define logic to pre-process inputs upon entering the tracing context. This makes interacting with the model simpler (i.e., you can send prompts to the model without having to directly access the tokenizer). In the following example, we ablate the value coming from the last layer’s MLP module and decode the logits to see what token the model predicts without influence from that particular module:

In [23]:
from transformers import GPT2Tokenizer

# Prompt whose length, measured in GPT-2 tokens, we want to know.
text = "The Eiffel Tower is in the city of"

# Stand-alone GPT-2 tokenizer, used in this cell only to encode the prompt.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# encode() yields the token ids; the number of ids is the token count.
tokens = tokenizer.encode(text)
num_tokens = len(tokens)

print(f"Number of GPT tokens: {num_tokens}")

Number of GPT tokens: 10


In [24]:
# Ablate the last block's MLP output and read the resulting next-token prediction.
with llm.trace("The Eiffel Tower is in the city of"):

    # NOTE(review): this rebinds `input`, which earlier held the random tensor
    # fed to tiny_model (and shadows the builtin). Re-running earlier cells
    # after this one will not behave the same. Consider a dedicated name.
    input = llm.transformer.wpe.output.save()
    # Access the last layer using h[-1] as it's a ModuleList.
    # NOTE(review): the original comment says index 0 of .output is "where the
    # hidden states are"; if mlp.output is a single tensor, [0] presumably
    # selects the first batch element instead. Confirm.
    llm.transformer.h[-1].mlp.output[0][:] = 0
    # Outputs of the first block's layer norm and attention, saved for the
    # shape inspection in the following cells.
    ln_0 = llm.transformer.h[0].ln_1.output.save()
    attn_0 = llm.transformer.h[0].attn.output.save()

    # Logits come out of model.lm_head and we apply argmax to get the predicted token ids.
    token_ids = llm.lm_head.output.argmax(dim=-1).save()

print("\nToken IDs:", token_ids)

# Apply the tokenizer to decode the ids into words after the tracing context.
# [0][-1] picks the prediction at the last position of the only sequence.
print("Prediction:", llm.tokenizer.decode(token_ids[0][-1]))


Token IDs: tensor([[ 262,   12,  417, 8765,   11,  257,  262, 3504,  338, 3576]],
       device='cuda:0')
Prediction:  London


In [25]:
# Shapes of the saved position-embedding output (`input`) and first layer-norm output.
input.shape, ln_0.shape

(torch.Size([1, 10, 768]), torch.Size([1, 10, 768]))

In [26]:
# attn_0 is indexable: element 0 is a tensor and element 1 holds two tensors
# (see the shapes displayed below).
attn_0[0].shape, len(attn_0[1]), attn_0[1][0].shape, attn_0[1][1].shape

(torch.Size([1, 10, 768]),
 2,
 torch.Size([1, 12, 10, 64]),
 torch.Size([1, 12, 10, 64]))

In [27]:
# Shape of the first block's layer-norm output (already shown two cells above).
ln_0.shape

torch.Size([1, 10, 768])

In [28]:
# Fetch both credentials from the process environment in one pass.
# A name is bound to None when its variable is not defined.
hf_token, ndif_token = (
    os.environ.get(var_name) for var_name in ('HUGGINGFACE_TOKEN', 'NDIF_TOKEN')
)

In [29]:
from nnsight import CONFIG

# Hand the NDIF API key to nnsight's configuration (presumably used by the
# remote=True traces below).
# Fail early with a readable message rather than registering a missing key:
# ndif_token is None when NDIF_TOKEN is absent from the environment.
if not ndif_token:
    raise RuntimeError(
        "NDIF_TOKEN is not set; add it to your .env file before using remote execution."
    )
CONFIG.set_default_api_key(ndif_token)

In [30]:
import os

# Llama 3.1 8b is a gated model, so you need to apply for access on HuggingFace and include your token.
# os.environ only accepts str values, so a missing token (hf_token is None)
# would otherwise surface as an opaque TypeError. Raise a clear error instead.
if not hf_token:
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set; add it to your .env file to access gated models."
    )
os.environ['HF_TOKEN'] = hf_token

# Change nnsight version
```bash
# upgrade to the latest release
pip install --upgrade nnsight
# or pin a specific version
pip install nnsight==0.2.21
```

In [32]:
# Version of the nnsight module actually imported by this kernel.
print(nnsight.__version__)

0.4.1


In [33]:
from nnsight import LanguageModel

# Wrap Llama 3.1 8B; the trace below is requested with remote=True rather
# than being run on local weights.
model = LanguageModel('meta-llama/Meta-Llama-3.1-8B')

# The prompt carries no trailing space, matching the prompt used by every
# other cell in this notebook (a trailing space would be tokenized too).
with model.trace('The Eiffel Tower is in the city of', remote=True):
    output = model.output.save() # save model output

# Report the predicted token rather than the raw output object: take the
# highest-scoring id at the last position of the only sequence and decode it.
predicted_id = output["logits"].argmax(dim=-1)[0][-1]
print('The model predicts', model.tokenizer.decode(predicted_id))

ValidationError: 1 validation error for ResponseModel
received
  Input should be a valid datetime or date, input is too short [type=datetime_from_date_parsing, input_value='None', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/datetime_from_date_parsing

In [34]:
from nnsight import LanguageModel

# Remote trace that saves the last decoder layer's output and the model output.
# NOTE(review): the recorded run of this cell ended in a ValidationError, and
# the next cell repeats the same code. One of the two is presumably redundant.
llama = LanguageModel("meta-llama/Meta-Llama-3.1-8B")
# `runner` is bound but not used inside the context.
with llama.trace("The Eiffel Tower is in the city of", remote=True) as runner:
    hidden_states = llama.model.layers[-1].output.save()
    output = llama.output.save()

print(hidden_states)
print(output["logits"])

ValidationError: 1 validation error for ResponseModel
received
  Input should be a valid datetime or date, input is too short [type=datetime_from_date_parsing, input_value='None', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/datetime_from_date_parsing

In [35]:
from nnsight import LanguageModel

# We'll never actually load the parameters locally, so no need to specify a device_map.
llama = LanguageModel("meta-llama/Meta-Llama-3.1-8B")

# All we need to specify using NDIF vs executing locally is remote=True.
# `runner` is bound but not used inside the context.
with llama.trace("The Eiffel Tower is in the city of", remote=True) as runner:

    # Output of the last decoder layer.
    hidden_states = llama.model.layers[-1].output.save()

    # Output of the model as a whole; its "logits" entry is printed below.
    output = llama.output.save()

print(hidden_states)

print(output["logits"])

ValidationError: 1 validation error for ResponseModel
received
  Input should be a valid datetime or date, input is too short [type=datetime_from_date_parsing, input_value='None', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/datetime_from_date_parsing

In [36]:
# Activation patching across two traces that share one remote session.
# NOTE(review): the recorded run of this cell ended in a ValidationError.
with llama.session(remote=True) as session:

  with llama.trace("The Eiffel Tower is in the city of") as t1:
    # Capture the hidden state at the last token from layers[31]
    # (zero-based index, i.e. the 32nd decoder layer).
    # It is only consumed by the second trace in this session, hence no .save().
    hs_31 = llama.model.layers[31].output[0][:, -1, :] # no .save()
    t1_tokens_out = llama.lm_head.output.argmax(dim=-1).save()

  with llama.trace("Buckingham Palace is in the city of") as t2:
    # Overwrite the last-token hidden state of layers[1] with the value
    # captured from the first prompt.
    llama.model.layers[1].output[0][:, -1, :] = hs_31[:]
    t2_tokens_out = llama.lm_head.output.argmax(dim=-1).save()

# [0][-1] selects the prediction at the last position of the only sequence.
print("\nT1 - Original Prediction: ", llama.tokenizer.decode(t1_tokens_out[0][-1]))
print("T2 - Modified Prediction: ", llama.tokenizer.decode(t2_tokens_out[0][-1]))

ValidationError: 1 validation error for ResponseModel
received
  Input should be a valid datetime or date, input is too short [type=datetime_from_date_parsing, input_value='None', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/datetime_from_date_parsing

In [None]:
# NOTE(review): this cell repeats the previous cell's code. Its recorded run
# ended in "ConnectionError: Internal Server Error". Presumably a retry.
with llama.session(remote=True) as session:

  with llama.trace("The Eiffel Tower is in the city of") as t1:
    # Capture the hidden state at the last token from layers[31]
    # (zero-based index, i.e. the 32nd decoder layer).
    # It is only consumed by the second trace in this session, hence no .save().
    hs_31 = llama.model.layers[31].output[0][:, -1, :] # no .save()
    t1_tokens_out = llama.lm_head.output.argmax(dim=-1).save()

  with llama.trace("Buckingham Palace is in the city of") as t2:
    # Overwrite the last-token hidden state of layers[1] with the value
    # captured from the first prompt.
    llama.model.layers[1].output[0][:, -1, :] = hs_31[:]
    t2_tokens_out = llama.lm_head.output.argmax(dim=-1).save()

# [0][-1] selects the prediction at the last position of the only sequence.
print("\nT1 - Original Prediction: ", llama.tokenizer.decode(t1_tokens_out[0][-1]))
print("T2 - Modified Prediction: ", llama.tokenizer.decode(t2_tokens_out[0][-1]))

ConnectionError: Internal Server Error