In [1]:
import torch
import matplotlib.pyplot as plt
import argparse
from transformers import AutoModelForCausalLM

def load_model(checkpoint_path):
    """
    Load a fine-tuned causal language model in half precision, ready for inference.

    Args:
        checkpoint_path: Local checkpoint directory or Hugging Face hub ID, passed
            straight to ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The loaded model with float16 weights, switched to evaluation mode.

    NOTE(review): the load output recorded in this notebook lists
    ``*.weight_orig`` / ``*.weight_mask`` checkpoint keys as unused. Checkpoints
    saved with pruning re-parametrization may therefore not populate the plain
    ``weight`` tensors as intended -- confirm before trusting downstream numbers.
    """
    llama = AutoModelForCausalLM.from_pretrained(
        checkpoint_path,
        torch_dtype=torch.float16,
    )
    # eval() hands back the module itself, so it can be returned directly.
    # The model stays on whatever device the loader put it on; call
    # .to("cpu") on the result if a specific device is required.
    return llama.eval()

In [2]:

def get_neuron_weight_norms(model):
    """
    Computes neuron weight norms for each MLP layer in the model.

    For every hidden neuron of every layer the norm is computed as:

        norm = sqrt( sum(gate_proj.weight^2, dim=1)
                   + sum(up_proj.weight^2, dim=1)
                   + sum(down_proj.weight^2, dim=0) )

    Weights are upcast to float32 before squaring. Accumulating squares in
    float16 can overflow to ``inf`` (max ~65504) or lose small contributions,
    and a float16 result cannot be fed to ``torch.quantile`` afterwards.

    Args:
        model: Object exposing ``model.model.layers``. Each layer must have an
            ``mlp`` attribute with ``gate_proj``, ``up_proj`` and ``down_proj``
            sub-modules, each carrying a 2-D ``weight`` tensor. ``gate_proj`` and
            ``up_proj`` are indexed [neuron, input]; ``down_proj`` is indexed
            [output, neuron].

    Returns:
        numpy.ndarray: 1-D float32 array holding the norms of all layers,
        concatenated in layer order.
    """
    neuron_norms_all = []

    # Pure inspection: no autograd graph is needed for these reductions.
    with torch.no_grad():
        # For Llama models the transformer layers live in model.model.layers.
        # If needed, adjust this for your particular architecture.
        for layer in model.model.layers:
            mlp = layer.mlp

            # Upcast first so both the squares and their sums are done in float32.
            gate_w = mlp.gate_proj.weight.detach().float()
            up_w = mlp.up_proj.weight.detach().float()
            down_w = mlp.down_proj.weight.detach().float()

            # gate/up: one row per neuron    -> reduce over dim=1.
            # down:    one column per neuron -> reduce over dim=0.
            squared_norm = (
                gate_w.pow(2).sum(dim=1)
                + up_w.pow(2).sum(dim=1)
                + down_w.pow(2).sum(dim=0)
            )
            neuron_norms_all.append(torch.sqrt(squared_norm).cpu())

    # Concatenate norms from all layers into a single tensor and convert to numpy.
    all_norms = torch.cat(neuron_norms_all, dim=0).numpy()
    return all_norms

def plot_norm_distribution(norms):
    """
    Plots the histogram of neuron weight norms and prints some global quantiles.

    Args:
        norms: Array-like of neuron weight norms (e.g. the numpy array returned
            by ``get_neuron_weight_norms``). Any floating dtype is accepted;
            values are converted to float32 internally.

    Returns:
        None. Shows a matplotlib figure and prints the 1/5/10/25/50/75/90/95th
        percentiles to stdout.
    """
    # torch.quantile only accepts float32/float64 input, while norms derived
    # from a half-precision model arrive as float16 -- convert once up front.
    norms_tensor = torch.as_tensor(norms, dtype=torch.float32)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.hist(norms_tensor.numpy(), bins=100, density=True, alpha=0.75)
    ax.set_xlabel("Neuron Weight Norm")
    ax.set_ylabel("Density")
    ax.set_title("Distribution of Neuron Weight Norms")
    ax.grid(True)
    plt.show()

    # Compute and print several global percentiles. All of them are computed
    # in a single call so the data is only processed once.
    percentiles = [1, 5, 10, 25, 50, 75, 90, 95]
    fractions = torch.tensor([p / 100.0 for p in percentiles], dtype=norms_tensor.dtype)
    q_values = torch.quantile(norms_tensor.reshape(-1), fractions)

    print("Global neuron weight norm percentiles:")
    for p, q_value in zip(percentiles, q_values.tolist()):
        print(f"  {p}th percentile: {q_value:.4f}")

In [3]:
# NOTE(review): looks like a leftover scratch/debug cell -- consider deleting it.
print("j")

j


In [6]:
# Load the fine-tuned checkpoint via load_model(); `model` is consumed by the
# norm-computation cell.
# NOTE(review): this is a hard-coded absolute path on a specific filesystem --
# consider moving it to a configuration cell or deriving it from a base directory.
checkpoint = "/afs/csail.mit.edu/u/a/asher/narrow/experiments/weightpruning1/logs/checkpoint-2000"
model = load_model(checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at /afs/csail.mit.edu/u/a/asher/narrow/experiments/weightpruning1/logs/checkpoint-2000 were not used when initializing LlamaForCausalLM: {'model.layers.2.mlp.up_proj.weight_mask', 'model.layers.14.mlp.up_proj.weight_orig', 'model.layers.1.mlp.up_proj.weight_orig', 'model.layers.8.mlp.gate_proj.weight_mask', 'model.layers.14.mlp.gate_proj.weight_mask', 'model.layers.6.mlp.gate_proj.weight_orig', 'model.layers.5.mlp.down_proj.weight_orig', 'model.layers.13.mlp.gate_proj.weight_orig', 'model.layers.13.mlp.down_proj.weight_orig', 'model.layers.6.mlp.down_proj.weight_orig', 'model.layers.0.mlp.up_proj.weight_orig', 'model.layers.7.mlp.down_proj.weight_mask', 'model.layers.15.mlp.up_proj.weight_orig', 'model.layers.5.mlp.up_proj.weight_mask', 'model.layers.14.mlp.down_proj.weight_mask', 'model.layers.11.mlp.gate_proj.weight_orig', 'model.layers.11.mlp.up_proj.weight_mask', 'model.layers.14.mlp.up_proj.weight_mask', 'model.layers.15.mlp.down_proj.weight_ma

In [5]:
# Requires `model` from the checkpoint-loading cell; run that cell first.
print("Computing neuron weight norms...")
# One norm per MLP neuron, concatenated across all layers.
norms = get_neuron_weight_norms(model)
print(f"Computed neuron weight norms for {len(norms)} neurons.")
# Shows the histogram and prints the global percentiles.
plot_norm_distribution(norms)

: 

In [None]:
# NOTE(review): looks like a leftover scratch/debug cell -- consider deleting it.
print("jo")