In [1]:
"""
Main script for collecting and analyzing CrossCoder layer activations, 
logits, and decoder directions.

This script:
1. Collects layer activations and logits from both base and IT models
2. Computes metrics for decoder directions (norms, cosine similarities)
3. Calculates KL divergence between model logits
4. Analyzes and visualizes the results
5. Identifies interesting decoder directions for further study
"""

import os
import torch as th
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle


In [2]:

# Import your modules
from dictionary_learning.dictionary import BatchTopKCrossCoder
from nnsight import LanguageModel

# Import the utility functions we created
# You'll need to save these as separate Python modules
import importlib
import data_collection
importlib.reload(data_collection)

from data_collection import collect_with_cpu_dataframes

from analysis_utils import (
    find_interesting_directions, 
    analyze_feature_occurrence, 
    plot_decoder_stats, 
    plot_kl_divergence_analysis,
    generate_feature_report
)


In [3]:

# Make sure every output directory exists before anything is written to it
for output_dir in ('saved_data', 'plots', 'reports'):
    os.makedirs(output_dir, exist_ok=True)

# Load the pretrained CrossCoder (from the hub) and both Gemma language models on "cuda"
print("Loading models...")
crosscoder = BatchTopKCrossCoder.from_pretrained(
    "science-of-finetuning/gemma-2-2b-L13-k100-lr1e-04-local-shuffling-CCLoss",
    device="cuda",
    from_hub=True,
)

gemma_2 = LanguageModel("google/gemma-2-2b", device_map="cuda")
gemma_2_it = LanguageModel("google/gemma-2-2b-it", device_map="cuda")


Loading models...


In [4]:
# Release cached CUDA memory and print the allocator statistics in MB
import torch

torch.cuda.empty_cache()
torch.cuda.synchronize()  # wait for all queued CUDA work to finish before reading the stats
print("CUDA Memory Stats:")
for label, read_bytes in (
    ("Allocated", torch.cuda.memory_allocated),
    ("Reserved", torch.cuda.memory_reserved),
    ("Max Allocated", torch.cuda.max_memory_allocated),
):
    # Each getter returns a byte count; 1024**2 converts it to MB
    print(f"{label}: {read_bytes() / 1024**2:.2f} MB")
torch.cuda.empty_cache()


CUDA Memory Stats:
Allocated: 2592.30 MB
Reserved: 2594.00 MB
Max Allocated: 2592.30 MB


In [None]:
# Read the jokes CSV into a DataFrame
print("Loading jokes data...")
jokes_df = pd.read_csv('shortjokes_500.csv')

# Pull the "Joke" column out as a plain Python list of texts
jokes = jokes_df.loc[:, "Joke"].tolist()

# Token index to analyze (-5 is the default carried over from the original notebook)
token_index = -5


In [None]:

# Collect data and get CPU-based DataFrames.
# token_index is taken from the variable set in the data-loading cell rather than
# a repeated literal, so changing it there takes effect here as well.
# NOTE: this rebinds jokes_df (until now the raw CSV frame) to the first value
# returned by collect_with_cpu_dataframes.
jokes_df, features_df, global_df = collect_with_cpu_dataframes(
    jokes=jokes,
    gemma_2=gemma_2,
    gemma_2_it=gemma_2_it,
    crosscoder=crosscoder,
    token_index=token_index,
    save_dir='saved_data'
)


In [None]:

# Select the most interesting decoder directions from the global stats.
# Despite its name, interesting_features is used as a DataFrame in the next
# cell (it is indexed with ['feature_index']), not as a list of indices.
interesting_features = find_interesting_directions(global_df, cosine_threshold=0.8, norm_threshold=0.1)


In [None]:

# Keep only the rows of features_df whose feature index is among the interesting
# ones, then count how many rows each of those feature indices has.
is_interesting = features_df['feature_index'].isin(interesting_features['feature_index'])
feature_counts = features_df[is_interesting]
feature_occurrence = feature_counts['feature_index'].value_counts().reset_index()
feature_occurrence.columns = ['feature_index', 'occurrence_count']

# Scatter every decoder direction: cosine similarity (x) against base-model L2 norm (y)
import matplotlib.pyplot as plt  # also imported in the first cell
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(global_df['cosine_similarity'], global_df['l2_norm_base'], alpha=0.5)
ax.set_xlabel('Cosine Similarity')
ax.set_ylabel('L2 Norm (Base Model)')
ax.set_title('Relationship between Cosine Similarity and Norm')
plt.show()

In [5]:


# Collect data and compute metrics.
# collect_model_data is not imported by name in the import cells, so a bare call
# raises NameError on a fresh kernel; it is reached through the data_collection
# module instead, which is imported and reloaded in the import cell.
# NOTE(review): assumes collect_model_data is defined in data_collection — confirm.
# NOTE(review): the recorded output of this cell ends in
# "NameError: name 'gc' is not defined"; gc is not referenced in this notebook, so
# that presumably originates inside the called helper — verify it imports gc.
print("Collecting model data and computing metrics...")
df, raw_data = data_collection.collect_model_data(
    jokes,
    gemma_2,
    gemma_2_it,
    crosscoder,
    token_index=token_index,
    save_dir='saved_data',
    batch_size=1
)


Loading jokes data...
Collecting model data and computing metrics...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Error processing joke 0: CUDA out of memory. Tried to allocate 82.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 79.19 MiB is free. Process 1879730 has 59.33 GiB memory in use. Process 2134231 has 19.83 GiB memory in use. Of the allocated memory 19.31 GiB is allocated by PyTorch, and 31.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


NameError: name 'gc' is not defined

In [None]:

# Files expected on disk once the collection cell has finished:
#   saved_data/crosscoder_metrics.csv       - DataFrame with metrics
#   saved_data/raw_activations_logits.pkl   - raw model outputs
#   saved_data/global_decoder_stats.csv     - stats for all decoder directions
#
# To reuse a previous run instead of collecting again:
# df = pd.read_csv('saved_data/crosscoder_metrics.csv')
# with open('saved_data/raw_activations_logits.pkl', 'rb') as f:
#     raw_data = pickle.load(f)
# global_df = pd.read_csv('saved_data/global_decoder_stats.csv')

# Read the global decoder stats back from disk (this rebinds global_df)
global_df = pd.read_csv('saved_data/global_decoder_stats.csv')

# Interesting = low cosine similarity with a non-small norm; tune both thresholds as needed
print("Finding interesting decoder directions...")
interesting_df = find_interesting_directions(global_df, cosine_threshold=0.3, norm_threshold=0.1)

# Plain list of the selected feature indices
interesting_features = interesting_df['feature_index'].tolist()
print(f"Found {len(interesting_features)} interesting features.")

# How often the selected features occur in the collected metrics
print("Analyzing feature occurrence...")
feature_df = analyze_feature_occurrence(df, interesting_features)

# Figures are written under plots/
print("Generating visualizations...")
plot_decoder_stats(global_df, save_dir='plots')
plot_kl_divergence_analysis(df, save_dir='plots')

# Report on the top 50 features, written under reports/
print("Generating feature reports...")
feature_report = generate_feature_report(
    global_df,
    feature_df,
    top_n=50,
    save_dir='reports',
)

# One summary line for each of the first ten rows of the report
print("\nTop 10 most interesting features:")
interesting_active = feature_report.head(10)
for _, row in interesting_active.iterrows():
    print(
        f"Feature {int(row['feature_index'])}: cos_sim={row['cosine_similarity']:.4f}, "
        f"l2_base={row['l2_norm_base']:.4f}, "
        f"l2_it={row['l2_norm_it']:.4f}, occurrences={row['occurrence_count']}"
    )

print("\nAnalysis complete! Results saved to 'saved_data', 'plots', and 'reports' directories.")