## 1. Visualize Dataset-level Statistics

In [1]:
from repepo.data.make_dataset import list_datasets
from repepo.experiments_2.utils.config import DATASET_DIR

# Show the configured dataset directory, then enumerate the datasets found in it.
print(DATASET_DIR)
# NOTE(review): a later cell rebinds `datasets` to a short hand-written list of
# names, so the value seen by other cells depends on execution order.
datasets = list_datasets(DATASET_DIR)
print("Number of datasets: ", len(datasets))

/home/daniel/ml_workspace/datasets
Number of datasets:  135


In [2]:
from repepo.experiments_2.utils.helpers import (
    ConceptVectorsConfig,
    load_activation_differences
)

# Placeholder for the dataset-level statistics. The unconditional `continue`
# skips every iteration, so the statements below it never execute; this looks
# deliberate (see the TODO) until the visualisation is implemented.
for dataset_name in datasets:
    continue
    # TODO
    # Unreachable while the `continue` above is in place: create a default
    # config, point its training dataset spec at this dataset, and load the
    # activation differences for it.
    config = ConceptVectorsConfig()
    config.train_dataset_spec.name = dataset_name 
    difference_vectors = load_activation_differences(config)

## 2. Visualize Individual Concept Vectors

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.decomposition import PCA

def run_pca(data: np.ndarray, n_components: int = 1):
    """
    Run PCA on a batch of N x D vectors.

    Parameters:
    - data: N x D array where N is the number of samples and D is the number of features.
    - n_components: Number of principal components to keep (default 1).

    Returns:
    - principal_components: Array of shape (n_components, D); each row is one
      principal axis in feature space.
    - explained_variance: Array of shape (n_components,) holding the *fraction*
      of the total variance explained by each component (scikit-learn's
      `explained_variance_ratio_`), not the absolute variance.
    """

    pca = PCA(n_components=n_components)
    # Only the fitted axes and variance ratios are returned, so `fit` is
    # sufficient; projecting `data` onto the components (`fit_transform`)
    # would compute an N x n_components array that is immediately discarded.
    pca.fit(data)

    return pca.components_, pca.explained_variance_ratio_

def get_component_and_variance(
    config: ConceptVectorsConfig,
    layer: int,
    n_components: int = 1
):
    """
    Run PCA over the activation differences of a single layer.

    Parameters:
    - config: Configuration object handed to `load_activation_differences`.
    - layer: Key/index used to select one layer's entries from the loaded differences.
    - n_components: Number of principal components to request from `run_pca`.

    Returns:
    - principal_components: First value returned by `run_pca`.
    - explained_variance: Second value returned by `run_pca`.
    """

    # The differences are (re)loaded on every call and then indexed by layer.
    per_layer_differences = load_activation_differences(config)

    # Stack the selected layer's tensors along a new leading axis and hand the
    # result to NumPy.
    # NOTE(review): presumably these are equally-shaped 1-D CPU tensors, giving
    # an N x D matrix -- confirm against `load_activation_differences`.
    stacked = torch.stack(per_layer_differences[layer])

    return run_pca(stacked.numpy(), n_components=n_components)

In [11]:
from repepo.data.make_dataset import DatasetSpec

# Dataset names of interest for this section; only the first three are given
# a config in this cell.
datasets = [
    "truthfulqa",
    "subscribes-to-virtue-ethics",
    "interest-in-math",
    "anti-immigration",
    "has-disability",
]

# No split is passed here, so DatasetSpec's own default applies.
tqa_config = ConceptVectorsConfig(
    train_dataset_spec=DatasetSpec(name="truthfulqa"),
)

# The remaining two configs pass split=":1%" explicitly.
ethics_config = ConceptVectorsConfig(
    train_dataset_spec=DatasetSpec(name="subscribes-to-virtue-ethics", split=":1%"),
)

math_config = ConceptVectorsConfig(
    train_dataset_spec=DatasetSpec(name="interest-in-math", split=":1%"),
)

In [13]:
# PCA (default: one component) of the TruthfulQA activation differences at layer 13.
tqa_components, tqa_variance = get_component_and_variance(tqa_config, layer=13)

# Components' shape on the first line, explained-variance ratios on the second.
print(tqa_components.shape, tqa_variance, sep="\n")

(1, 5120)
[0.50660064]


In [7]:
# For layers 0..39, compare the first principal component of the two datasets.
for layer in range(40):
    ethics_components, ethics_variance = get_component_and_variance(ethics_config, layer)
    math_components, math_variance = get_component_and_variance(math_config, layer)

    # (1 x D) @ (D x 1) -> 1 x 1 matrix holding the dot product of the two
    # components; for unit-norm PCA axes this is their cosine similarity.
    # NOTE(review): the sign of a principal axis is arbitrary, so a negative
    # value would not by itself indicate opposing directions.
    cosine_sim = ethics_components @ math_components.T
    results_str = " | ".join(
        [
            f"layer: {layer}",
            f"cos_sim: {cosine_sim[0, 0]:.2f}",
            f"var_ethics: {ethics_variance[0]:.2f}",
            f"var_math: {math_variance[0]:.2f}",
        ]
    )
    print(results_str)

layer: 0 | cos_sim: 1.00 | var_ethics: 1.00 | var_math: 1.00
layer: 1 | cos_sim: 1.00 | var_ethics: 1.00 | var_math: 1.00
layer: 2 | cos_sim: 1.00 | var_ethics: 1.00 | var_math: 1.00
layer: 3 | cos_sim: 0.99 | var_ethics: 1.00 | var_math: 1.00
layer: 4 | cos_sim: 0.99 | var_ethics: 1.00 | var_math: 1.00
layer: 5 | cos_sim: 0.99 | var_ethics: 1.00 | var_math: 1.00
layer: 6 | cos_sim: 0.99 | var_ethics: 1.00 | var_math: 1.00
layer: 7 | cos_sim: 0.99 | var_ethics: 1.00 | var_math: 1.00
layer: 8 | cos_sim: 0.99 | var_ethics: 1.00 | var_math: 1.00
layer: 9 | cos_sim: 0.99 | var_ethics: 0.99 | var_math: 1.00
layer: 10 | cos_sim: 0.99 | var_ethics: 0.99 | var_math: 0.99
layer: 11 | cos_sim: 0.99 | var_ethics: 0.99 | var_math: 0.99
layer: 12 | cos_sim: 0.99 | var_ethics: 0.99 | var_math: 0.99
layer: 13 | cos_sim: 0.98 | var_ethics: 0.98 | var_math: 0.98
layer: 14 | cos_sim: 0.98 | var_ethics: 0.97 | var_math: 0.98
layer: 15 | cos_sim: 0.98 | var_ethics: 0.97 | var_math: 0.98
layer: 16 | cos_si