# MI of exaggerated safety
Internal analysis of exaggerated safety behaviors of LLMs.

In [2]:
import torch as th
from utils import load_model

# Model identifiers: Hugging Face checkpoint name, the matching TransformerLens
# name, and an optional adapter (empty string = no adapter).
hf_model = "gpt2"
tl_model = "gpt2-small"
adapter = ""

# `load_model` is a project helper (see utils); the cell output shows it wraps
# the checkpoint in a HookedTransformer.
model = load_model(
    hf_model,
    tl_model,
    adapter,
    device='cuda',
    n_devices=4,
    dtype=th.bfloat16,
)
model.eval()

# Number of transformer blocks; used below as the layer count.
nl = len(model.blocks)

Loading the model...
Loaded pretrained model gpt2-small into HookedTransformer
Loaded model into HookedTransformer


### Data processing

In [6]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os

# Which cached activations to read: prompt template and model component.
chat = 'none'
component = 'resid_post'

# Activations live under activations/<adapter>, or activations/<tl_model> when no adapter is set.
model_folder = adapter if adapter != "" else tl_model
activ_path = os.path.join('activations', model_folder)

xs_file = os.path.join(activ_path, f"xsafety_{chat}_{component}.pt")
full_file = os.path.join(activ_path, f"blocked_{chat}_{component}.pt")

xs_activations = th.load(xs_file).to(th.float32)
full_activations = th.load(full_file).to(th.float32)

# Labels are truncated to the number of cached activation rows.
y_xs = pd.read_csv("data/xsafety.csv")['label'].values[:len(xs_activations)]
y_full = pd.read_csv("data/blocked.csv")['label'].values[:len(full_activations)]

# Rows whose label is falsy (the "safe" rows, going by the variable names).
safe_xs_mask = ~y_xs.astype(bool)
safe_full_mask = ~y_full.astype(bool)

safe_xs_activations_mean = xs_activations[safe_xs_mask].mean(0).type(th.bfloat16)  # [l dm]
safe_full_activations_mean = full_activations[safe_full_mask].mean(0).type(th.bfloat16)  # [l dm]

## Classification

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from utils import FastPCA

In [None]:
xs_rocs = []
full_rocs = []

nc = 10


def probe_layer_auc(X, y, n_components, seed=42):
    """Return the held-out ROC-AUC of a linear probe on the top PCs of ``X``.

    Steps: stratified train/test split -> PCA fitted on the training split only
    -> unregularised logistic regression on the projected training split ->
    ROC-AUC of the predicted positive-class probability on the test split.

    Args:
        X: activations of one layer, one row per prompt.
        y: binary labels aligned with the rows of ``X``.
        n_components: number of principal components kept by FastPCA.
        seed: ``random_state`` of the train/test split.

    Returns:
        The ROC-AUC score on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed, stratify=y)

    # A fresh PCA per call so that one dataset's fit can never leak into another.
    pca = FastPCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Named `clf` (not `model`) so the HookedTransformer bound to `model` is not overwritten.
    clf = LogisticRegression(penalty=None, verbose=0)  # alternative: LinearDiscriminantAnalysis()
    clf.fit(X_train_pca, y_train)
    y_proba = clf.predict_proba(X_test_pca)[:, 1]

    return roc_auc_score(y_test, y_proba)


for l in tqdm(range(nl)):

    X_xs = xs_activations[:, l].type(th.float32)
    X_full = full_activations[:, l].type(th.float32)

    xs_rocs.append(probe_layer_auc(X_xs, y_xs, nc))
    full_rocs.append(probe_layer_auc(X_full, y_full, nc))

In [19]:
import plotly.express as px

# Per-layer ROC-AUC of the PCA + logistic-regression probe, one line per dataset.
# The title names the model folder actually in use instead of a hard-coded model.
px.line(
    pd.DataFrame({
        'XSafety': xs_rocs,
        'Blocked': full_rocs,
    }),
    labels={'index': 'Layer', 'value': 'ROC-AUC', 'variable': 'Dataset'},
    title=f'Classification of exaggerated safety prompts with {nc} PCs of {model_folder} activations',
)

Next, we want to see where activations from exaggerated-safety prompts land in the subspace spanned by the first PC of a larger dataset of safe/unsafe prompts.

In [11]:
# REPLACE WITH ACTIVATIONS (original author's TODO, kept as-is)
# Loads a cached activation tensor and moves it to device 'cuda:1'.
full_safe_activations = th.load("activations/blocked_chat_nosys_resid_post.pt").to('cuda:1')
#full_unsafe_activations = th.load(f"activations/unsafe_full_prompts_resid_post.pt").to('cuda:2')

# Synthetic labels: ones for the first half of the rows, zeros for the second half.
# NOTE(review): this rebinds `y_full` (previously read from data/blocked.csv), and
# for an odd number of rows the label vector is one element shorter than the data.
n_half = len(full_safe_activations) // 2
y_full = th.cat([th.ones(n_half), th.zeros(n_half)])

In [26]:
# Keep only as many labels as fit into a whole number of batches of size `bs`.
# NOTE(review): `prompts` and `bs` are not defined in the cells shown here - confirm
# they are created earlier, otherwise this fails on a fresh kernel.
n_batched = len(prompts) // bs * bs
y = prompts['label'][:n_batched].values
#fig = plot_pc(full_safe_activations, y_full, val=chat_nosys_activations, y_val=th.tensor(y), rows=8, cols=4, center=True, n_comp=2)

In [156]:
from sklearn.preprocessing import StandardScaler

pca = FastPCA(n_components=2, center=True)
scaler = StandardScaler()

# Layer inspected in the histogram cells below.
l = 10

# Fit PCA and scaler on the full set, keeping only the first component ...
full_pc1 = pca.fit_transform(full_safe_activations[:, l])[:, :1]
x = scaler.fit_transform(full_pc1)

# ... then push the chat (no-sys) activations through the same fitted transforms.
chat_pc1 = pca.transform(chat_nosys_activations[:, l])[:, :1]
x_val = scaler.transform(chat_pc1)

In [157]:
import plotly.express as px
# Distribution of the standardised first PC on the set the PCA was fitted on, split by label.
px.histogram(
    pd.DataFrame({'activ': x[:, 0], 'label': y_full}),
    color='label',
    barmode="overlay",
    nbins=100,
    width=900,
)

In [158]:
# Same view for the projected (held-out) activations, split by their own labels.
px.histogram(
    pd.DataFrame({'activ': x_val[:, 0], 'label': y}),
    color='label',
    barmode="overlay",
    nbins=100,
    width=900,
)

In [160]:
import plotly.express as px
# Both sets in one histogram: projected rows first, fitted rows second.
# `y + 2` shifts the projected set's labels so they do not collide with `y_full`'s values.
px.histogram(
    pd.DataFrame({
        'activ': np.r_[x_val, x][:, 0],
        'label': np.concatenate([y+2, y_full]),
    }),
    color='label',
    barmode="overlay",
    nbins=100000,
    width=900,
)

In [161]:
# Reverse experiment: fit on the chat (no-sys) activations, project the full set.
pca = FastPCA(n_components=2, center=True)
scaler = StandardScaler()

chat_pc1 = pca.fit_transform(chat_nosys_activations[:, l])[:, :1]
x = scaler.fit_transform(chat_pc1)

full_pc1 = pca.transform(full_safe_activations[:, l])[:, :1]
x_val = scaler.transform(full_pc1)

In [166]:
import plotly.express as px
# Standardised first PC of the set the PCA was fitted on, split by label.
px.histogram(
    pd.DataFrame({'activ': x[:, 0], 'label': y}),
    color='label',
    barmode="overlay",
    nbins=50,
    width=900,
)

In [167]:
import plotly.express as px
# Projected activations under the same fitted transforms, split by their own labels.
px.histogram(
    pd.DataFrame({'activ': x_val[:, 0], 'label': y_full}),
    color='label',
    barmode="overlay",
    nbins=100,
    width=900,
)

In [1]:
import plotly.express as px
# Both sets in one histogram: projected rows (labels `y_full`) first, fitted rows second.
# `y + 2` shifts the fitted set's labels so they do not collide with `y_full`'s values.
px.histogram(
    pd.DataFrame({
        'activ': np.r_[x_val, x][:, 0],
        'label': np.concatenate([y_full, y+2]),
    }),
    color='label',
    barmode="overlay",
    nbins=100,
    width=900,
)

NameError: name 'pd' is not defined

### KDE plots

In [None]:
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

# Params
def _as_bool_mask(labels):
    """Return ``labels`` as a NumPy boolean mask (accepts torch tensors, arrays or lists)."""
    if isinstance(labels, th.Tensor):
        labels = labels.detach().cpu().numpy()
    return np.asarray(labels).astype(bool)


def _class_kdes(points, mask):
    """Fit one Gaussian KDE per class: (rows where mask is False, rows where mask is True)."""
    return gaussian_kde(points[~mask].T), gaussian_kde(points[mask].T)


def _add_kde_lines(fig, kdes, x_values, names, colors, row, col, showlegend):
    """Draw 1-D KDE curves, one line trace per (kde, name, color) triple."""
    for kde, name, color in zip(kdes, names, colors):
        fig.add_trace(
            go.Scatter(x=x_values, y=kde(x_values), mode='lines', name=name,
                       legendgroup=name, showlegend=showlegend, line=dict(color=color)),
            row=row, col=col)


def _add_kde_maps(fig, kdes, positions, grid_shape, x_values, y_values,
                  heat_scales, contour_scales, alpha, row, col):
    """Draw 2-D KDEs: first every heatmap, then every contour (same stacking as before)."""
    densities = [np.reshape(kde(positions).T, grid_shape) for kde in kdes]

    for z, scale in zip(densities, heat_scales):
        fig.add_trace(
            go.Heatmap(z=z, x=x_values, y=y_values, colorscale=scale, showscale=False, opacity=alpha),
            row=row, col=col)

    for z, scale in zip(densities, contour_scales):
        fig.add_trace(
            go.Contour(
                z=z, x=x_values, y=y_values,
                contours=dict(coloring='heatmap', showlabels=True, start=0, end=1, size=0.1),
                line=dict(width=2),
                showscale=False, colorscale=scale, opacity=0.3
            ),
            row=row, col=col)


def plot_pc(activations, y, rows, cols, n_comp, val=None, y_val=None, center=True):
    """Plot per-layer, class-conditional KDEs of the leading principal components.

    Subplots are filled row by row, one layer each. For every layer a FastPCA is
    fitted on ``activations[:, layer]``, the first ``n_comp`` components are
    standardised, and one Gaussian KDE per class is drawn (curves for
    ``n_comp == 1``, heatmaps plus contours for ``n_comp == 2``). When ``val`` is
    given it is projected with the PCA and scaler fitted on ``activations`` and
    its own class KDEs are overlaid in green/orange.

    Args:
        activations: array/tensor indexed as ``activations[:, layer]``.
        y: binary labels for ``activations``; torch tensor, NumPy array or list.
        rows, cols: subplot grid; at most ``rows * cols`` layers are drawn.
        n_comp: number of principal components to plot, 1 or 2.
        val: optional second set of activations to project and overlay.
        y_val: binary labels for ``val`` (required when ``val`` is given).
        center: forwarded to ``FastPCA``.

    Returns:
        The populated plotly figure.

    Raises:
        ValueError: if ``n_comp`` is not 1 or 2, or ``val`` is given without ``y_val``.
    """
    if n_comp not in (1, 2):
        raise ValueError(f"n_comp must be 1 or 2 (the plots are 1-D curves or 2-D maps), got {n_comp}")
    if val is not None and y_val is None:
        raise ValueError("y_val is required when val is given")

    # Plot initialization
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'L {i + 1}' for i in range(rows * cols)],
                        shared_yaxes=True)

    height = int(rows * 300)
    alpha = 0.8

    # Grid initialization
    x_values = np.linspace(-3.5, 3.5, 100)
    y_values = np.linspace(-2, 2, 100)
    X, Y = np.meshgrid(x_values, y_values)
    positions = np.vstack([X.ravel(), Y.ravel()])

    pca = FastPCA(n_components=n_comp, center=center)
    scaler = StandardScaler()

    # Label masks do not depend on the layer: build them once, as NumPy arrays,
    # because the standardised projections they index are NumPy arrays.
    train_mask = _as_bool_mask(y)
    val_mask = _as_bool_mask(y_val) if val is not None else None

    # Never index past the last available layer when the grid is larger than the model.
    n_layers = activations.shape[1]
    if val is not None:
        n_layers = min(n_layers, val.shape[1])

    for l in range(min(rows * cols, n_layers)):
        row, col = l // cols + 1, l % cols + 1

        # PCA fit
        x = pca.fit_transform(activations[:, l])[:, :n_comp]
        x_std = scaler.fit_transform(x)

        # KDE fit
        train_kdes = _class_kdes(x_std, train_mask)

        val_kdes = None
        if val is not None:
            x_val = scaler.transform(pca.transform(val[:, l])[:, :n_comp])
            val_kdes = _class_kdes(x_val, val_mask)

        if n_comp == 1:
            # One legend entry per series (shown for the first subplot only);
            # legendgroup lets that entry toggle the series in every subplot.
            first = (l == 0)
            _add_kde_lines(fig, train_kdes, x_values, ('Class 0', 'Class 1'), ('blue', 'red'),
                           row, col, first)
            if val_kdes is not None:
                _add_kde_lines(fig, val_kdes, x_values, ('Val class 0', 'Val class 1'), ('green', 'orange'),
                               row, col, first)
        else:
            # NOTE(review): class 0 is red and class 1 blue here, the reverse of the
            # 1-D branch above; kept as-is, confirm which mapping is intended.
            _add_kde_maps(fig, train_kdes, positions, X.shape, x_values, y_values,
                          ('Reds', 'Blues'), ('Reds', 'Blues'), alpha, row, col)
            if val_kdes is not None:
                # Validation heatmaps use the same palettes as their contours so
                # they can be told apart from the training densities.
                _add_kde_maps(fig, val_kdes, positions, X.shape, x_values, y_values,
                              ('Greens', 'Oranges'), ('Greens', 'Oranges'), alpha, row, col)

    title = "KDE of the 1st PC per layer" if n_comp == 1 else f"KDE of the first {n_comp} PCs per layer"
    fig.update_layout(height=height, width=1600, title_text=title)
    return fig