In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import os
import json
import re
import random
import csv
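# NOTE: the 4-bit quantized load further down requires the `bitsandbytes` package.
# Install it first (e.g. `%pip install bitsandbytes`) and restart the runtime before running the notebook.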

In [4]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook are read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
def get_dataset(path):
  """Load a JSON dataset and reduce each entry to a prompt string and a binary label.

  Args:
    path: Path to a UTF-8 JSON file containing a list of entries, each with
      'story', 'question' and 'belief' keys.

  Returns:
    A list of dicts with two keys:
      'prompt': the story and the question joined by a single space.
      'belief': 1 if the entry's 'belief' value is truthy, otherwise 0.
  """
  with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
  # The f-string is double-quoted so that the single-quoted keys inside it are
  # valid on Python versions before 3.12 (reusing the outer quote character
  # inside an f-string expression is a SyntaxError there).
  filtered_data = [
      {'prompt': f"{entry['story']} {entry['question']}", 'belief': 1 if entry['belief'] else 0}
      for entry in data
  ]
  return filtered_data

In [22]:
# Load both datasets from Drive; each becomes a list of
# {'prompt': str, 'belief': 0 or 1} dicts (see get_dataset).
tomi = get_dataset("/content/drive/MyDrive/SEF/Data/ToMi/tomi_all.json")
bigtom = get_dataset("/content/drive/MyDrive/SEF/Data/BigToM/bigtom_all.json")

In [7]:
import os
# Point the Hugging Face cache at local Colab storage.
# NOTE(review): `transformers` is already imported in the first cell, and the
# Hugging Face libraries appear to read HF_HOME when they are first imported,
# so setting it here may have no effect on where downloads are cached -
# confirm, and if so move this assignment above the `transformers` import.
os.environ["HF_HOME"] = "/content/hf_cache"

In [15]:
model_id = "mistralai/Mistral-7B-v0.1"
# `padding` and `truncation` are options of the tokenization call itself (they
# are passed where batches are tokenized), not of `from_pretrained`, so they
# are not given here; only the maximum sequence length is fixed at load time.
tokenizer = AutoTokenizer.from_pretrained(model_id, model_max_length=512)
# Pad and truncate on the right so real tokens start at position 0.
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

# Batched tokenization with padding needs a pad token; fall back to EOS when
# the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the half-precision weights, letting accelerate place them on the
# available device(s).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Write the model weights and tokenizer files to a local directory so the
# quantized reload below can read them from disk.
directory = "/content/hf_cache/mistral-7b"
model.save_pretrained(directory)
tokenizer.save_pretrained(directory)

('/content/hf_cache/mistral-7b/tokenizer_config.json',
 '/content/hf_cache/mistral-7b/special_tokens_map.json',
 '/content/hf_cache/mistral-7b/tokenizer.model',
 '/content/hf_cache/mistral-7b/added_tokens.json',
 '/content/hf_cache/mistral-7b/tokenizer.json')

In [19]:
import torch
from transformers import BitsAndBytesConfig

# 4-bit quantization settings: NF4 quantization type, float16 compute dtype,
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Reload the locally saved checkpoint with 4-bit quantization. This rebinds
# `model`, replacing the float16 model loaded earlier.
model = AutoModelForCausalLM.from_pretrained(
    "/content/hf_cache/mistral-7b",
    quantization_config=bnb_config,
    device_map="auto",
)

# This rebinds `tokenizer` as well.
# NOTE(review): padding_side / truncation_side are not re-applied after this
# reload; they presumably come from the saved tokenizer config - confirm they
# are still "right", since the last-token extraction depends on padding side.
tokenizer = AutoTokenizer.from_pretrained("/content/hf_cache/mistral-7b")
# Switch to inference mode. As the last expression in the cell, the returned
# module is also displayed as the cell output.
model.eval()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

# Extract Last-Token Activation Vectors


In [11]:
# Architecture sizes read from the loaded model's config.
num_layers = model.config.num_hidden_layers
num_heads = model.config.num_attention_heads
# Width of a single attention head. The previous value (`hidden_size`) was the
# full model width, not the per-head width. Prefer an explicit `head_dim` on
# the config when present, otherwise derive it from hidden_size / num_heads.
head_dim = getattr(model.config, "head_dim", None) or model.config.hidden_size // num_heads

In [12]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [13]:
def extract_activations(prompts, batch_size=8):
  """Collect, for every layer, the hidden state at the last real token of each prompt.

  Uses the module-level `tokenizer`, `model` and `device`.

  Args:
    prompts: List of prompt strings.
    batch_size: Number of prompts tokenized and run through the model at once.

  Returns:
    A CPU tensor of shape (len(prompts), num_layers, hidden_dim). The first
    entry of the model's `hidden_states` output is skipped, so index 0 along
    the layer axis is the output of the first decoder layer.
  """
  all_hidden_states = []
  for i in range(0, len(prompts), batch_size):
    batch_prompts = prompts[i:i+batch_size]
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
      # `attn_implementation` is a model-loading option, not a forward()
      # argument, so it is not passed here (this also matches
      # extract_mean_pooled_hidden).
      outputs = model(**inputs, output_hidden_states=True)

    hidden_states = outputs.hidden_states[1:]

    attention_mask = inputs["attention_mask"]  # (bs, seq_len)
    # Index of the last non-padding token in each row. Multiplying the 0/1 mask
    # by the position index and taking the row maximum gives that position for
    # both right- and left-padded batches; with right padding it equals the
    # previous `mask.sum(dim=1) - 1`.
    positions = torch.arange(attention_mask.size(1), device=attention_mask.device)
    last_idx = (attention_mask * positions).max(dim=1).values  # (bs,)
    batch_idx = torch.arange(attention_mask.size(0), device=attention_mask.device)

    batch_last_hidden = []
    for layer_hs in hidden_states:
        # layer_hs: (bs, seq_len, hidden)
        # With device_map="auto" layers may live on different devices, so the
        # index tensors follow the layer and the result is gathered on CPU.
        rows = batch_idx.to(layer_hs.device)
        cols = last_idx.to(layer_hs.device)
        batch_last_hidden.append(layer_hs[rows, cols].cpu())

    # (bs, num_layers, hidden)
    all_hidden_states.append(torch.stack(batch_last_hidden, dim=1))

  global_all_hidden = torch.cat(all_hidden_states, dim=0)

  return global_all_hidden


In [23]:
# Pull the prompt strings out of each dataset entry.
tomi_prompts = [entry['prompt'] for entry in tomi]
bigtom_prompts = [entry['prompt'] for entry in bigtom]

# Per-layer last-token hidden states for every prompt in each dataset.
last_token_hidden_tomi = extract_activations(tomi_prompts)
last_token_hidden_bigtom = extract_activations(bigtom_prompts)

# Extract Mean-Pooled Activations

In [33]:
def extract_mean_pooled_hidden(prompts, batch_size=8):
  """Collect, for every layer, the hidden state averaged over each prompt's real tokens.

  Uses the module-level `tokenizer`, `model` and `device`.

  Args:
    prompts: List of prompt strings.
    batch_size: Number of prompts tokenized and run through the model at once.

  Returns:
    A CPU tensor of shape (len(prompts), num_layers, hidden_dim) in the model's
    hidden-state dtype. Padding positions are excluded from the average, and
    the first entry of the model's `hidden_states` output is skipped.
  """
  all_pooled = []
  for i in range(0, len(prompts), batch_size):
    batch = prompts[i:i+batch_size]
    inputs = tokenizer(
        batch,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
      outputs = model(**inputs, output_hidden_states=True)

    hidden_states = outputs.hidden_states[1:]
    attention_mask = inputs["attention_mask"] # [bs, seq_len]
    mask = attention_mask.unsqueeze(-1).to(torch.float32)  # [bs, seq_len, 1]
    # Number of real tokens per row; clamped so a row with no real tokens
    # cannot cause a division by zero.
    lengths = attention_mask.sum(dim=1).unsqueeze(-1).clamp(min=1).to(torch.float32)

    pooled_layers = []
    for hs in hidden_states:
        # hs: [bs, seq_len, hidden_dim]
        # Accumulate in float32: summing many half-precision activations can
        # lose precision or overflow. The mean is cast back to the original
        # dtype so the returned tensor keeps the same dtype as before.
        layer_mask = mask.to(hs.device)
        masked_sum = (hs.to(torch.float32) * layer_mask).sum(dim=1)        # [bs, hidden_dim]
        mean_pooled = (masked_sum / lengths.to(hs.device)).to(hs.dtype)    # [bs, hidden_dim]
        # Gather on CPU so layers placed on different devices can be stacked.
        pooled_layers.append(mean_pooled.cpu())

    # Stack layers: [bs, num_layers, hidden_dim]
    all_pooled.append(torch.stack(pooled_layers, dim=1))

  return torch.cat(all_pooled, dim=0)

In [34]:
mean_pooled_tomi = extract_mean_pooled_hidden(tomi_prompts)
mean_pooled_bigtom = extract_mean_pooled_hidden(bigtom_prompts)

# Train Linear Probes

In [18]:
# Freeze every model weight: only the probes are trained, never the model.
for weight in model.parameters():
    weight.requires_grad_(False)

In [20]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [25]:
def train_layerwise_probes(
    last_token_hidden,
    labels,
    test_size = 0.2,
    random_state=42,
    max_iter=1000
):
  """Fit one logistic-regression probe per layer and score it on a held-out split.

  Args:
    last_token_hidden: Tensor of shape (N, num_layers, d) with one activation
      vector per example and layer.
    labels: Sequence of N labels, cast to int.
    test_size: Fraction of the examples held out for evaluation.
    random_state: Seed for the stratified train/test split.
    max_iter: Iteration cap for the lbfgs solver.

  Returns:
    Dict with per-layer lists under 'accuracy', 'auc' and 'weights' (the
    probe's coefficient vector), each ordered by layer index.
  """
  features = last_token_hidden.cpu().numpy()
  targets = np.asarray(labels).astype(int)
  _, layer_count, _ = features.shape

  # One stratified split, shared by all layers so their scores are comparable.
  feats_train, feats_test, targets_train, targets_test = train_test_split(
      features, targets, test_size=test_size, stratify=targets, random_state=random_state
  )

  accuracies, aucs, weights = [], [], []
  for layer_idx in range(layer_count):
    clf = LogisticRegression(penalty="l2", C=1.0, solver="lbfgs", max_iter=max_iter)
    clf.fit(feats_train[:, layer_idx, :], targets_train)

    held_out = feats_test[:, layer_idx, :]
    predicted = clf.predict(held_out)
    positive_prob = clf.predict_proba(held_out)[:, 1]

    accuracies.append(accuracy_score(targets_test, predicted))
    aucs.append(roc_auc_score(targets_test, positive_prob))
    weights.append(clf.coef_[0])

  return {'accuracy': accuracies, 'auc': aucs, 'weights': weights}



In [36]:
# Layer-wise probes on the last-token activations, one run per dataset, using
# each entry's 0/1 'belief' value as the label.
last_token_tomi_results = train_layerwise_probes(last_token_hidden_tomi, [entry['belief'] for entry in tomi])
last_token_bigtom_results = train_layerwise_probes(last_token_hidden_bigtom, [entry['belief'] for entry in bigtom])

In [42]:
# Same layer-wise probes, trained on the mean-pooled activations instead.
mean_pooled_tomi_results = train_layerwise_probes(mean_pooled_tomi, [entry['belief'] for entry in tomi])
mean_pooled_bigtom_results = train_layerwise_probes(mean_pooled_bigtom, [entry['belief'] for entry in bigtom])

In [52]:
# Held-out AUC averaged over all layers for the mean-pooled ToMi probes
# (0.5 would be chance level).
np.mean(mean_pooled_tomi_results['auc'])

np.float64(0.5971875)

In [53]:
# Held-out AUC averaged over all layers for the last-token ToMi probes
# (0.5 would be chance level).
np.mean(last_token_tomi_results['auc'])

np.float64(0.570859375)