# Sheet 5

## 2 Logistic regression: an LLM lie detector

This is how you can load a dataset of LLM activations. Use a new Datamanager if you want to have a new dataset. Use `add_dataset` multiple times on the same data manager if you want to combine datasets.

In [144]:
import torch as t
import pandas as pd
import numpy as np
import os
from glob import glob
import random

In [145]:
from lie_detection_utils import DataManager

path_to_datasets = "data/lie_detection/datasets"
path_to_acts = "data/lie_detection/acts"

# check if the datasets and activations are available
assert os.path.exists(path_to_datasets), "The path to the datasets does not exist."
assert os.path.exists(path_to_acts), "The path to the activations does not exist."

# these are the different datasets containing true and false factual statements about different topics
dataset_names = ["cities", "neg_cities", "sp_en_trans", "neg_sp_en_trans"]
dataset_name = dataset_names[0] # choose some dataset from the above datasets, index "0" loads the "cities" dataset for example

# the dataloader automatically loads the training data for us
dm = DataManager()
dm.add_dataset(dataset_name, "Llama3", "8B", "chat", layer=12, split=0.8, center=False,
                device='cpu', path_to_datasets=path_to_datasets, path_to_acts=path_to_acts)
acts_train, labels_train = dm.get('train') # train set
acts_test, labels_test = dm.get('val')
print(acts_train.shape, labels_train.shape)

torch.Size([1196, 4096]) torch.Size([1196])


In [146]:
# have a look at the statements that were fed to the LLM to produce the activations:
df = pd.read_csv(f"{path_to_datasets}/{dataset_name}.csv")
print(df.head(10))

                                        statement  label       city  \
0             The city of Krasnodar is in Russia.      1  Krasnodar   
1       The city of Krasnodar is in South Africa.      0  Krasnodar   
2                  The city of Lodz is in Poland.      1       Lodz   
3  The city of Lodz is in the Dominican Republic.      0       Lodz   
4            The city of Maracay is in Venezuela.      1    Maracay   
5                The city of Maracay is in China.      0    Maracay   
6              The city of Baku is in Azerbaijan.      1       Baku   
7                 The city of Baku is in Ukraine.      0       Baku   
8                  The city of Baoji is in China.      1      Baoji   
9              The city of Baoji is in Guatemala.      0      Baoji   

                  country correct_country  
0                  Russia          Russia  
1            South Africa          Russia  
2                  Poland          Poland  
3  the Dominican Republic          Poland 

### (a)

In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [148]:
# Create and train the Logistic Regression model
model = LogisticRegression(penalty=None, max_iter=1000)  # No regularization
model.fit(acts_train, labels_train) 

# Make predictions on the test set
predictions = model.predict(acts_test)
# Evaluate the model's performance
accuracy = accuracy_score(labels_test, predictions)
print(f"Test set accuracy: {accuracy * 100:.2f}%")

Test set accuracy: 100.00%


We get an accuracy of 100 % for all four data sets, so the activation vectors are linearly separable.

### (b)

In [149]:
# Train on cities dataset
train_dataset = "cities"
dm.add_dataset(train_dataset, "Llama3", "8B", "chat", layer=12, split=0.8, center=False,
                device='cpu', path_to_datasets=path_to_datasets, path_to_acts=path_to_acts)
acts_train, labels_train = dm.get('train') # train set

In [150]:
# No regularization

# Create and train the Logistic Regression model
model = LogisticRegression(penalty=None, max_iter=1000)
model.fit(acts_train, labels_train)

# Evaluate on other datasets
for test_dataset in dataset_names:
    if test_dataset == train_dataset:
        continue
    dm.add_dataset(test_dataset, "Llama3", "8B", "chat", layer=12, split=0.8, center=False,
                    device='cpu', path_to_datasets=path_to_datasets, path_to_acts=path_to_acts)
    acts_test, labels_test = dm.get('val')
    predictions = model.predict(acts_test)
    accuracy = accuracy_score(labels_test, predictions)
    print(f"Test set accuracy on {test_dataset}: {accuracy * 100:.2f}%")

Test set accuracy on neg_cities: 75.00%
Test set accuracy on sp_en_trans: 74.37%
Test set accuracy on neg_sp_en_trans: 71.83%


In [151]:
# With regularization

# Create and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # With regularization
model.fit(acts_train, labels_train)

# Evaluate on other datasets
for test_dataset in dataset_names:
    if test_dataset == train_dataset:
        continue
    dm.add_dataset(test_dataset, "Llama3", "8B", "chat", layer=12, split=0.8, center=False,
                    device='cpu', path_to_datasets=path_to_datasets, path_to_acts=path_to_acts)
    acts_test, labels_test = dm.get('val')
    predictions = model.predict(acts_test)
    accuracy = accuracy_score(labels_test, predictions)
    print(f"Test set accuracy on {test_dataset}: {accuracy * 100:.2f}%")

Test set accuracy on neg_cities: 75.47%
Test set accuracy on sp_en_trans: 75.20%
Test set accuracy on neg_sp_en_trans: 76.82%


We can observe that all performances are >50%, so the results are above chance, yet do not fully generalize to other topics. We can further see that the accuracy drops when negation is introduced.

### (c)

In [152]:
# Combine cities and neg cities datasets for training
train_datasets = ["cities", "neg_cities"]
acts_train_list = []
labels_train_list = []  
for dataset in train_datasets:
    dm.add_dataset(dataset, "Llama3", "8B", "chat", layer=12, split=0.8, center=False,
                    device='cpu', path_to_datasets=path_to_datasets, path_to_acts=path_to_acts)
    acts, labels = dm.get('train')
    acts_train_list.append(acts)
    labels_train_list.append(labels)
acts_train = np.concatenate(acts_train_list)
labels_train = np.concatenate(labels_train_list)

In [153]:
# Create and train the Logistic Regression model
model = LogisticRegression(penalty=None, max_iter=1000)  # No regularization
model.fit(acts_train, labels_train) 

# Evaluate on sp_en_trans and neg_sp_en_trans datasets
test_datasets = ["sp_en_trans", "neg_sp_en_trans"]
for test_dataset in test_datasets:  
    dm.add_dataset(test_dataset, "Llama3", "8B", "chat", layer=12, split=0.8, center=False,
                    device='cpu', path_to_datasets=path_to_datasets, path_to_acts=path_to_acts)
    acts_test, labels_test = dm.get('val')
    predictions = model.predict(acts_test)
    accuracy = accuracy_score(labels_test, predictions)
    print(f"Test set accuracy on {test_dataset}: {accuracy * 100:.2f}%")

Test set accuracy on sp_en_trans: 99.60%
Test set accuracy on neg_sp_en_trans: 99.60%


## 3 Log-sum-exp and soft(arg)max
### (b)

### (c)

## 4 Linear regions of MLPs