In [None]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to library code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json

from cupbearer import data, detectors, models, scripts, tasks, utils
from torch.utils.data import DataLoader

# Training a backdoored classifier
First, we train a classifier on poisoned data:

In [None]:
# Dataset used for training (presumably the MNIST training split by default —
# confirm against cupbearer.data.MNIST).
train_data = data.MNIST()
# Held-out data, requested explicitly with train=False; used for validation and
# for the detector evaluations further down.
val_data = data.MNIST(train=False)

In [None]:
# MLP classifier for 28x28 inputs with two hidden layers of width 128 and ten
# outputs (matching num_classes=10 passed to the training script below).
model = models.MLP(input_shape=(28, 28), hidden_dims=[128, 128], output_dim=10)

In [None]:
# Resolve the log directory first; later cells reuse `classifier_path`.
classifier_path = utils.log_path("logs/demo/classifier")

# Training data: the training set with 5% of the samples poisoned by the
# corner-pixel backdoor.
poisoned_train_loader = DataLoader(
    data.BackdoorDataset(
        original=train_data,
        backdoor=data.CornerPixelBackdoor(p_backdoor=0.05),
    ),
    batch_size=64,
    shuffle=True,
)

# Validation on the untouched held-out data ...
clean_val_loader = DataLoader(val_data, batch_size=1024, shuffle=False)

# ... and on a completely poisoned copy of it. By default, the poison rate is
# 100%, so no `p_backdoor` argument is needed here.
backdoor_val_loader = DataLoader(
    data.BackdoorDataset(
        original=val_data,
        backdoor=data.CornerPixelBackdoor(),
    ),
    batch_size=1024,
    shuffle=False,
)

scripts.train_classifier(
    path=classifier_path,
    model=model,
    train_loader=poisoned_train_loader,
    num_classes=10,
    val_loaders={
        "clean": clean_val_loader,
        "backdoor": backdoor_val_loader,
    },
    max_epochs=3,
)

We can also explicitly evaluate the trained model (right now this is pretty limited and doesn't support multiple datasets at once). In this case it doesn't tell us anything new, but it can be useful if we want to evaluate a model on additional data later:

In [None]:
# Evaluate the trained classifier on the clean held-out data only; `path` is the
# classifier's log directory from the training cell.
scripts.eval_classifier(
    data=val_data,
    model=model,
    path=classifier_path,
)

These results are also stored in `<log path>/eval.json`, in case we want to process them further (e.g. to compare many runs):

In [None]:
# Parse the stored evaluation results, then show them once the file is closed.
with open(classifier_path / "eval.json") as eval_file:
    eval_results = json.load(eval_file)
print(eval_results)

# Training a backdoor detector
We'll train a very simple detector using the Mahalanobis distance. Our model is still in memory, but just for demonstration let's load it again:

In [None]:
# Initialize a new model with the same architecture as before. The constructor
# arguments are identical to the ones used for the trained classifier. This
# rebinds `model`, replacing the instance that is still in memory.
model = models.MLP(input_shape=(28, 28), hidden_dims=[128, 128], output_dim=10)
# Load the weights from the classifier's log directory (presumably in place,
# since the return value is not assigned — confirm against
# cupbearer.models.load):
models.load(model, classifier_path)

In [None]:
# A bare expression on the last line of a cell makes the notebook display it,
# which lets us inspect the reloaded model.
model

In [None]:
# Resolve the detector's log directory first; later cells reuse `detector_path`.
detector_path = utils.log_path("logs/demo/detector")

# Backdoor-detection task built from the classifier, both MNIST datasets and
# the corner-pixel backdoor.
detection_task = tasks.backdoor_detection(
    model, train_data, val_data, data.CornerPixelBackdoor()
)

# Mahalanobis-distance detector on the activations of a single layer. The other
# entries are kept commented out as alternatives to experiment with.
mahalanobis_detector = detectors.MahalanobisDetector(
    activation_names=[
        # "layers.linear_0.output",
        "layers.linear_1.output",
        # "layers.linear_2.output",
    ]
)

scripts.train_detector(
    save_path=detector_path,
    task=detection_task,
    detector=mahalanobis_detector,
    num_classes=10,
)

As we can see, this was a trivial detection task. As an ablation, we can test whether the detector specifically flags backdoored inputs as anomalous, or just anything out of distribution. Let's again reload the detector just to show how that works:

In [None]:
# Recreate the detector with the same activation name that was used when
# training it, then restore its saved state from the detector's log directory.
detector = detectors.MahalanobisDetector(activation_names=["layers.linear_1.output"])
# TODO: Saving the weights under the name "detector" is only a convention of the
# train_detector script; having to know it here is kind of weird.
detector.load_weights(detector_path / "detector")

In [None]:
# We save to a different directory to avoid overwriting the existing default eval.
ood_eval_path = detector_path / "ood_eval"

# Anomalous inputs: the same backdoor data as above, except built from the
# MNIST test split.
backdoored_test_data = data.BackdoorDataset(
    original=val_data,
    backdoor=data.CornerPixelBackdoor(),
)

# Normal inputs: MNIST with added noise. This makes the images OOD, but they
# shouldn't be mechanistically anomalous.
noisy_test_data = data.TransformDataset(val_data, data.GaussianNoise(0.3))

ood_task = tasks.Task.from_separate_data(
    model=model,
    # TODO: this won't actually be used; plausibly Tasks should be split better
    # into their training and test data.
    trusted_data=train_data,
    anomalous_test_data=backdoored_test_data,
    clean_test_data=noisy_test_data,
)

scripts.eval_detector(
    detector=detector,
    save_path=ood_eval_path,
    task=ood_task,
)

As we can see, adding noise did make the images quite a bit more "anomalous" according to our detector (the blue histogram has shifted to the right, toward higher anomaly scores). But we still have a pretty clear separation between these "merely noisy" inputs and the backdoored inputs. (This backdoor is very easy to detect.)