ON THE HUMAN-RECOGNIZABILITY PHENOMENON OF ADVERSARIALLY TRAINED DEEP IMAGE CLASSIFIERS

Copyright 2020 Carnegie Mellon University.

NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE 
MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO 
WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, AS TO ANY MATTER INCLUDING, 
BUT NOT LIMITED TO, WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, 
EXCLUSIVITY, OR RESULTS OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON 
UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM 
PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.

Released under a MIT (SEI)-style license, please see license.txt or contact permission@sei.cmu.edu for full terms.

[DISTRIBUTION STATEMENT A] This material has been approved for public release and unlimited distribution.  
Please see Copyright notice for non-US Government use and distribution.

Carnegie Mellon® is registered in the U.S. Patent and Trademark Office by Carnegie Mellon University.
This Software includes and/or makes use of the following Third-Party Software subject to its own license:

1. Python (https://docs.python.org/3/license.html#psf-license-agreement-for-python-release) Copyright 2001-2020 
Python Software Foundation 2001-2020.

2. PyTorch (https://github.com/pytorch/pytorch/blob/master/LICENSE#L3-L11) Copyright 2016 Facebook Inc.

3. Torchvision (https://github.com/pytorch/vision/blob/master/LICENSE) Copyright 2016 Soumith Chintala.

4. NumPy (https://github.com/numpy/numpy/blob/master/LICENSE.txt) Copyright 2005-2020 NumPy Developers.

5. tqdm (https://github.com/tqdm/tqdm/blob/master/LICENCE) Copyright noamraph 2013.

6. Jupyter (https://github.com/jupyter/notebook/blob/master/LICENSE) Copyright IPython Development Team 
2001-2015, Jupyter Development Team 2015-2020 IPython Development Team 2001-2015, Jupyter Development 
Team 2015-2020.

DM20-1153

In [None]:
%load_ext autoreload
%autoreload 2

import os
import shutil

from tqdm.notebook import tqdm
import numpy as np
import torch
import torchvision as tv
import torchvision.transforms as transforms
import torchvision.datasets as datasets

import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.insert(0,'../')
from models.preact_resnet import PreActResNet18
from attack.inversion import inversion
from constants import (LABEL_MAP, 
                       CIFAR10_SHAPE,
                       CIFAR10_MEAN, CIFAR10_STD, 
                       MU, STD,
                       UPPER_LIMIT, LOWER_LIMIT)
from utils import load_img, normalize_img

In [None]:
# Root of the CIFAR-10 data, read from the DATA_DIR environment variable
# (a missing variable raises KeyError).
data_dir = os.environ['DATA_DIR']
if not os.path.isdir(data_dir):
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`
    # and give no hint about which path was wrong.
    raise NotADirectoryError(f'DATA_DIR is not a directory: {data_dir!r}')

# `final` dir is not created by default, change this according to your own model directory structure
model_dir = os.path.join(os.environ['OUT_DIR'], 'final')
if not os.path.isdir(model_dir):
    raise NotADirectoryError(f'model directory does not exist: {model_dir!r}')

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# list available models
available_models = os.listdir(model_dir)
available_models

In [None]:
# Replace the directory listing from the previous cell with the single
# checkpoint file that the rest of the notebook loads.
available_models = ['model_preact_resnet18_pgd.pt']

In [None]:
def load_model(model_name: str):
    """Build a PreActResNet18 and restore its weights from ``model_dir``.

    Args:
        model_name: file name of the checkpoint inside ``model_dir``.

    Returns:
        The network moved to ``device``, switched to eval mode, with
        gradients disabled on all of its parameters.
    """
    checkpoint_path = os.path.join(model_dir, model_name)
    # Deserialize onto the CPU first; the network is moved to `device` below.
    state = torch.load(checkpoint_path, map_location='cpu')

    net = PreActResNet18()
    # Cheap sanity check that the checkpoint has as many entries as the model.
    assert len(state) == len(net.state_dict())
    net.load_state_dict(state)

    net = net.to(device)
    net.eval()
    # The weights are frozen: only inputs are optimized in this notebook.
    net.requires_grad_(False)
    return net


# Load the first entry of `available_models` and print the resulting module.
model = load_model(available_models[0])
print(model)

In [None]:
# Training pipeline: random crop + horizontal flip, then tensor conversion
# and per-channel normalization.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])
# Test pipeline: tensor conversion and normalization only.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Loader settings shared by both splits. Note that batches hold a single
# sample and that both loaders shuffle.
kwargs = {
    'batch_size': 1,
    'shuffle': True,
    'pin_memory': True,
    'num_workers': 4,
}

# The datasets are expected to already exist under `data_dir` (no download).
train_set = datasets.CIFAR10(
    data_dir, train=True, transform=train_transform, download=False)
test_set = datasets.CIFAR10(
    data_dir, train=False, transform=test_transform, download=False)

trainloader = torch.utils.data.DataLoader(dataset=train_set, **kwargs)
testloader = torch.utils.data.DataLoader(dataset=test_set, **kwargs)

# Total & per-class accuracy

In [None]:
# Run the model over the whole test loader, collecting the softmax scores
# and the ground-truth labels of every batch (both kept on the CPU).
preds_all = []
targets_all = []
# Pure inference: disable autograd once for the entire loop.
with torch.no_grad():
    for data, target in tqdm(testloader, desc='Evaluating', total=len(testloader)):
        # Labels are only compared on the CPU afterwards, so they are not
        # copied to `device` (the original copy was never used).
        targets_all.append(target)

        data = data.to(device)
        logits = model(data)
        preds = logits.softmax(dim=1)
        preds_all.append(preds.cpu())

# Concatenate the per-batch results along the batch dimension.
targets_all = torch.cat(targets_all)
preds_all = torch.cat(preds_all)

In [None]:
# NOTE(review): `score_thresholds` is not used in this cell; it is kept in
# case another cell reads it.
score_thresholds = torch.linspace(0.,1.,11)

# Predicted class = index of the highest score; `correct` marks the hits.
pred_labels = preds_all.argmax(dim=1)
correct = (pred_labels == targets_all)

# Per-class accuracy: fraction of hits among the samples of each true class.
accuracies = []
for i in range(len(LABEL_MAP)):
    indices = (targets_all == i)
    acc_i = correct[indices].float().mean()
    accuracies.append(acc_i)

# Total accuracy is the fraction of hits over ALL samples. Averaging the
# per-class accuracies is guaranteed to agree with this only when every
# class has the same number of samples.
total_acc = correct.float().mean()
print('Total: ', total_acc.item())
print()
for i, acc in enumerate(accuracies):
    print(f'{LABEL_MAP[i]}: {acc.item():.2f}')

# Inversion attack via minimizing cross-entropy loss wrt target label

In [None]:
%%time

clamp = [
    [(0. - mu) / std, (1. - mu) / std] 
    for mu, std in zip(CIFAR10_MEAN, CIFAR10_STD)]

n_samples = len(LABEL_MAP)
shape = CIFAR10_SHAPE
x0 = STD * torch.empty(n_samples, *shape).uniform_(-1.,1.)
for i in range(x0.shape[1]):
    x0[:,i] = torch.clamp(x0[:,i], *clamp[i])
    
stepsize = 2. / 255.
max_iters = 1024

x_inv = inversion(
    model    =model,
    x0       =x0,
    category =torch.arange(len(x0)),
    stepsize =stepsize,
    max_iters=max_iters,
    clamp    =clamp,
    geometry ='linf').cpu()

In [None]:
# Show the inverted images side by side in a single row. Image i was
# optimized towards category i, so it is titled with that category's name
# (only the part of the label before the first comma).
#
# The original cell also ran the model on every image and stored the argmax
# in `pred`, but that value was never used, so the forward pass is dropped.
plt.figure(figsize=(100,50))
for i, img in enumerate(x_inv):
    plt.subplot(1, len(x_inv), i + 1)
    plt.imshow(normalize_img(img).permute(1,2,0))
    plt.axis('off')
    plt.title(f'{LABEL_MAP[i].split(",")[0]}', fontsize=64, fontweight='bold')

# Test set seeding

In [None]:
def sample_from_y(target):
    """Return the first image yielded by ``testloader`` labelled ``target``.

    The label of each batch is read with ``y.item()``, so the loader must
    yield exactly one sample per batch.

    Args:
        target: class index to look for.

    Returns:
        The matching image with its singleton dimensions squeezed out.

    Raises:
        ValueError: if ``testloader`` contains no sample labelled ``target``
            (previously the function silently returned ``None``).
    """
    for x, y in testloader:
        if y.item() == target:
            # Squeeze only the sample that is actually returned.
            return x.squeeze()
    raise ValueError(f'no sample with label {target!r} found in testloader')
        
# Per-channel [lower, upper] bounds of the pixel range [0, 1] after
# (x - mean) / std normalization.
clamp = [
    [(0. - mean) / std, (1. - mean) / std]
    for mean, std in zip(CIFAR10_MEAN, CIFAR10_STD)]

# Draw one seed image per class once, so the same seeds are reused afterwards.
seeds = list(map(sample_from_y, range(len(LABEL_MAP))))

In [None]:
# Optimizer settings for the seeded inversion.
stepsize = 1e-1
max_iters = 16

# (Re)load the first listed checkpoint.
model_name = available_models[0]
print(model_name)
model = load_model(model_name)

# Results over all seeds. `all_labels` and `all_preds_adv` are created here
# but never filled by this cell.
all_paths, all_labels = [], []
all_paths_adv, all_preds_adv = [], []

# Outer loop: seed class. Inner loop: category the seed is pushed towards.
for target in tqdm(range(10)):
    x0 = seeds[target].unsqueeze(0)  # add a leading batch dimension
    path = [x0.clone()]              # keep an untouched copy of the seed
    path_adv, pred_adv, num_queries = [], [], []

    for y in tqdm(range(10), desc='Targeting'):
        x_inv = inversion(
            model=model,
            x0=x0,
            category=y,
            stepsize=stepsize,
            max_iters=max_iters,
            clamp=clamp,
            geometry='linf',
        ).cpu()

        path_adv.append(x_inv)

    # One seed copy and one list of inverted images per seed class.
    all_paths.append(path[0])
    all_paths_adv.append(path_adv)

In [None]:
# Grid of results: one row per seed. The first column shows the seed itself,
# the remaining columns show the inversion towards each category.
plt.figure(figsize=(20,20))
count = 1  # running 1-based subplot index, advanced row by row
fontsize = 16
fontweight = 'bold'
for i, (path, path_adv) in enumerate(zip(all_paths, all_paths_adv)):
    n_cols = len(path_adv) + 1
    first_row = (i == 0)  # column titles are drawn on the first row only

    # Seed image, labelled on the y-axis with the seed's class name.
    plt.subplot(10, n_cols, count)
    count += 1
    seed_img = normalize_img(path[0]).squeeze().permute(1,2,0)
    plt.imshow(seed_img)
    plt.xticks([])
    plt.yticks([])
    plt.ylabel(f'{LABEL_MAP[i]}', fontsize=fontsize, fontweight=fontweight)
    if first_row:
        plt.title('Seed $x_0$', fontsize=fontsize, fontweight=fontweight)

    # Inverted images, one per target category j.
    for j, img in enumerate(path_adv):
        plt.subplot(10, n_cols, count)
        count += 1
        img = normalize_img(img).squeeze().permute(1,2,0)
        ax = plt.imshow(img)
        plt.xticks([])
        plt.yticks([])
        if first_row:
            plt.title(f'{LABEL_MAP[j]}', fontsize=fontsize, fontweight=fontweight)