In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import matplotlib
import matplotlib.pyplot as plt
import gc
from captum.attr import *
import quantus
from torch.utils.data import DataLoader
import gc
import torchvision.transforms as transforms

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings emitted by the libraries
# used below (torch, quantus, pandas, ...), so anything they report through the
# warnings machinery will not show up in the cell outputs.
warnings.filterwarnings('ignore')

In [3]:
# Device configuration: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
#for natural and adversarial LeNet Model 
class LeNet_normal(nn.Module):
    """LeNet-5 style classifier: two conv/ReLU/max-pool stages, then three linear layers.

    Architecture taken from https://github.com/ChawDoe/LeNet5-MNIST-PyTorch.

    The network takes single-channel images and returns 10 raw scores per image
    (no softmax is applied). ``fc_1`` expects 256 = 16 * 4 * 4 flattened features,
    which is what a 28x28 input produces after the two conv/pool stages.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 1 -> 6 channels, 5x5 kernel, then ReLU and 2x2 max-pooling.
        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.pool_1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu_1 = nn.ReLU()
        # Stage 2: 6 -> 16 channels, 5x5 kernel, then ReLU and 2x2 max-pooling.
        self.conv_2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.pool_2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu_2 = nn.ReLU()
        # Classifier head: 256 -> 120 -> 84 -> 10.
        self.fc_1 = nn.Linear(in_features=256, out_features=120)
        self.relu_3 = nn.ReLU()
        self.fc_2 = nn.Linear(in_features=120, out_features=84)
        self.relu_4 = nn.ReLU()
        self.fc_3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 10)`` tensor of class scores."""
        features = self.conv_1(x)
        features = self.relu_1(features)
        features = self.pool_1(features)

        features = self.conv_2(features)
        features = self.relu_2(features)
        features = self.pool_2(features)

        # Collapse everything except the batch dimension before the linear layers.
        flat = features.view(features.shape[0], -1)

        hidden = self.relu_3(self.fc_1(flat))
        hidden = self.relu_4(self.fc_2(hidden))
        return self.fc_3(hidden)

In [5]:
def load_mnist_model(path):
    """Build a ``LeNet_normal`` and load the weights stored at ``path``.

    Args:
        path: location of a state dict saved with ``torch.save``.

    Returns:
        The model, placed on the notebook-level ``device`` and switched to
        evaluation mode.
    """
    model = LeNet_normal()
    # map_location remaps the stored tensors onto the selected device, so a
    # checkpoint written on a GPU machine can still be loaded on a CPU-only one.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict)
    # Follow the configured device instead of a hard-coded 'cuda', which raised
    # on machines without CUDA even though `device` already fell back to the CPU.
    model.to(device)
    model.eval()
    return model

In [6]:
# Location of the LeNet_normal state dict that is passed to load_mnist_model.
# NOTE(review): machine-specific absolute path (it also contains spaces); it has
# to be adjusted before the notebook can run anywhere else.
modelpath = "/data/virtual environments/adv detection by robustness/adv_detection/Adaptive attacks/Models/MNIST/mnist_model.pth"

In [7]:
# Load the classifier that every return_auc call below evaluates.
normal_model = load_mnist_model(modelpath)
normal_model.to(device)
# eval() returns the module itself, so as the last expression of the cell it
# also displays the layer summary.
normal_model.eval()

LeNet_normal(
  (conv_1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool_1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu_1): ReLU()
  (conv_2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (pool_2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu_2): ReLU()
  (fc_1): Linear(in_features=256, out_features=120, bias=True)
  (relu_3): ReLU()
  (fc_2): Linear(in_features=120, out_features=84, bias=True)
  (relu_4): ReLU()
  (fc_3): Linear(in_features=84, out_features=10, bias=True)
)

In [8]:
# MNIST converted to tensors (downloaded into ./sample_data when missing) and
# served in batches of 10.
# NOTE(review): despite the names `test_set` / `test_loader`, train=True selects
# the *training* split, and the loader is not referenced by any code visible in
# this notebook -- confirm whether train=False was intended or the cell can go.
test_set = torchvision.datasets.MNIST(root='./sample_data', train=True, transform=torchvision.transforms.ToTensor(), download=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=10, pin_memory=True)

# For the given adversarial and benign images, collect feature-attribution sensitivity and model-prediction sensitivity metrics, and save them to CSV files that are later used to evaluate detection performance.

In [9]:
# FGSM sample archives, one per perturbation budget. The eps value embedded in
# each file name equals 8/255, 16/255, 32/255 and 64/255 respectively.
# Paths are relative to the notebook's working directory.
adv_path1 = 'adv samples/MNIST/FGSM/0.03137254901960784eps.npz'
adv_path2 = 'adv samples/MNIST/FGSM/0.06274509803921569eps.npz'
adv_path3 = 'adv samples/MNIST/FGSM/0.12549019607843137eps.npz'
adv_path4 = 'adv samples/MNIST/FGSM/0.25098039215686274eps.npz'

In [10]:
import pandas as pd

In [11]:
def make_noise(x_batch, y_batch, spread):
    """Return a noisy copy of ``x_batch`` with per-sample Gaussian perturbations.

    Each sample receives zero-mean Gaussian noise whose standard deviation is
    ``spread`` times that sample's own value range (max - min). The perturbed
    sample is clipped to [0, 1].

    Args:
        x_batch: iterable of tensors, one per sample.
        y_batch: not used; kept so existing call sites keep working.
        spread: noise level relative to each sample's value range.

    Returns:
        The perturbed samples stacked into one tensor on the notebook-level
        ``device``.
    """
    def _perturb(sample):
        pixels = sample.data.cpu().numpy()
        sigma = spread * (np.max(pixels) - np.min(pixels))
        jitter = np.random.normal(0, sigma, pixels.shape).astype(np.float32)
        noisy = np.clip(pixels + jitter, 0, 1)
        return torch.from_numpy(noisy).cpu()

    perturbed = [_perturb(sample) for sample in x_batch]
    return torch.stack(perturbed).to(device)

In [12]:
def compute_metrics_benign(adv_path, normal_model, out_path="adaptive_Benign.csv",
                           max_samples=1000, batch_size=2):
    """Measure how sensitive logits and attributions of benign images are to noise.

    The benign images/labels (``b_images`` / ``b_labels``) are read from the
    ``.npz`` archive at ``adv_path``. For each mini-batch, three Gaussian-noise
    copies are created with ``make_noise`` (spread 0.005, 0.01 and 0.05) and,
    per sample and per noise level, two L1 distances are recorded:

    * between the logits of the clean and the noisy image, and
    * between the ``quantus.explain`` attributions of the clean and the noisy image.

    The six resulting series are written to ``out_path`` as a CSV with one row
    per metric and one column per sample.

    Args:
        adv_path: path of the ``.npz`` archive.
        normal_model: model used for the logits and for the attributions.
        out_path: CSV file to write (default: the file ``return_auc`` reads).
        max_samples: upper bound on the number of processed samples; ``None``
            processes everything.
        batch_size: number of samples handled per iteration.

    Returns:
        The ``pandas.DataFrame`` that was written to ``out_path``.
    """
    # The original message contained a "{}" placeholder that was never filled in.
    print("Computing metrics for {} for benign".format(adv_path))

    # np.load keeps the archive open until it is closed; arrays are read on item
    # access, so pull both out inside the context manager.
    with np.load(adv_path) as npobj:
        adaptive_image = npobj['b_images']
        adaptive_label = npobj['b_labels']

    images = torch.from_numpy(adaptive_image)
    labels = torch.from_numpy(adaptive_label)

    spreads = (0.005, 0.01, 0.05)
    # NOTE(review): the key below is spelled "method:" (with a trailing colon),
    # not "method". Presumably Quantus therefore never sees the requested
    # "Saliency" method and uses its default explainer -- confirm against the
    # Quantus docs. It is left unchanged on purpose: correcting it changes every
    # attribution value and invalidates the thresholds tuned in return_auc.
    explain_kwargs = {"method:": "Saliency", "device": device}

    # One list per noise level, in the same order as `spreads`.
    attribution_diffs = [[] for _ in spreads]   # attribution robustness
    logit_diffs = [[] for _ in spreads]         # logit robustness

    end = len(adaptive_label)
    if max_samples is not None:
        end = min(end, max_samples)

    for start in range(0, end, batch_size):
        stop = min(start + batch_size, end)
        images_adv = images[start:stop].to(device)
        y_pred_adv = labels[start:stop].to(device)

        # Only the logit values are needed, so skip building autograd graphs.
        with torch.no_grad():
            x_logits = normal_model(images_adv)
        gc.collect()
        torch.cuda.empty_cache()

        # approach: attribution and logit robustness
        a_batch = quantus.explain(
            model=normal_model, inputs=images_adv, targets=y_pred_adv, **explain_kwargs)

        # Draw the noisy copies in increasing-spread order (keeps the order in
        # which numpy's global RNG is consumed) and record the logit distances.
        noisy_batches = []
        for level, spread in enumerate(spreads):
            noisy_images = make_noise(images_adv, y_pred_adv, spread=spread)
            with torch.no_grad():
                noisy_logits = normal_model(noisy_images)
            diff = torch.norm(x_logits - noisy_logits, p=1, dim=1)
            logit_diffs[level].extend(diff.detach().cpu().numpy())
            noisy_batches.append(noisy_images)

        # Attribution distances; the explanations are assumed to be array-like
        # per sample (they are flattened and compared with numpy).
        for level, noisy_images in enumerate(noisy_batches):
            a_batch_noisy = quantus.explain(
                model=normal_model, inputs=noisy_images, targets=y_pred_adv, **explain_kwargs)
            for clean_attr, noisy_attr in zip(a_batch, a_batch_noisy):
                attribution_diffs[level].append(
                    np.linalg.norm(clean_attr.flatten() - noisy_attr.flatten(), ord=1))

    # Row order matters: return_auc selects rows by position.
    df = pd.DataFrame(
        attribution_diffs + logit_diffs,
        index=[
            "Gaussian1 attribution",
            "Gaussian2 attribution",
            "Gaussian3 attribution",
            "Gaussian1 logit robusntess",
            "Gaussian2 logit robusntess",
            "Gaussian3 logit robusntess",
        ])

    df.to_csv(out_path)
    return df

In [13]:
def compute_metrics_adv(adv_path, normal_model, out_path="adaptive_Adv.csv",
                        max_samples=1000, batch_size=2):
    """Measure how sensitive logits and attributions of adversarial images are to noise.

    The adversarial images/labels (``a_images`` / ``a_labels``) are read from the
    ``.npz`` archive at ``adv_path``. For each mini-batch, three Gaussian-noise
    copies are created with ``make_noise`` (spread 0.005, 0.01 and 0.05) and,
    per sample and per noise level, two L1 distances are recorded:

    * between the logits of the original and the noisy image, and
    * between the ``quantus.explain`` attributions of the original and the noisy image.

    The six resulting series are written to ``out_path`` as a CSV with one row
    per metric and one column per sample.

    Args:
        adv_path: path of the ``.npz`` archive.
        normal_model: model used for the logits and for the attributions.
        out_path: CSV file to write (default: the file ``return_auc`` reads).
        max_samples: upper bound on the number of processed samples; ``None``
            processes everything.
        batch_size: number of samples handled per iteration.

    Returns:
        The ``pandas.DataFrame`` that was written to ``out_path``.
    """
    # The original message contained a "{}" placeholder that was never filled in.
    print("Computing metrics for {} for adv".format(adv_path))

    # np.load keeps the archive open until it is closed; arrays are read on item
    # access, so pull both out inside the context manager.
    with np.load(adv_path) as npobj:
        adaptive_image = npobj['a_images']
        adaptive_label = npobj['a_labels']

    images = torch.from_numpy(adaptive_image)
    labels = torch.from_numpy(adaptive_label)

    spreads = (0.005, 0.01, 0.05)
    # NOTE(review): the key below is spelled "method:" (with a trailing colon),
    # not "method". Presumably Quantus therefore never sees the requested
    # "Saliency" method and uses its default explainer -- confirm against the
    # Quantus docs. It is left unchanged on purpose: correcting it changes every
    # attribution value and invalidates the thresholds tuned in return_auc.
    explain_kwargs = {"method:": "Saliency", "device": device}

    # One list per noise level, in the same order as `spreads`.
    attribution_diffs = [[] for _ in spreads]   # attribution robustness
    logit_diffs = [[] for _ in spreads]         # logit robustness

    end = len(adaptive_label)
    if max_samples is not None:
        end = min(end, max_samples)

    for start in range(0, end, batch_size):
        stop = min(start + batch_size, end)
        images_adv = images[start:stop].to(device)
        y_pred_adv = labels[start:stop].to(device)

        # Only the logit values are needed, so skip building autograd graphs.
        with torch.no_grad():
            x_logits = normal_model(images_adv)
        gc.collect()
        torch.cuda.empty_cache()

        # approach: attribution and logit robustness
        a_batch = quantus.explain(
            model=normal_model, inputs=images_adv, targets=y_pred_adv, **explain_kwargs)

        # Draw the noisy copies in increasing-spread order (keeps the order in
        # which numpy's global RNG is consumed) and record the logit distances.
        noisy_batches = []
        for level, spread in enumerate(spreads):
            noisy_images = make_noise(images_adv, y_pred_adv, spread=spread)
            with torch.no_grad():
                noisy_logits = normal_model(noisy_images)
            diff = torch.norm(x_logits - noisy_logits, p=1, dim=1)
            logit_diffs[level].extend(diff.detach().cpu().numpy())
            noisy_batches.append(noisy_images)

        # Attribution distances; the explanations are assumed to be array-like
        # per sample (they are flattened and compared with numpy).
        for level, noisy_images in enumerate(noisy_batches):
            a_batch_noisy = quantus.explain(
                model=normal_model, inputs=noisy_images, targets=y_pred_adv, **explain_kwargs)
            for clean_attr, noisy_attr in zip(a_batch, a_batch_noisy):
                attribution_diffs[level].append(
                    np.linalg.norm(clean_attr.flatten() - noisy_attr.flatten(), ord=1))

    # Row order matters: return_auc selects rows by position.
    df = pd.DataFrame(
        attribution_diffs + logit_diffs,
        index=[
            "Gaussian1 attribution",
            "Gaussian2 attribution",
            "Gaussian3 attribution",
            "Gaussian1 logit robusntess",
            "Gaussian2 logit robusntess",
            "Gaussian3 logit robusntess",
        ])

    df.to_csv(out_path)
    return df

In [14]:
def compute_TPR(adv1, a, b, adv2, c, d):
    """Return the percentage of adversarial samples flagged by the two-band detector.

    A sample is flagged when its first score lies outside ``[a, b]`` or its
    second score lies outside ``[c, d]``; every sample passed in is treated as a
    true positive candidate.

    Args:
        adv1: first score per sample.
        a, b: lower / upper bound of the accepted range for ``adv1``.
        adv2: second score per sample, paired position-wise with ``adv1``.
        c, d: lower / upper bound of the accepted range for ``adv2``.

    Returns:
        True-positive rate in percent (0-100). ``0.0`` when there are no pairs,
        instead of raising ``ZeroDivisionError``.
    """
    detected = 0
    total = 0
    for value1, value2 in zip(adv1, adv2):
        total += 1
        # Written as explicit "<" / ">" tests so NaN scores count as in-range,
        # exactly as in the original nested form.
        if value1 < a or value1 > b or value2 < c or value2 > d:
            detected += 1

    if total == 0:
        return 0.0
    return (detected / total) * 100

In [15]:
def compute_FPR(ap2a, k, l, ap2b, m, n):
    """Return the percentage of benign samples wrongly flagged by the two-band detector.

    A sample is flagged when its first score lies outside ``[k, l]`` or its
    second score lies outside ``[m, n]``; every sample passed in is treated as
    benign, so each flag is a false positive.

    Args:
        ap2a: first score per sample.
        k, l: lower / upper bound of the accepted range for ``ap2a``.
        ap2b: second score per sample, paired position-wise with ``ap2a``.
        m, n: lower / upper bound of the accepted range for ``ap2b``.

    Returns:
        False-positive rate in percent (0-100). ``0.0`` when there are no pairs,
        instead of raising ``ZeroDivisionError``.
    """
    flagged = 0
    total = 0
    for value6, value7 in zip(ap2a, ap2b):
        total += 1
        # Explicit "<" / ">" tests keep NaN scores counted as in-range.
        if value6 < k or value6 > l or value7 < m or value7 > n:
            flagged += 1

    # Divide by the number of pairs actually examined (as compute_TPR does)
    # rather than len(ap2a): identical for equal-length inputs, but also correct
    # for iterators and when the two sequences differ in length.
    if total == 0:
        return 0.0
    return (flagged / total) * 100

In [16]:
import sklearn
from sklearn.metrics import roc_auc_score

In [17]:
def return_auc(adv_path, model):
    """Sweep 11 detector thresholds over one sample archive and return the ROC area.

    Runs ``compute_metrics_benign`` and ``compute_metrics_adv`` on ``adv_path``
    (which write ``adaptive_Benign.csv`` / ``adaptive_Adv.csv``), reads the
    scores back, and evaluates ``compute_FPR`` on the benign scores and
    ``compute_TPR`` on the adversarial scores for each threshold setting.

    NOTE(review): the scores are taken from CSV rows 0 and 3, which by the row
    order written by the compute_metrics_* functions are the "Gaussian1"
    attribution and logit rows, although the original variable names and comment
    referred to "gaussian3" -- confirm which noise level is intended.

    NOTE(review): the area is computed over the 11 sampled (FPR, TPR) points
    only; the curve is not extended to (0, 0) or (1, 1).

    Args:
        adv_path: ``.npz`` archive holding the benign and adversarial samples.
        model: classifier handed on to the metric functions.

    Returns:
        Tuple ``(auc, fpr_results, tpr_results)`` with rates as fractions in [0, 1].
    """
    n_settings = 11

    # Accepted band for the logit score: fixed lower bound, shrinking upper bound.
    logit_low = [0.07] * n_settings
    logit_high = [3.3, 2.5, 2.1, 1.7, 1.5, 1.3, 1.1, 0.7, 0.5, 0.3, 0.1]

    # Accepted band for the attribution score: fixed lower bound, shrinking upper bound.
    attr_low = [0.1] * n_settings
    attr_high = [250, 200, 150, 120, 110, 100, 90, 80, 70, 30, 10]

    settings = list(zip(logit_low, logit_high, attr_low, attr_high))

    compute_metrics_benign(adv_path, model)
    compute_metrics_adv(adv_path, model)

    def _load_scores(csv_path):
        # The first entry of each row is the row label read back from the CSV;
        # drop it and keep the per-sample values.
        table = pd.read_csv(csv_path)
        attribution_scores = table.iloc[0].values.flatten().tolist()[1:]
        logit_scores = table.iloc[3].values.flatten().tolist()[1:]
        return logit_scores, attribution_scores

    benign_logit, benign_attr = _load_scores("adaptive_Benign.csv")
    fpr_results = [
        compute_FPR(benign_logit, lo, hi, benign_attr, attr_lo, attr_hi) / 100
        for lo, hi, attr_lo, attr_hi in settings
    ]

    adv_logit, adv_attr = _load_scores("adaptive_Adv.csv")
    tpr_results = [
        compute_TPR(adv_logit, lo, hi, adv_attr, attr_lo, attr_hi) / 100
        for lo, hi, attr_lo, attr_hi in settings
    ]

    area = sklearn.metrics.auc(fpr_results, tpr_results)
    return area, fpr_results, tpr_results

In [18]:
# FGSM, smallest budget: prints the (auc, fpr_results, tpr_results) tuple.
# Each return_auc call rewrites adaptive_Benign.csv and adaptive_Adv.csv.
print(return_auc(adv_path1, normal_model))
print('---')


Computing metrics for {} for benign
Computing metrics for {} for adv
(0.9675370000000001, [0.026000000000000002, 0.049, 0.095, 0.184, 0.237, 0.32, 0.397, 0.635, 0.802, 0.949, 0.9990000000000001], [0.902, 0.959, 0.985, 0.9940000000000001, 0.9940000000000001, 0.995, 0.997, 0.9990000000000001, 1.0, 1.0, 1.0])
---


In [19]:
# Remaining FGSM budgets: one result tuple per archive, separated by '---'.
for position, sample_path in enumerate((adv_path2, adv_path3, adv_path4)):
    if position > 0:
        print('---')
    print(return_auc(sample_path, normal_model))

Computing metrics for {} for benign
Computing metrics for {} for adv
(0.9789340000000001, [0.018, 0.04, 0.076, 0.166, 0.214, 0.282, 0.37, 0.612, 0.775, 0.944, 0.998], [0.966, 0.987, 0.995, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
---
Computing metrics for {} for benign
Computing metrics for {} for adv
(0.9814570000000001, [0.017, 0.048, 0.103, 0.177, 0.23, 0.287, 0.377, 0.627, 0.794, 0.954, 0.9990000000000001], [0.983, 0.995, 0.9990000000000001, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
---
Computing metrics for {} for benign
Computing metrics for {} for adv
(0.9830000000000001, [0.017, 0.042, 0.083, 0.168, 0.228, 0.291, 0.38, 0.616, 0.7829999999999999, 0.9469999999999998, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])


# PGD

In [20]:
# PGD sample archives; the eps in each file name equals 8/255, 16/255, 32/255
# and 64/255 respectively.
# NOTE: this rebinds adv_path1..adv_path4 from the FGSM section, so later cells
# depend on which path cell was executed last.
adv_path1 = 'adv samples/MNIST/PGD/0.03137254901960784eps.npz'
adv_path2 = 'adv samples/MNIST/PGD/0.06274509803921569eps.npz'
adv_path3 = 'adv samples/MNIST/PGD/0.12549019607843137eps.npz'
adv_path4 = 'adv samples/MNIST/PGD/0.25098039215686274eps.npz'

In [21]:
# PGD, smallest budget. Left as a bare expression so the notebook displays the
# returned (auc, fpr_results, tpr_results) tuple.
return_auc(adv_path1, normal_model)

Computing metrics for {} for benign
Computing metrics for {} for adv


(0.9840000000000001,
 [0.016, 0.047, 0.103, 0.172, 0.226, 0.3, 0.396, 0.633, 0.789, 0.959, 1.0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

In [22]:
# Remaining PGD budgets: one result tuple per archive, separated by '---'.
for position, sample_path in enumerate((adv_path2, adv_path3, adv_path4)):
    if position > 0:
        print('---')
    print(return_auc(sample_path, normal_model))

Computing metrics for {} for benign
Computing metrics for {} for adv
(0.978, [0.022, 0.052000000000000005, 0.099, 0.174, 0.22699999999999998, 0.31, 0.397, 0.629, 0.791, 0.956, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
---
Computing metrics for {} for benign
Computing metrics for {} for adv
(0.976, [0.024, 0.049, 0.098, 0.18099999999999997, 0.23799999999999996, 0.307, 0.397, 0.63, 0.773, 0.949, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
---
Computing metrics for {} for benign
Computing metrics for {} for adv
(0.969, [0.029000000000000005, 0.051, 0.091, 0.171, 0.218, 0.302, 0.403, 0.627, 0.789, 0.9519999999999998, 0.998], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])


# BIM

In [23]:
# BIM sample archives; the eps in each file name equals 8/255, 16/255, 32/255
# and 64/255 respectively.
# NOTE: this rebinds adv_path1..adv_path4 from the previous sections, so later
# cells depend on which path cell was executed last.
adv_path1 = 'adv samples/MNIST/BIM/0.03137254901960784eps.npz'
adv_path2 = 'adv samples/MNIST/BIM/0.06274509803921569eps.npz'
adv_path3 = 'adv samples/MNIST/BIM/0.12549019607843137eps.npz'
adv_path4 = 'adv samples/MNIST/BIM/0.25098039215686274eps.npz'

In [24]:
# BIM, smallest budget. Left as a bare expression so the notebook displays the
# returned (auc, fpr_results, tpr_results) tuple.
return_auc(adv_path1, normal_model)

Computing metrics for {} for benign
Computing metrics for {} for adv


(0.57069,
 [0.022,
  0.049,
  0.10400000000000001,
  0.17800000000000002,
  0.23,
  0.3,
  0.37200000000000005,
  0.624,
  0.785,
  0.959,
  1.0],
 [0.038, 0.078, 0.143, 0.24, 0.31, 0.389, 0.489, 0.714, 0.865, 0.968, 1.0])

In [25]:
# Remaining BIM budgets: one result tuple per archive, separated by '---'.
for position, sample_path in enumerate((adv_path2, adv_path3, adv_path4)):
    if position > 0:
        print('---')
    print(return_auc(sample_path, normal_model))

Computing metrics for {} for benign
Computing metrics for {} for adv
(0.5509325000000002, [0.019, 0.045, 0.092, 0.156, 0.203, 0.27, 0.363, 0.613, 0.7879999999999999, 0.945, 0.9990000000000001], [0.027000000000000003, 0.063, 0.11600000000000002, 0.22, 0.279, 0.35600000000000004, 0.449, 0.666, 0.8269999999999998, 0.971, 1.0])
---
Computing metrics for {} for benign
Computing metrics for {} for adv
(0.5161410000000001, [0.02, 0.04100000000000001, 0.09, 0.17800000000000002, 0.228, 0.289, 0.37799999999999995, 0.624, 0.77, 0.936, 0.9990000000000001], [0.042, 0.08200000000000002, 0.141, 0.207, 0.26, 0.33, 0.413, 0.619, 0.774, 0.935, 0.997])
---
Computing metrics for {} for benign
Computing metrics for {} for adv
(0.5319345, [0.025, 0.049, 0.10100000000000002, 0.182, 0.231, 0.295, 0.391, 0.628, 0.7760000000000001, 0.9419999999999998, 1.0], [0.051, 0.085, 0.132, 0.221, 0.276, 0.355, 0.44800000000000006, 0.655, 0.797, 0.948, 0.9990000000000001])


# CW

In [26]:
# CW sample archive (a single setting). This rebinds adv_path1 once more.
# NOTE(review): the file name starts with "00.15" (double leading zero) --
# confirm it matches the file on disk.
adv_path1 = 'adv samples/MNIST/CW/00.15eps.npz'

# Bare expression so the notebook displays the (auc, fpr_results, tpr_results) tuple.
return_auc(adv_path1, normal_model)

Computing metrics for {} for benign
Computing metrics for {} for adv


(0.3929875000000001,
 [0.024,
  0.05,
  0.086,
  0.179,
  0.23,
  0.296,
  0.384,
  0.628,
  0.7829999999999999,
  0.951,
  1.0],
 [0.018,
  0.029000000000000005,
  0.05600000000000001,
  0.10100000000000002,
  0.129,
  0.17299999999999996,
  0.23200000000000004,
  0.4640000000000001,
  0.659,
  0.905,
  0.996])