# Experiment 1


- Start with a single model trained on sample (0-9)
- Take adversarial attack of certain type, get new adversarial samples (10-19).
- Make a filter to distinguish the two samples. Also migrate to get a model for the adversarial samples.
- Take a different type of attack (on the original model). Get another set of adversrial samples.
- Take the normalized dot product between the two set of adversarial samples. If it is close to negative, then label them differently (20-29).
- If the dot product is big (angle is small), then compare the lengths of the two perturbations. If they are largely different, then still make different labels.
- Train new filter to distinguish the three. Also migrate.
- Keep repeating. The new adversarial sample is compared with the different old samples. If all dot products are close to negative, then make new labels.
- Such dot product measures the C^1 distance between different models.

Idea is ...

Suppose we have models $M_i$, $i = 0, 1, 2, \ldots$. Each has its own fuzziness, and adversarial attacks are generated based on the fuzziness (Fast Gradient Method, Projective gradient method, CW method, DeepFool method etc.). $\textrm{dist}_{C_0}(M_i, M_j)$ on a set of features $X$ is distance between function values (between fuzziness) and $\textrm{dist}_{C_1}(M_i, M_j)$ will be the distance between gradients at each features, which can be represented by adversarial attacks.

-> Similar models behave similarly upto $C_1$ distance, which can be verified via adversarial attacks. And different models shows different fuzziness. $C_0$ distance may not work in this way because models provide similar answer if they perform well.

What we need?

Similar models, different models, adversarial attacks.

Models :
ResNet20v1, ResNet20v2, ResNet56v1, ResNet56v2, VGG11, VGG13, VGG16, VGG19, 3 of each, labeled as 0, 1, 2

adversarial attacks dimensions :

Generating models: ResNet56v1, VGG16

Attack types: PGD (relatively large perturbation. **what should be the epsilon?  0.376 ~ 96/255**) and CWL2 (Low perturbation, model specified)

Attack directions : Targeted, Untargeted

In [None]:

import tensorflow as tf
import cleverhans
from tensorflow import keras
from keras.models import Model, load_model
from keras.datasets import cifar10
import numpy as np
from sklearn.model_selection import train_test_split
import os


cifar10_dataset = cifar10.load_data()
training, test = cifar10_dataset
x_tr, x_v, y_tr, y_v = train_test_split(training[0], training[1], test_size=0.2, random_state=42)
x_tr = x_tr.reshape(x_tr.shape[0],32,32,3)
x_v = x_v.reshape(x_v.shape[0],32,32,3)
x_v = x_v.astype('float32')
x_tr = x_tr.astype('float32')
x_v /= 255
x_tr /= 255
y_tr = keras.utils.to_categorical(y_tr, 10)
y_v = keras.utils.to_categorical(y_v,10)
x_test, y_test = test
x_test = x_test.reshape(x_test.shape[0],32,32,3)
x_test = x_test.astype('float32')
x_test /= 255 
y_test = keras.utils.to_categorical(y_test, 10)








In [None]:
model_name = 'n_9_v1_cifar10_1.keras'# ResNet56. 0.85M number of parameters.
GENmodel = load_model(f'CIFAR10models/{model_name}') 

## PGD method

In [None]:
# Generating Adversarial Examples using PGD method

# from cleverhans.tf2.attacks import projected_gradient_descent as pgd

'''
eps = 216/255 # value ranges between 0 - 255. allow to perturbe 216 at most.

eps_iters = [9/255,27/255,48/255,96/255]

nb_iter = 8
norm = 2
targeted = False

'''
nb_iter = 8
norm = 2

eps = 216/255
eps_iter = 96/255

label = np.round(eps_iter,3)
print(f'epsilon = {eps_iter}')


In [None]:
# targeted = False
# advs_x =[]
# for i in range(4):
#     print(f'{2500*i} - {2500*(i+1)}', end = ' ')
#     adv = pgd.projected_gradient_descent(GENmodel, x_test[2500*i:2500*(i+1)],eps,eps_iter,nb_iter,norm,y=tf.argmax(y_tr[2500*i:2500*(i+1)],1))
#     advs_x.append(adv)
# advs_x = np.concatenate(advs_x, axis=0)
# print(f"advs_x shape: {advs_x.shape}")
# np.save(f'adversarial_examples/pgd_{label}_x_test_untarget_gen_by_{model_name}.npy', advs_x)


# logits = GENmodel.predict(x_test, batch_size=2000)
# least_likely = tf.argmin(logits,axis=-1)
# y= least_likely

# targeted = True
# advs_x =[]
# for i in range(4):
#     print(f'{2500*i} - {2500*(i+1)}', end = ' ')
#     adv = pgd.projected_gradient_descent(GENmodel,x_test[2500*i:2500*(i+1)],eps,eps_iter,nb_iter,norm,y=y[2500*i:2500*(i+1)],targeted=targeted)
#     advs_x.append(adv) 
# advs_x = np.concatenate(advs_x, axis=0)
# print(f"advs_x_test shape: {advs_x.shape}")
# np.save(f'adversarial_examples/pgd_{label}_x_test_target_to_ll_gen_by_{model_name}.npy', advs_x)


## CWL2 method

In [None]:
# CarliniWagnerL2
# from cleverhans.tf2.attacks import carlini_wagner_l2 as cw 

In [None]:
# x = x_test
# y = y_test
# y = tf.argmax(y,axis=1)

In [None]:
# model_name = 'n_9_v1_cifar10.keras'# ResNet56. 0.85M number of parameters.
# GENmodel = load_model(f'CIFAR10models/{model_name}') 

# logits = GENmodel(x_test)
# least_likely = tf.argmin(logits,axis=-1)
# y= least_likely

In [None]:

# advs_x=[]
# adv_cwl2= cw.CarliniWagnerL2(GENmodel,max_iterations=100,binary_search_steps=5)
# merge_list_untargeted = []
# for j in range(10000):
#     adv_x = adv_cwl2.attack(x[j:j+1])
#     advs_x.append(adv_x)
#     if j == 0:
#         continue
#         # 0 ~ 1000, 1001 numbers
#         # 1001 ~ 2000 , ... , 39001 ~ 39999, 1000 numbers each
#     elif j%1000==0:
#         k = j//1000
#         adv = np.concatenate(advs_x, axis=0)

#         print(f"We are at {j}th adversarials. advs_x shape: {adv.shape}")
#         np.save(f'adversarial_examples/cwl2_x_test_untargeted_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy', adv)
#         merge_list_untargeted.append(f'adversarial_examples/cwl2_x_test_untargeted_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy')
#         print(f'saved to adversarial_examples with file name "cwl2_x_test_untargeted_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy" ')
#         advs_x=[]
#     elif j == 9999:
#         k = 10
#         adv = np.concatenate(advs_x, axis=0)
#         print(f"We are at {j}th adversarials. advs_x shape: {adv.shape}")
#         np.save(f'adversarial_examples/cwl2_x_test_untargeted_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy', adv)
#         merge_list_untargeted.append(f'adversarial_examples/cwl2_x_test_untargeted_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy')
#         print(f'saved to adversarial_examples with file name "cwl2_x_test_untargeted_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy" ')

# advs_x=[]     
# merge_list_targeted_to_ll = []
# for j in range(10000):
#     adv_cwl2= cw.CarliniWagnerL2(GENmodel,max_iterations=100,binary_search_steps=5,y=y[j:j+1],targeted=True)
#     adv_x = adv_cwl2.attack(x[j:j+1])
#     advs_x.append(adv_x)
#     if j == 0:
#         continue
#         # 0 ~ 1000, 1001 numbers
#         # 1001 ~ 2000 , ... , 39001 ~ 39999, 1000 numbers each
#     elif j%1000==0:
#         k = j//1000
#         adv = np.concatenate(advs_x, axis=0)

#         print(f"We are at {j}th adversarials. advs_x shape: {adv.shape}")
#         np.save(f'adversarial_examples/cwl2_x_test_targeted_to_ll_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy', adv)
#         merge_list_targeted_to_ll.append(f'adversarial_examples/cwl2_x_test_targeted_to_ll_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy')
#         print(f'saved to adversarial_examples with file name "cwl2_x_test_targeted_to_ll_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy" ')
#         advs_x=[]
#     elif j == 9999:
#         k = 10
#         adv = np.concatenate(advs_x, axis=0)
#         print(f"We are at {j}th adversarials. advs_x shape: {adv.shape}")
#         np.save(f'adversarial_examples/cwl2_x_test_targeted_to_ll_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy', adv)
#         merge_list_targeted_to_ll.append(f'adversarial_examples/cwl2_x_test_targeted_to_ll_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy')
#         print(f'saved to adversarial_examples with file name "cwl2_x_test_targeted_to_ll_{(k-1)*1000}to{k*1000}_gen_by_{model_name}.npy" ')

    

Models I have:

ResNet20v1 * 8, ResNet20v2 * 4, ResNet56v1 * 4, ResNet56v2 * 4

VGG11 * 4, VGG13 * 4, VGG16 * 4. VGG19 * 4

Adversarial Examples I have: (Only testing dataset)

PGD method, 96/255 ($\simeq 0.376$), generated by each model, untargeted or targeted to least likely class

CWL2 method, two examples, generated by ResNet56v1 (without label)

**Would it be better to have adversarial example from validation dataset?**


## Experiment 1 design

Hypothesis:

Resnet models share similar structure (and some of them are exactly the same), but possibly different weights. They behave similary to each adversarial attacks. Bad performance to adversarial examples generated by ResNet20v1 models, and high dot-product average for each adversaries.

That should be the same as for VGG models.

However, since CWL2 is very model specific attack with low perturbation yet effective, the dot product method to determine whether given models are similar would be not promising.

### import modules

In [3]:
import tensorflow as tf
import keras
import sys, os
print(tf.__version__)
sys.path.append("/Users/inkeejung/Library/CloudStorage/Dropbox-BOSTONUNIVERSITY/Inkee Jung/Inkee Jung’s files/Interpretability/AdvLogifold/computer/modules")

from keras.models import Model, load_model
from keras.datasets import cifar10
import logifoldv1_4 as logifold
from sklearn.model_selection import train_test_split
import numpy as np
from resnet import ResNet
from keras.layers import Dense
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
from keras import layers
import pickle
import pandas as pd
print('Modules are loaded. (local)')

dropbox_path = "/Users/inkeejung/Library/CloudStorage/Dropbox-BOSTONUNIVERSITY/Inkee Jung/Inkee Jung’s files/Interpretability/AdvLogifold/computer/"
base_models_path = dropbox_path + 'Base_Models/'
adversarial_examples_path = dropbox_path + 'Adversarial_Examples/'
adversarial_models_path = dropbox_path + 'Adversarial_Models/'

# test_logifold_path = dropbox_path + 'testfolder/logifold_test'+ date + '/'

2025-03-06 21:32:58.049177: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Original dataset loading

In [5]:
(x, y), (x_test, y_test)= cifar10.load_data()
x_tr, x_v, y_tr, y_v = train_test_split(x, y, test_size=0.2, random_state=42)
x_tr = x_tr.reshape(x_tr.shape[0],32,32,3)
x_v = x_v.reshape(x_v.shape[0],32,32,3)
x_v = x_v.astype('float32')
x_tr = x_tr.astype('float32')
x_v /= 255
x_tr /= 255
x_train, x_val, y_train, y_val = x_tr,x_v,y_tr,y_v
x_test = x_test.reshape(x_test.shape[0],32,32,3)
x_test = x_test.astype('float32')
x_test /= 255
original_dataset_table = {'train':(x_tr,y_tr)
                         ,'val':(x_val,y_val)
                         ,'test':(x_test,y_test)
                         } 
print('original dataset has been loaded.')
print("Hashtable 'original_dataset_table'. \n original_dataset_table[train/val/test] = (X,Y)")


original dataset has been loaded.
Hashtable 'original_dataset_table'. 
 original_dataset_table[train/val/test] = (X,Y)


### Adversarial examples loading

In [6]:
adv_x_tr = np.load(adversarial_examples_path + 'by_ResNet56/' + "pgd_0.376_x_untarget.npy")
adv_x_val = np.load(adversarial_examples_path + 'by_ResNet56/' + "pgd_0.376_x_val_untarget.npy")
adv_x_test = np.load(adversarial_examples_path + 'by_ResNet56/'+"pgd_0.376_x_test_untarget.npy")
adv_ll_x_tr = np.load(adversarial_examples_path + 'by_ResNet56/'+"pgd_0.376_x_target_to_ll.npy")
adv_ll_x_v = np.load(adversarial_examples_path + 'by_ResNet56/'+"pgd_0.376_x_val_target_to_ll.npy")
adv_ll_x_test = np.load(adversarial_examples_path + 'by_ResNet56/'+"pgd_0.376_x_test_target_to_ll.npy")
adv_2nd_x_tr = np.load(adversarial_examples_path + 'by_ResNet56/'+"pgd_0.376_x_target_to_second.npy")
adv_2nd_x_v = np.load(adversarial_examples_path + 'by_ResNet56/'+"pgd_0.376_x_val_target_to_second.npy")
adv_2nd_x_test = np.load(adversarial_examples_path + 'by_ResNet56/'+"pgd_0.376_x_test_target_to_second.npy")
adv_cwl2_x_tr = np.load(adversarial_examples_path + 'by_ResNet56/'+"cwl2_x_tr_untargeted.npy")
adv_cwl2_x_v = np.load(adversarial_examples_path + 'by_ResNet56/'+"cwl2_x_v_untargeted.npy")
adv_cwl2_x_test = np.load(adversarial_examples_path + 'by_ResNet56/'+"cwl2_x_test_untargeted.npy")
adv_cwl2_ll_x_test = np.load(adversarial_examples_path+'by_ResNet56/'+"cwl2_x_test_targeted_to_ll.npy")
print('adversarial dataset has been loaded.')
print('CWL2(untargeted) and PGD(untargeted, targeted to least likely class). Training, validation, and testing data')
VGG_adv_x_tr = np.load(adversarial_examples_path + 'by_VGG16/' + "pgd_0.376_x_untarget.npy")
VGG_adv_x_val = np.load(adversarial_examples_path + 'by_VGG16/' + "pgd_0.376_x_val_untarget.npy")
VGG_adv_x_test = np.load(adversarial_examples_path + 'by_VGG16/'+"pgd_0.376_x_test_untarget.npy")
VGG_adv_ll_x_tr = np.load(adversarial_examples_path + 'by_VGG16/'+"pgd_0.376_x_target_to_ll.npy")
VGG_adv_ll_x_v = np.load(adversarial_examples_path + 'by_VGG16/'+"pgd_0.376_x_val_target_to_ll.npy")
VGG_adv_ll_x_test = np.load(adversarial_examples_path + 'by_VGG16/'+"pgd_0.376_x_test_target_to_ll.npy")

print('adversarial dataset generated by VGG16 has been loaded.')
print('PGD(untargeted, targeted to least likely class). Training, validation, and testing data')


adversarial dataset has been loaded.
CWL2(untargeted) and PGD(untargeted, targeted to least likely class). Training, validation, and testing data
adversarial dataset generated by VGG16 has been loaded.
PGD(untargeted, targeted to least likely class). Training, validation, and testing data


### Adversarial dataset hash table

In [55]:
def convert_name(resnet):
    if resnet == '31':
        return 'ResNet20v1'
    elif resnet == '32':
        return 'ResNet20v2'
    elif resnet == '91':
        return 'ResNet56v1'
    elif resnet == '92':
        return 'ResNet56v2'

In [88]:
adv_dataset_table = {}
adv_dataset_table['ResNet'] = {}
adv_dataset_table['VGG'] = {}
for vgg in ['11','13','16','19']:
    adv_dataset_table['VGG'][vgg] = {}
    adv_dataset_table['VGG'][vgg]['untargeted'] = []
    adv_dataset_table['VGG'][vgg]['targeted'] = []
    for directory in os.listdir(adversarial_examples_path+'by_VGG'):
        if vgg in directory:
            if 'untarget' in directory:
                if len(directory) == 38:
                    adv_dataset_table['VGG'][vgg]['untargeted'].append(('0',np.load(adversarial_examples_path + 'by_VGG/' + directory)))
                else:
                    adv_dataset_table['VGG'][vgg]['untargeted'].append((str(int(directory[-5])+1),np.load(adversarial_examples_path + 'by_VGG/' + directory)))
            else:
                if len(directory) == 42:
                    adv_dataset_table['VGG'][vgg]['targeted'].append(('0',np.load(adversarial_examples_path + 'by_VGG/' + directory)))
                else:
                    
                    adv_dataset_table['VGG'][vgg]['targeted'].append((str(int(directory[-5])+1),np.load(adversarial_examples_path + 'by_VGG/' + directory)))



In [98]:
len(adv_dataset_table['VGG']['16']['untargeted'])

4

In [99]:
for resnet in ['31','32','91','92']:
    name = convert_name(resnet)
    adv_dataset_table['ResNet'][name] = {}
    adv_dataset_table['ResNet'][name]['untargeted'] = []
    adv_dataset_table['ResNet'][name]['targeted'] = []
    for directory in os.listdir(adversarial_examples_path+'by_ResNet'):
        
        if 'pgd' in directory and 'test' in directory and 'gen_by' in directory:
            
            if 'untarget' in directory:
                which_it_is = directory[35] + directory[38]
                if resnet == which_it_is:
                    if len(directory) == 57:
                        adv_dataset_table['ResNet'][name]['untargeted'].append(('0',np.load(adversarial_examples_path + 'by_ResNet/' + directory)))
                    else:
                        adv_dataset_table['ResNet'][name]['untargeted'].append((directory[-11],np.load(adversarial_examples_path + 'by_ResNet/' + directory)))

            else:
                which_it_is = directory[39] + directory[42]
                if resnet == which_it_is:
                    if len(directory) == 61:
                        adv_dataset_table['ResNet'][name]['targeted'].append(('0',np.load(adversarial_examples_path + 'by_ResNet/' + directory)))
                    else:
                        adv_dataset_table['ResNet'][name]['targeted'].append((directory[-11],np.load(adversarial_examples_path + 'by_ResNet/' + directory)))

        

### dot product def

In [21]:
def dot_product(x,y,application_pt):
    x = x.flatten()
    y = y.flatten()
    p = application_pt.flatten()
    pert_x = x - p
    pert_y = y - p
    size_of_pert_x = np.dot(pert_x,pert_x)**(0.5)
    size_of_pert_y = np.dot(pert_y,pert_y)**(0.5)
    if size_of_pert_x == 0 or size_of_pert_y == 0:
        return 2
    return np.dot(pert_x/size_of_pert_x,pert_y/size_of_pert_y)

In [8]:
num_samples = 10000 
indices = np.arange(num_samples)


### Experiment 1-1

Compare VGG, PGD, Untargeted with ResNet, PGD, Untargeted adversarials.

In [104]:
for data in adv_dataset_table['ResNet']['ResNet20v1']['untargeted']:
    print(data[0])

5
3
4
2
7
0
1
6


In [138]:
def function(modeltype1, modelname1,modeltype2, modelname2, adversarial_direction1, adversarial_direction2, label1, label2):
    dot_products = {
    "test": [],
    "average_test" : [],
    
    }
    cnt = 0


    for dataset in dot_products.keys():
        if dataset[:2] == 'av':
            continue
        for i in range(num_samples):
            app_pt = original_dataset_table[dataset][0][i] 
            dot_product_value = dot_product(adv_dataset_table[f'{modeltype1}'][f'{modelname1}'][f'{adversarial_direction1}'][label1][1][i], 
                            adv_dataset_table[f'{modeltype2}'][f'{modelname2}'][f'{adversarial_direction2}'][label2][1][i], 
                            app_pt)
            if dot_product_value == 2:
                cnt += 1
                dot_product_value = 0
            dot_products[dataset].append(
                dot_product_value
                )

    average_test = np.sum(dot_products['test'])/num_samples 

    dot_products['average_test'] = [average_test for _ in range(num_samples)]

    # plt.figure(figsize=(8, 2))
    # for label, values in dot_products.items():
    #     if label[-2:] == 'in':
    #         color = 'blue'
    #     elif label[-2:] == 'al':
    #         color = 'green'
    #     else:
    #         color = 'red'
    #     if label[:2] == 'av':
    #         plt.scatter(indices, values, alpha=1, s=0.01, color = color)
    #     else:
    #         plt.scatter(indices, values, label=label, alpha=0.6, s=0.1,color = color)

    # plt.xlabel(f"Average = {average_test}\n and there are {cnt} number of non-perturbed adversarial (which induce ZeroDivisionError)")
    # plt.ylabel("dot product")
    # plt.title(f"Dot Product Behavior Across Adversarial Examples\n {modelname1, adv_dataset_table[f'{modeltype1}'][f'{modelname1}'][f'{adversarial_direction1}'][label1][0]} - {modelname2, adv_dataset_table[f'{modeltype2}'][f'{modelname1}'][f'{adversarial_direction1}'][label2][0]} (PGD, {adversarial_direction1})")
    # plt.axhline(0, color='black', linewidth=0.8, linestyle='--')
    # plt.ylim(-1, 1)
    # plt.grid(False)

    # plt.show()
    return average_test

In [128]:
modeltype1 = 'ResNet'
modeltype2 = 'ResNet'
modelname1 = 'ResNet56v1'
modelname2 = 'ResNet56v1'
adversarial_direction1 = 'untargeted'
adversarial_direction2 = 'untargeted'
values = {}
for label1 in range(4):
    for label2 in range(4):
        values[(label1,label2)] = round(function(modeltype1,modelname1,modeltype2,modelname2,adversarial_direction1,adversarial_direction2,label1,label2)
                                        ,4)



# Create a DataFrame to represent the table
table = pd.DataFrame(np.nan, index=range(4), columns=range(4))  # Initialize with NaN

# Fill in the values
for (x, y), val in values.items():
    table.loc[y, x] = val
    
print(table)


        0       1       2       3
0  1.0000  0.1021 -0.0506  0.1023
1  0.1021  1.0000 -0.0520  0.1005
2 -0.0506 -0.0520  1.0000 -0.0509
3  0.1023  0.1005 -0.0509  0.9999


ResNet 20 v1 , v2, ResNet56 v1, v2  4 number of models
VGG 11, 13 , 16, 19 3 number of models

PGD type adversarial examples, targeted or untargeted 28 examples

CWL2 , PGD 200/255 : cap of perturbation, 96/255

In [161]:
for modeltype in ['ResNet20v1','ResNet20v2','ResNet56v1', 'ResNet56v2']:
    modeltype1 = modeltype2 = 'ResNet'
    modelname1 = modelname2 = modeltype
    for adversarial_direction in ['untargeted', 'targeted']:
        adversarial_direction1 = adversarial_direction2 = adversarial_direction
        values = {}
        for label1 in range(4):
            for label2 in range(4):
                values[(label1,label2)] = function(modeltype1,modelname1,modeltype2,modelname2,adversarial_direction1,adversarial_direction2,label1,label2)




        # Create a DataFrame to represent the table
        table = pd.DataFrame(np.nan, index=range(4), columns=range(4))  # Initialize with NaN

        # Fill in the values
        for (x, y), val in values.items():
            table.loc[y, x] = 10**val
        
        av = 0
        for label1 in range(4):
            for label2 in range(4):
                if label1 > label2:
                    av += 10**values[(label1,label2)]-1
        av = (av)/6
        
        print('------------------------------------------------------------------\n',f'{modeltype}, {adversarial_direction}','\n', table,'\n', 'average = ', av,'\n------------------------------------------------------------------\n')

------------------------------------------------------------------
 ResNet20v1, untargeted 
           0          1          2         3
0  9.997697   1.270610   1.284378  1.263122
1  1.270610  10.000000   1.269944  1.248033
2  1.284378   1.269944  10.000000  1.264265
3  1.263122   1.248033   1.264265  9.997697 
 average =  0.2667253898881082 
------------------------------------------------------------------

------------------------------------------------------------------
 ResNet20v1, targeted 
           0         1         2         3
0  9.997698  1.052775  1.052395  1.053828
1  1.052775  9.997698  1.057413  1.061182
2  1.052395  1.057413  9.997698  1.060418
3  1.053828  1.061182  1.060418  9.997698 
 average =  0.056335181065390226 
------------------------------------------------------------------

------------------------------------------------------------------
 ResNet20v2, untargeted 
            0          1          2          3
0  10.000000   1.269701   1.245731   1.2558

$x^* - x$

$f, g$ 

$f(x) - g(x)$

$f(x^*) - f(x) ,g(x^*)-g(x)$

In [162]:
for modeltype in ['VGG11','VGG13','VGG16', 'VGG19']:
    modeltype1 = modeltype2 = 'VGG'
    modelname1 = modelname2 = modeltype[-2:]
    for adversarial_direction in ['untargeted', 'targeted']:
        adversarial_direction1 = adversarial_direction2 = adversarial_direction
        values = {}
        for label1 in range(3):
            for label2 in range(3):
                values[(label1,label2)] = function(modeltype1,modelname1,modeltype2,modelname2,adversarial_direction1,adversarial_direction2,label1,label2)




        # Create a DataFrame to represent the table
        table = pd.DataFrame(np.nan, index=range(3), columns=range(3))  # Initialize with NaN

        # Fill in the values
        for (x, y), val in values.items():
            table.loc[y, x] = 10**val
        
        av = 0
        for label1 in range(3):
            for label2 in range(3):
                if label1 > label2:
                    av += 10**values[(label1,label2)]-1
        av = av/3
        
        print('------------------------------------------------------------------\n',f'{modeltype}, {adversarial_direction}','\n', table,'\n', 'average = ', av,'\n------------------------------------------------------------------\n')

------------------------------------------------------------------
 VGG11, untargeted 
           0         1         2
0  9.970111  1.554128  1.531339
1  1.554128  9.876430  1.524588
2  1.531339  1.524588  9.960932 
 average =  0.5366849963271966 
------------------------------------------------------------------

------------------------------------------------------------------
 VGG11, targeted 
           0         1         2
0  9.908320  1.210229  1.194050
1  1.210229  9.871883  1.191703
2  1.194050  1.191703  9.954054 
 average =  0.19866071881528913 
------------------------------------------------------------------

------------------------------------------------------------------
 VGG13, untargeted 
           0         1         2
0  9.767873  1.676182  1.696624
1  1.676182  9.781377  1.705147
2  1.696624  1.705147  9.736435 
 average =  0.6926507275206912 
------------------------------------------------------------------

--------------------------------------------------

In [163]:
for modeltype in ['VGG11','VGG13','VGG16', 'VGG19']:
    modeltype1 = 'VGG'
    modelname1 = modeltype[-2:]
    for modeltype in ['ResNet20v1','ResNet20v2','ResNet56v1', 'ResNet56v2']:
        modeltype2 = 'ResNet'
        modelname2 = modeltype
        for adversarial_direction in ['untargeted', 'targeted']:
            adversarial_direction1 = adversarial_direction2 = adversarial_direction
            values = {}
            for label1 in range(3):
                for label2 in range(4):
                    values[(label1,label2)] = function(modeltype1,modelname1,modeltype2,modelname2,adversarial_direction1,adversarial_direction2,label1,label2)




            # Create a DataFrame to represent the table
            table = pd.DataFrame(np.nan, index=range(3), columns=range(3)) 
            # Fill in the values
            for (x, y), val in values.items():
                table.loc[y, x] = val
            
            av = 0
            for label1 in range(3):
                for label2 in range(4):
                    if label1 != label2:
                        av += 10**values[(label1,label2)]-1
            av = av/12
            
            print('------------------------------------------------------------------\n',
                  f'{modeltype1, modelname1} and {modeltype2, modelname2}, {adversarial_direction}','\n', 
                #   table,'\n', 
                  'average = ', av,'\n------------------------------------------------------------------\n')

------------------------------------------------------------------
 ('VGG', '11') and ('ResNet', 'ResNet20v1'), untargeted 
 average =  0.10725363658277 
------------------------------------------------------------------

------------------------------------------------------------------
 ('VGG', '11') and ('ResNet', 'ResNet20v1'), targeted 
 average =  0.0292808256659276 
------------------------------------------------------------------

------------------------------------------------------------------
 ('VGG', '11') and ('ResNet', 'ResNet20v2'), untargeted 
 average =  0.09912972351456852 
------------------------------------------------------------------

------------------------------------------------------------------
 ('VGG', '11') and ('ResNet', 'ResNet20v2'), targeted 
 average =  0.024285991651180856 
------------------------------------------------------------------

------------------------------------------------------------------
 ('VGG', '11') and ('ResNet', 'ResNet56v

In [164]:
for modeltype in ['VGG11','VGG13','VGG16', 'VGG19']:
    modeltype1 = modeltype2 = 'VGG'
    modelname1 = modelname2 = modeltype[-2:]
    adversarial_direction1= 'targeted'
    adversarial_direction2 = 'untargeted'

    values = {}
    for label1 in range(3):
        for label2 in range(3):
            values[(label1,label2)] = function(modeltype1,modelname1,modeltype2,modelname2,adversarial_direction1,adversarial_direction2,label1,label2)




    # Create a DataFrame to represent the table
    table = pd.DataFrame(np.nan, index=range(3), columns=range(3))  # Initialize with NaN

    # Fill in the values
    for (x, y), val in values.items():
        table.loc[y, x] = val
    
    av = 0
    for label1 in range(3):
        for label2 in range(3):
            if label1 > label2:
                av += 10**values[(label1,label2)]-1
    av = av/9
    
    print('------------------------------------------------------------------\n',f'{modeltype}, {adversarial_direction1} vs {adversarial_direction2} ','\n', table,'\n', 'average = ', av,'\n------------------------------------------------------------------\n')

------------------------------------------------------------------
 VGG11, targeted vs untargeted  
           0         1         2
0 -0.091176 -0.092717 -0.086082
1 -0.091023 -0.417757 -0.083761
2 -0.089296 -0.089314 -0.402873 
 average =  -0.12314057316789546 
------------------------------------------------------------------

------------------------------------------------------------------
 VGG13, targeted vs untargeted  
           0         1         2
0 -0.101370 -0.102338 -0.392049
1 -0.103295 -0.102279 -0.098092
2 -0.105765 -0.388785 -0.102004 
 average =  -0.22510542487426802 
------------------------------------------------------------------

------------------------------------------------------------------
 VGG16, targeted vs untargeted  
           0         1         2
0 -0.089668 -0.096429 -0.356016
1 -0.090286 -0.092445 -0.092628
2 -0.086901 -0.343667 -0.092016 
 average =  -0.20738378233904695 
------------------------------------------------------------------

----

In [165]:
for modeltype in ['ResNet20v1','ResNet20v2','ResNet56v1', 'ResNet56v2']:
    modeltype1 = modeltype2 = 'ResNet'
    modelname1 = modelname2 = modeltype
    adversarial_direction1 = 'targeted'
    adversarial_direction2 = 'untargeted'
    values = {}
    for label1 in range(4):
        for label2 in range(4):
            values[(label1,label2)] = function(modeltype1,modelname1,modeltype2,modelname2,adversarial_direction1,adversarial_direction2,label1,label2)




    # Create a DataFrame to represent the table
    table = pd.DataFrame(np.nan, index=range(4), columns=range(4))  # Initialize with NaN

    # Fill in the values
    for (x, y), val in values.items():
        table.loc[y, x] = val
    
    av = 0
    for label1 in range(4):
        for label2 in range(4):
            if label1 != label2:
                av += 10**values[(label1,label2)]-1
    av = av/16
    
    print('------------------------------------------------------------------\n',f'{modeltype}, {adversarial_direction1} vs {adversarial_direction2} ','\n', table,'\n', 'average = ', av,'\n------------------------------------------------------------------\n')

------------------------------------------------------------------
 ResNet20v1, targeted vs untargeted  
           0         1         2         3
0 -0.033329 -0.037570 -0.035756 -0.276788
1 -0.033649 -0.036564 -0.034849 -0.038322
2 -0.035641 -0.037021 -0.037464 -0.040476
3 -0.031743 -0.036378 -0.033863 -0.037260 
 average =  -0.08404103423522183 
------------------------------------------------------------------

------------------------------------------------------------------
 ResNet20v2, targeted vs untargeted  
           0         1         2         3
0 -0.035140 -0.034728 -0.246032 -0.037597
1 -0.036561 -0.034409 -0.036496 -0.251973
2 -0.250291 -0.032939 -0.034084 -0.036675
3 -0.034003 -0.231369 -0.035331 -0.035992 
 average =  -0.14702068308807834 
------------------------------------------------------------------

------------------------------------------------------------------
 ResNet56v1, targeted vs untargeted  
           0         1         2         3
0 -0.037315 -0

In [166]:
for modeltype in ['VGG11','VGG13','VGG16', 'VGG19']:
    modeltype1 = 'VGG'
    modelname1 = modeltype[-2:]
    for modeltype in ['ResNet20v1','ResNet20v2','ResNet56v1', 'ResNet56v2']:
        modeltype2 = 'ResNet'
        modelname2 = modeltype
    
        adversarial_direction1 = 'untargeted'
        adversarial_direction2 = 'targeted'
        values = {}
        for label1 in range(3):
            for label2 in range(4):
                values[(label1,label2)] = function(modeltype1,modelname1,modeltype2,modelname2,adversarial_direction1,adversarial_direction2,label1,label2)




        # Create a DataFrame to represent the table
        table = pd.DataFrame(np.nan, index=range(3), columns=range(3)) 
        # Fill in the values
        for (x, y), val in values.items():
            table.loc[y, x] = val
        
        av = 0
        for label1 in range(3):
            for label2 in range(4):
                if label1 != label2:
                    av += 10**values[(label1,label2)]-1
        av = av/12
        
        print('------------------------------------------------------------------\n',
              f'{modeltype1, modelname1, adversarial_direction1} and {modeltype2, modelname2, adversarial_direction2}','\n', 
            #   table,'\n', 
              'average = ', av,'\n------------------------------------------------------------------\n')

------------------------------------------------------------------
 ('VGG', '11', 'untargeted') and ('ResNet', 'ResNet20v1', 'targeted') 
 average =  -0.03503888139526693 
------------------------------------------------------------------

------------------------------------------------------------------
 ('VGG', '11', 'untargeted') and ('ResNet', 'ResNet20v2', 'targeted') 
 average =  -0.03091077572598024 
------------------------------------------------------------------

------------------------------------------------------------------
 ('VGG', '11', 'untargeted') and ('ResNet', 'ResNet56v1', 'targeted') 
 average =  -0.03213812351429383 
------------------------------------------------------------------

------------------------------------------------------------------
 ('VGG', '11', 'untargeted') and ('ResNet', 'ResNet56v2', 'targeted') 
 average =  -0.02686232816667559 
------------------------------------------------------------------

----------------------------------------

## Experiment 1 result.

Perhaps due to the high dimensionanlity, perturbed example is nearly orthogonal to the original regardless of the attack types and directions.
Thankfully, there were tendencies. To see the difference more saliently, we take power of 10, and subtract 1 because if it was negative, then it would be better to have negative after taking power.

.|average dot product |
--- | --- |
Untargeted, same model(ResNet) | 0.2 - 0.25 (One exception, which is about 0.07) |
Targeted, same model(ResNet) | 0.05 - 0.07 |
Untargeted, same model(VGG) | 0.5 - 0.7 |
Targeted, same model(VGG) | 0.15 - 0.2|
Untargeted, diff model | 0.06 - 0.2 |
Targeted, diff model | 0.02 - 0.05 |
Targeted vs Untargeted, same model(VGG) | $-0.12$ - $-0.22$|
Targeted vs Untargeted, same model(ResNet) | $-0.08$ - $-0.14$|
Different directions, different model | $-0.03$ - $-0.06$|



# Experiment 2

Adversarial examples generated by other model won't be effective when domain is restricted to certain part.

testing dataset -> prediction via model A -> take maximum , cut off by certainty threshold

$x_1, x_2, x_3$ -> 0.9, 0.6, 0.8, threshold = 0.9 -> x_1 


$x^*_1, x^*_2, x^*_3$ -> 0.6, 0.9, 0.8, threhold = 0.9 -> x_2

In [201]:
# Original dataset : testing dataset
cifar10_dataset = cifar10.load_data()
training, test = cifar10_dataset
x_tr, x_v, y_tr, y_v = train_test_split(training[0], training[1], test_size=0.2, random_state=42)
x_tr = x_tr.reshape(x_tr.shape[0],32,32,3)
x_v = x_v.reshape(x_v.shape[0],32,32,3)
x_v = x_v.astype('float32')
x_tr = x_tr.astype('float32')
x_v /= 255
x_tr /= 255
y_tr = keras.utils.to_categorical(y_tr, 10)
y_v = keras.utils.to_categorical(y_v,10)
x_test, y_test = test
x_test = x_test.reshape(x_test.shape[0],32,32,3)
x_test = x_test.astype('float32')
x_test /= 255 
y_test = keras.utils.to_categorical(y_test, 10)

dropbox_path = "/Users/inkeejung/Library/CloudStorage/Dropbox-BOSTONUNIVERSITY/Inkee Jung/Inkee Jung’s files/Interpretability/AdvLogifold/computer/"
base_models_path = dropbox_path + 'Base_Models/'
adversarial_examples_path = dropbox_path + 'Adversarial_Examples/'
# load model A and adversarial testing dataset generated by model A
model_name = 'ResNet/n_9_v2_cifar10_1.keras'
modelA = load_model(base_models_path + f'CIFAR10models/{model_name}') 
adv_A = np.load(adversarial_examples_path+
                'by_ResNet/'+
                'pgd_0.376_x_test_target_to_ll_gen_by_n_9_v2_cifar10_1.keras.npy')
# load model B and adversarial testing dataset generated by model B
model_name = 'ResNet/n_9_v2_cifar10_2.keras'
modelB = load_model(base_models_path + f'CIFAR10models/{model_name}') 
adv_B = np.load(adversarial_examples_path+
                'by_ResNet/'+
                'pgd_0.376_x_test_target_to_ll_gen_by_n_9_v2_cifar10_2.keras.npy')
# See the accuracy of each one.
prediction_A_to_original = modelA.predict(x_test)
prediction_B_to_original = modelB.predict(x_test)
prediction_A_to_adv_A = modelA.predict(adv_A)
prediction_B_to_adv_A = modelB.predict(adv_A)
prediction_A_to_adv_B = modelA.predict(adv_B)
prediction_B_to_adv_B = modelB.predict(adv_B)
right_answer =np.argmax(y_test,axis=-1)
print(
    'A to original',np.sum(np.argmax(prediction_A_to_original,axis=-1) == right_answer)/10000,
    'B to original',np.sum(np.argmax(prediction_B_to_original,axis=-1) == right_answer)/10000,
    'A to advA',np.sum(np.argmax(prediction_A_to_adv_A,axis=-1) == right_answer)/10000,
    'B to advA',np.sum(np.argmax(prediction_B_to_adv_A,axis=-1) == right_answer)/10000,
    'A to advB',np.sum(np.argmax(prediction_A_to_adv_B,axis=-1) == right_answer)/10000,
    'B to advB',np.sum(np.argmax(prediction_B_to_adv_B,axis=-1) == right_answer)/10000,
    
)
# Check the dot product average

def dot_product(x,y,application_pt):
    x = x.flatten()
    y = y.flatten()
    p = application_pt.flatten()
    pert_x = x - p
    pert_y = y - p
    size_of_pert_x = np.dot(pert_x,pert_x)**(0.5)
    size_of_pert_y = np.dot(pert_y,pert_y)**(0.5)
    if size_of_pert_x == 0 or size_of_pert_y == 0:
        return 2
    return np.dot(pert_x/size_of_pert_x,pert_y/size_of_pert_y)

cnt = 0
dot_product_value  = 0
for i in range(10000):
    dot_product_value += dot_product(
        adv_A[i],
        adv_B[i],
        x_test[i])
    if dot_product_value == 2: # it's 2 if there is zero perturbation at least one adversarial
        cnt += 1
        dot_product_value += 0
    
average_dot_product = dot_product_value/10000
average_dot_product = 10**average_dot_product - 1 # It makes range into (-0.9, 10).
print('average dot', average_dot_product,'# of non-perturbation',cnt)
# Find certain part of A and B in the testing dataset.

def find_cert_part(x, alpha):
    return np.where(np.max(x,axis=-1)>alpha)[0]
    
alpha = 0.9
cert_A = find_cert_part(prediction_A_to_original, alpha)
cert_B = find_cert_part(prediction_B_to_original, alpha)


# compute the accuracy on the certain part for each dataset.

prediction_on_cert_modelA_to_adv_A = modelA.predict(adv_A[cert_A])
print('A to advA, cert',np.sum(np.argmax(prediction_on_cert_modelA_to_adv_A,axis=-1) == right_answer[cert_A])/10000)
prediction_on_cert_modelA_to_adv_B = modelA.predict(adv_B[cert_A])
print('A to advB, cert',np.sum(np.argmax(prediction_on_cert_modelA_to_adv_B,axis=-1) == right_answer[cert_A])/10000)
prediction_on_cert_modelB_to_adv_A = modelB.predict(adv_A[cert_B])
print('B to advA, cert',np.sum(np.argmax(prediction_on_cert_modelB_to_adv_A,axis=-1) == right_answer[cert_B])/10000)
prediction_on_cert_modelB_to_adv_B = modelB.predict(adv_B[cert_B])
print('B to advB, cert',np.sum(np.argmax(prediction_on_cert_modelB_to_adv_B,axis=-1) == right_answer[cert_B])/10000)
# Check the dot product average on the common certain part.


common_cert = np.where(np.prod([np.max(prediction_A_to_original,axis=-1)>alpha,np.max(prediction_B_to_original,axis=-1)>alpha],axis=0)==1)[0]

cnt = 0
dot_product_value  = 0
for i in common_cert:
    dot_product_value += dot_product(
        adv_A[i],
        adv_B[i],
        x_test[i])
    if dot_product_value == 2: # it's 2 if there is zero perturbation at least one adversarial
        cnt += 1
        dot_product_value += 0
    
average_dot_product = dot_product_value/len(common_cert)
average_dot_product = 10**average_dot_product - 1 # It makes range into (-0.9, 10).
print('size of common cert', len(common_cert),'average dot', average_dot_product)



print(f'summary: A to advB  \n{np.sum(np.argmax(prediction_A_to_adv_B,axis=-1) == right_answer)/10000} -> {np.sum(np.argmax(prediction_on_cert_modelA_to_adv_B,axis=-1) == right_answer[cert_A])/len(cert_A)},')
print(f'summary: B to advA  \n{np.sum(np.argmax(prediction_B_to_adv_A,axis=-1) == right_answer)/10000} -> {np.sum(np.argmax(prediction_on_cert_modelB_to_adv_A,axis=-1) == right_answer[cert_B])/len(cert_B)},')



    

A to original 0.8355 B to original 0.8065 A to advA 0.2025 B to advA 0.4037 A to advB 0.5301 B to advB 0.206
average dot 0.0656876316957633 # of non-perturbation 0
A to advA, cert 0.164
A to advB, cert 0.4711
B to advA, cert 0.3604
B to advB, cert 0.165
size of common cert 6384 average dot 0.07718825990023648
summary: A to advB  
0.5301 -> 0.6135712425110705,
summary: B to advA  
0.4037 -> 0.4897404538660144,


In [202]:
# Original dataset : testing dataset
cifar10_dataset = cifar10.load_data()
training, test = cifar10_dataset
x_tr, x_v, y_tr, y_v = train_test_split(training[0], training[1], test_size=0.2, random_state=42)
x_tr = x_tr.reshape(x_tr.shape[0],32,32,3)
x_v = x_v.reshape(x_v.shape[0],32,32,3)
x_v = x_v.astype('float32')
x_tr = x_tr.astype('float32')
x_v /= 255
x_tr /= 255
y_tr = keras.utils.to_categorical(y_tr, 10)
y_v = keras.utils.to_categorical(y_v,10)
x_test, y_test = test
x_test = x_test.reshape(x_test.shape[0],32,32,3)
x_test = x_test.astype('float32')
x_test /= 255 
y_test = keras.utils.to_categorical(y_test, 10)

dropbox_path = "/Users/inkeejung/Library/CloudStorage/Dropbox-BOSTONUNIVERSITY/Inkee Jung/Inkee Jung’s files/Interpretability/AdvLogifold/computer/"
base_models_path = dropbox_path + 'Base_Models/'
adversarial_examples_path = dropbox_path + 'Adversarial_Examples/'
# load model A and adversarial testing dataset generated by model A
model_name = 'ResNet/n_9_v2_cifar10_1.keras'
modelA = load_model(base_models_path + f'CIFAR10models/{model_name}') 
adv_A = np.load(adversarial_examples_path+
                'by_ResNet/'+
                'pgd_0.376_x_test_target_to_ll_gen_by_n_9_v2_cifar10_1.keras.npy')
# load model B and adversarial testing dataset generated by model B
model_name = 'ResNet/n_9_v2_cifar10_2.keras'
modelB = load_model(base_models_path + f'CIFAR10models/{model_name}') 
adv_B = np.load(adversarial_examples_path+
                'by_ResNet/'+
                'pgd_0.376_x_test_target_to_ll_gen_by_n_9_v2_cifar10_2.keras.npy')
# See the accuracy of each one.
prediction_A_to_original = modelA.predict(x_test)
prediction_B_to_original = modelB.predict(x_test)
prediction_A_to_adv_A = modelA.predict(adv_A)
prediction_B_to_adv_A = modelB.predict(adv_A)
prediction_A_to_adv_B = modelA.predict(adv_B)
prediction_B_to_adv_B = modelB.predict(adv_B)
right_answer =np.argmax(y_test,axis=-1)
print(
    'A to original',np.sum(np.argmax(prediction_A_to_original,axis=-1) == right_answer)/10000,
    'B to original',np.sum(np.argmax(prediction_B_to_original,axis=-1) == right_answer)/10000,
    'A to advA',np.sum(np.argmax(prediction_A_to_adv_A,axis=-1) == right_answer)/10000,
    'B to advA',np.sum(np.argmax(prediction_B_to_adv_A,axis=-1) == right_answer)/10000,
    'A to advB',np.sum(np.argmax(prediction_A_to_adv_B,axis=-1) == right_answer)/10000,
    'B to advB',np.sum(np.argmax(prediction_B_to_adv_B,axis=-1) == right_answer)/10000,
    
)
# Check the dot product average

def dot_product(x,y,application_pt):
    x = x.flatten()
    y = y.flatten()
    p = application_pt.flatten()
    pert_x = x - p
    pert_y = y - p
    size_of_pert_x = np.dot(pert_x,pert_x)**(0.5)
    size_of_pert_y = np.dot(pert_y,pert_y)**(0.5)
    if size_of_pert_x == 0 or size_of_pert_y == 0:
        return 2
    return np.dot(pert_x/size_of_pert_x,pert_y/size_of_pert_y)

cnt = 0
dot_product_value  = 0
for i in range(10000):
    dot_product_value += dot_product(
        adv_A[i],
        adv_B[i],
        x_test[i])
    if dot_product_value == 2: # it's 2 if there is zero perturbation at least one adversarial
        cnt += 1
        dot_product_value += 0
    
average_dot_product = dot_product_value/10000
average_dot_product = 10**average_dot_product - 1 # It makes range into (-0.9, 10).
print('average dot', average_dot_product,'# of non-perturbation',cnt)
# Find certain part of A and B in the testing dataset.

def find_cert_part(x, alpha):
    return np.where(np.max(x,axis=-1)>alpha)[0]
    
alpha = 0.99999
cert_A = find_cert_part(prediction_A_to_original, alpha)
cert_B = find_cert_part(prediction_B_to_original, alpha)


# compute the accuracy on the certain part for each dataset.

prediction_on_cert_modelA_to_adv_A = modelA.predict(adv_A[cert_A])
print('A to advA, cert',np.sum(np.argmax(prediction_on_cert_modelA_to_adv_A,axis=-1) == right_answer[cert_A])/10000)
prediction_on_cert_modelA_to_adv_B = modelA.predict(adv_B[cert_A])
print('A to advB, cert',np.sum(np.argmax(prediction_on_cert_modelA_to_adv_B,axis=-1) == right_answer[cert_A])/10000)
prediction_on_cert_modelB_to_adv_A = modelB.predict(adv_A[cert_B])
print('B to advA, cert',np.sum(np.argmax(prediction_on_cert_modelB_to_adv_A,axis=-1) == right_answer[cert_B])/10000)
prediction_on_cert_modelB_to_adv_B = modelB.predict(adv_B[cert_B])
print('B to advB, cert',np.sum(np.argmax(prediction_on_cert_modelB_to_adv_B,axis=-1) == right_answer[cert_B])/10000)
# Check the dot product average on the common certain part.


common_cert = np.where(np.prod([np.max(prediction_A_to_original,axis=-1)>alpha,np.max(prediction_B_to_original,axis=-1)>alpha],axis=0)==1)[0]

cnt = 0
dot_product_value  = 0
for i in common_cert:
    dot_product_value += dot_product(
        adv_A[i],
        adv_B[i],
        x_test[i])
    if dot_product_value == 2: # it's 2 if there is zero perturbation at least one adversarial
        cnt += 1
        dot_product_value += 0
    
average_dot_product = dot_product_value/len(common_cert)
average_dot_product = 10**average_dot_product - 1 # It makes range into (-0.9, 10).
print('size of common cert', len(common_cert),'average dot', average_dot_product)



print(f'summary: A to advB  \n{np.sum(np.argmax(prediction_A_to_adv_B,axis=-1) == right_answer)/10000} -> {np.sum(np.argmax(prediction_on_cert_modelA_to_adv_B,axis=-1) == right_answer[cert_A])/len(cert_A)},')
print(f'summary: B to advA  \n{np.sum(np.argmax(prediction_B_to_adv_A,axis=-1) == right_answer)/10000} -> {np.sum(np.argmax(prediction_on_cert_modelB_to_adv_A,axis=-1) == right_answer[cert_B])/len(cert_B)},')



    

A to original 0.8355 B to original 0.8065 A to advA 0.2025 B to advA 0.4037 A to advB 0.5301 B to advB 0.206
average dot 0.0656876316957633 # of non-perturbation 0
A to advA, cert 0.0474
A to advB, cert 0.1456
B to advA, cert 0.1091
B to advB, cert 0.0388
size of common cert 953 average dot 0.1000952576630274
summary: A to advB  
0.5301 -> 0.8529584065612185,
summary: B to advA  
0.4037 -> 0.787725631768953,


In [191]:
# Original dataset : testing dataset
cifar10_dataset = cifar10.load_data()
training, test = cifar10_dataset
x_tr, x_v, y_tr, y_v = train_test_split(training[0], training[1], test_size=0.2, random_state=42)
x_tr = x_tr.reshape(x_tr.shape[0],32,32,3)
x_v = x_v.reshape(x_v.shape[0],32,32,3)
x_v = x_v.astype('float32')
x_tr = x_tr.astype('float32')
x_v /= 255
x_tr /= 255
y_tr = keras.utils.to_categorical(y_tr, 10)
y_v = keras.utils.to_categorical(y_v,10)
x_test, y_test = test
x_test = x_test.reshape(x_test.shape[0],32,32,3)
x_test = x_test.astype('float32')
x_test /= 255 
y_test = keras.utils.to_categorical(y_test, 10)

dropbox_path = "/Users/inkeejung/Library/CloudStorage/Dropbox-BOSTONUNIVERSITY/Inkee Jung/Inkee Jung’s files/Interpretability/AdvLogifold/computer/"
base_models_path = dropbox_path + 'Base_Models/'
adversarial_examples_path = dropbox_path + 'Adversarial_Examples/'
# load model A and adversarial testing dataset generated by model A
model_name = 'ResNet/n_9_v2_cifar10_1.keras'
modelA = load_model(base_models_path + f'CIFAR10models/{model_name}') 
adv_A = np.load(adversarial_examples_path+
                'by_ResNet/'+
                'pgd_0.376_x_test_target_to_ll_gen_by_n_9_v2_cifar10_1.keras.npy')
# load model B and adversarial testing dataset generated by model B
model_name = 'ResNet/n_9_v2_cifar10_2.keras'
modelB = load_model(base_models_path + f'CIFAR10models/{model_name}') 
adv_B = np.load(adversarial_examples_path+
                'by_ResNet/'+
                'pgd_0.376_x_test_target_to_ll_gen_by_n_9_v2_cifar10_2.keras.npy')
# See the accuracy of each one.
prediction_A_to_original = modelA.predict(x_test)
prediction_B_to_original = modelB.predict(x_test)
prediction_A_to_adv_A = modelA.predict(adv_A)
prediction_B_to_adv_A = modelB.predict(adv_A)
prediction_A_to_adv_B = modelA.predict(adv_B)
prediction_B_to_adv_B = modelB.predict(adv_B)
right_answer =np.argmax(y_test,axis=-1)
print(
    'A to original',np.sum(np.argmax(prediction_A_to_original,axis=-1) == right_answer)/10000,
    'B to original',np.sum(np.argmax(prediction_B_to_original,axis=-1) == right_answer)/10000,
    'A to advA',np.sum(np.argmax(prediction_A_to_adv_A,axis=-1) == right_answer)/10000,
    'B to advA',np.sum(np.argmax(prediction_B_to_adv_A,axis=-1) == right_answer)/10000,
    'A to advB',np.sum(np.argmax(prediction_A_to_adv_B,axis=-1) == right_answer)/10000,
    'B to advB',np.sum(np.argmax(prediction_B_to_adv_B,axis=-1) == right_answer)/10000,
    
)
# Check the dot product average

def dot_product(x,y,application_pt):
    x = x.flatten()
    y = y.flatten()
    p = application_pt.flatten()
    pert_x = x - p
    pert_y = y - p
    size_of_pert_x = np.dot(pert_x,pert_x)**(0.5)
    size_of_pert_y = np.dot(pert_y,pert_y)**(0.5)
    if size_of_pert_x == 0 or size_of_pert_y == 0:
        return 2
    return np.dot(pert_x/size_of_pert_x,pert_y/size_of_pert_y)

cnt = 0
dot_product_value  = 0
for i in range(10000):
    dot_product_value += dot_product(
        adv_A[i],
        adv_B[i],
        x_test[i])
    if dot_product_value == 2: # it's 2 if there is zero perturbation at least one adversarial
        cnt += 1
        dot_product_value += 0
    
average_dot_product = dot_product_value/10000
average_dot_product = 10**average_dot_product - 1 # It makes range into (-0.9, 10).
print('average dot', average_dot_product,'# of non-perturbation',cnt)
# Find certain part of A and B in the testing dataset.

def find_cert_part(x, alpha):
    return np.where(np.max(x,axis=-1)>alpha)[0]
    
alpha = 0.95
cert_A = find_cert_part(prediction_A_to_original, alpha)
cert_B = find_cert_part(prediction_B_to_original, alpha)


# compute the accuracy on the certain part for each dataset.

prediction_on_cert_modelA_to_adv_A = modelA.predict(adv_A[cert_A])
print('A to advA, cert',np.sum(np.argmax(prediction_on_cert_modelA_to_adv_A,axis=-1) == right_answer[cert_A])/10000)
prediction_on_cert_modelA_to_adv_B = modelA.predict(adv_B[cert_A])
print('A to advB, cert',np.sum(np.argmax(prediction_on_cert_modelA_to_adv_B,axis=-1) == right_answer[cert_A])/10000)
prediction_on_cert_modelB_to_adv_A = modelB.predict(adv_A[cert_B])
print('B to advA, cert',np.sum(np.argmax(prediction_on_cert_modelB_to_adv_A,axis=-1) == right_answer[cert_B])/10000)
prediction_on_cert_modelB_to_adv_B = modelB.predict(adv_B[cert_B])
print('B to advB, cert',np.sum(np.argmax(prediction_on_cert_modelB_to_adv_B,axis=-1) == right_answer[cert_B])/10000)
# Check the dot product average on the common certain part.


common_cert = np.where(np.prod([np.max(prediction_A_to_original,axis=-1)>alpha,np.max(prediction_B_to_original,axis=-1)>alpha],axis=0)==1)[0]

cnt = 0
dot_product_value  = 0
for i in common_cert:
    dot_product_value += dot_product(
        adv_A[i],
        adv_B[i],
        x_test[i])
    if dot_product_value == 2: # it's 2 if there is zero perturbation at least one adversarial
        cnt += 1
        dot_product_value += 0
    
average_dot_product = dot_product_value/len(common_cert)
average_dot_product = 10**average_dot_product - 1 # It makes range into (-0.9, 10).
print('size of common cert', len(common_cert),'average dot', average_dot_product)

print(f'size of cert part of A {len(cert_A)},size of cert part of B {len(cert_B)}')
print(f'summary: A to advB  \n{np.sum(np.argmax(prediction_A_to_adv_B,axis=-1) == right_answer)/10000} -> {np.sum(np.argmax(prediction_on_cert_modelA_to_adv_B,axis=-1) == right_answer[cert_A])/10000},')
print(f'summary: B to advA  \n{np.sum(np.argmax(prediction_B_to_adv_A,axis=-1) == right_answer)/10000} -> {np.sum(np.argmax(prediction_on_cert_modelB_to_adv_A,axis=-1) == right_answer[cert_B])/10000},')



    

A to original 0.8355 B to original 0.8065 A to advA 0.2025 B to advA 0.4037 A to advB 0.5301 B to advB 0.206
average dot 0.0656876316957633 # of non-perturbation 0
A to advA, cert 0.1544
A to advB, cert 0.4502
B to advA, cert 0.3466
B to advB, cert 0.1545
size of common cert 5689 average dot 0.07862220366659756
size of cert part of A [   0    1    2 ... 9997 9998 9999],size of cert part of B [   0    2    4 ... 9997 9998 9999]
summary: A to advB  
0.5301 -> 0.4502,
summary: B to advA  
0.4037 -> 0.3466,


$x_1 -> (0.9, 0.1)$
$x_1^* -> (0.1, 0.9)$

# Experiment 3

1. Train a filter that distinguish certain parts and those that aren't.
2. Train a model only with low certainty data.

In [195]:
# Original dataset 
cifar10_dataset = cifar10.load_data()
training, test = cifar10_dataset
x_tr, x_v, y_tr, y_v = train_test_split(training[0], training[1], test_size=0.2, random_state=42)
x_tr = x_tr.reshape(x_tr.shape[0],32,32,3)
x_v = x_v.reshape(x_v.shape[0],32,32,3)
x_v = x_v.astype('float32')
x_tr = x_tr.astype('float32')
x_v /= 255
x_tr /= 255
# y_tr = keras.utils.to_categorical(y_tr, 10)
# y_v = keras.utils.to_categorical(y_v,10)
x_test, y_test = test
x_test = x_test.reshape(x_test.shape[0],32,32,3)
x_test = x_test.astype('float32')
x_test /= 255 
# y_test = keras.utils.to_categorical(y_test, 10)

dropbox_path = "/Users/inkeejung/Library/CloudStorage/Dropbox-BOSTONUNIVERSITY/Inkee Jung/Inkee Jung’s files/Interpretability/AdvLogifold/computer/"
base_models_path = dropbox_path + 'Base_Models/'
adversarial_examples_path = dropbox_path + 'Adversarial_Examples/'

# Fix certainty threshold
alpha = 0.85
# Load a model
model_name = 'ResNet/n_9_v1_cifar10.keras'
model = load_model(base_models_path + f'CIFAR10models/{model_name}')
# Find certain part
prediction_train = model.predict(x_tr,verbose = 0)
prediction_val = model.predict(x_v,verbose=0)
prediction_test = model.predict(x_test)
cert_train = np.max(prediction_train,axis=-1)>alpha
cert_val = np.max(prediction_val,axis=-1)>alpha
cert_test = np.max(prediction_test,axis=-1)>alpha
cert_train, uncert_train = np.where(cert_train)[0], np.where(~cert_train)[0]
cert_val, uncert_val = np.where(cert_val)[0], np.where(~cert_val)[0]
cert_test, uncert_test = np.where(cert_test)[0], np.where(~cert_test)[0]

# Train Filter that distinguish given data is classified as 'certain' or not.

y_tr[uncert_train] += 10
y_v[uncert_val] += 10
y_test[uncert_test] += 10

KeyboardInterrupt: 

In [194]:
x = np.array([3,1,2])>1
np.where(x)[0], np.where(~x)[0]

(array([0, 2]), array([1]))

In [198]:
y = np.array([3,1,2])
y[np.where(x)[0]] += 10
y

array([13,  1, 12])