In [None]:
# imports
# general
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import norm
from numpy import dot 
# sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision.datasets import MNIST
from torchvision import transforms
# lime
import lime
import lime.lime_tabular
# art
from art.attacks.evasion import FastGradientMethod
from art.estimators.classification import PyTorchClassifier
from art.estimators.classification import SklearnClassifier

In [None]:
# neural network
class Net(nn.Module):
    '''
    Fully connected network: every consecutive pair of sizes in `layers` becomes
    Linear -> BatchNorm1d -> ReLU (the ReLU is also applied after the last layer).
    Args - 
        layers: sequence of layer widths, including the input width (first entry)
            and the output width (last entry)
    '''
    def __init__(self,layers):
        super(Net, self).__init__()
        self.layers = layers
        self.fc_linears = nn.ModuleList()
        # The batch-norm layers are registered here (not created inside forward) so that
        # their affine parameters are seen by model.parameters(), their running statistics
        # persist between calls, they follow .to(device), and .eval() switches them to the
        # stored statistics.
        self.batch_norms = nn.ModuleList()
        for i in range(len(self.layers)-1):
            self.fc_linears.append(nn.Linear(in_features=self.layers[i], out_features=self.layers[i+1]))
            self.batch_norms.append(nn.BatchNorm1d(self.layers[i+1]))
        
    def forward(self, x):
        '''
        Args - 
            x: batch of inputs, one row per sample; cast to float32 before each linear layer
        Returns - 
            x: output of the last Linear -> BatchNorm1d -> ReLU stage
        '''
        for linear, btch_nrm in zip(self.fc_linears, self.batch_norms):
            x = F.relu(btch_nrm(linear(x.type(torch.float))))
        return x
    
class model_wrap():
    '''
    Thin adapter that gives a wrapped classifier a `predict_proba` method taking and
    returning numpy arrays (the form passed to LIME's explain_instance in this file).
    The network that is actually called is `model.model`.
    '''
    def __init__(self,model):
        self.model = model
    
    def predict_proba(self,x):
        '''
        Args - 
            x: 2-d numpy array, one row per sample
        Returns - 
            numpy array of per-class probabilities (softmax over dim 1 of the network output)
        '''
        inputs = Variable(torch.from_numpy(x))
        raw_output = self.model.model(inputs)
        probabilities = F.softmax(raw_output, dim=1)
        return probabilities.detach().numpy()

In [None]:
# functions
def data(X,y,test_percent,rnd_st=0):
    '''
    Splits a data set into a training part and a held-out testing part.
    Args - 
        X: x data
        y: y data
        test_percent: share of the data set to hold out for testing (passed on as test_size)
        rnd_st: random state used for the split
    Returns (note the order: both training pieces first) - 
        x_train: training data from X
        y_train: training data from y
        x_test: testing data from X
        y_test: testing data from y
    '''
    split = train_test_split(X, y, test_size=test_percent, random_state=rnd_st)
    # train_test_split yields (X_train, X_test, y_train, y_test); regroup by split.
    x_tr, x_te, y_tr, y_te = split
    return x_tr, y_tr, x_te, y_te

def train_model(layers,X,y,rnd_st=0):
    '''
    Fits a scikit-learn MLPClassifier with the given hidden-layer structure.
    Args - 
        layers: tuple of hidden layer sizes 
        X: X training data
        y: y training data
        rnd_st: random state handed to MLPClassifier, so the same value reproduces the same fit
    Returns - 
        model: the fitted MLPClassifier
    '''
    model = MLPClassifier(hidden_layer_sizes=layers, random_state=rnd_st)
    model.fit(X,y)
    return model 

def train_model_torch(layers,X,y,rnd_st=0):
    '''
    Builds a Net with the given layer widths, wraps it in an ART PyTorchClassifier and fits it.
    Args - 
        layers: sequence of layer widths for Net; the first entry is the input width and the
            last entry is the number of classes
        X: X training data
        y: y training data
        rnd_st: seed passed to torch.manual_seed before the network is created, so it
            fixes the initial weights (this reseeds torch's global generator)
    Returns - 
        model_w: the fitted PyTorchClassifier wrapping the Net
    '''
    # Seed before constructing Net so the weight initialisation is determined by rnd_st.
    torch.manual_seed(rnd_st)
    model = Net(layers)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    # Input shape and class count are taken from the architecture instead of being
    # hard-coded, so the wrapper's metadata matches whatever data the net was sized for.
    model_w = PyTorchClassifier(
        model=model,
        loss=criterion,
        optimizer=optimizer,
        input_shape=(layers[0],),
        nb_classes=layers[-1])
    model_w.fit(X, y, batch_size=1000, nb_epochs=20)
    return model_w

def build_model_matrix(s,n,lyr_array,X,y,trainer=None):
    '''
    builds an sxn matrix where the i,j entry is a trained model. Each column has a different
    architecture, and each row within a column has a different parameter initialization.
    Args - 
        s: number of different model parameter initializations
        n: number of different model architectures
        lyr_array: array of layers for the networks (currently entered manually)
            *note* n = len(lyr_array)
        X: X training data
        y: y training data
        trainer: callable(layers, X, y, rnd_st=...) returning a trained model;
            defaults to train_model (sklearn)
    returns: sxn object array of trained models 
    '''
    if trainer is None:
        trainer = train_model
    # An explicit object array keeps each model as a single cell regardless of its type.
    model_matrix = np.empty((s,n),dtype=object)
    for i in range(s):
        for j in range(n):
            # i*n+j is unique per cell, so every row gets its own initialization and the
            # whole matrix is reproducible.
            model_matrix[i,j] = trainer(lyr_array[j],X,y,rnd_st=i*n+j)
    return model_matrix

def build_model_matrix_torch(s,n,lyr_array,X,y):
    '''
    builds an sxn matrix where the i,j entry is a trained torch model wrapped by ART
    (see train_model_torch). Each column has a different architecture, and each row
    within a column has a different parameter initialization.
    Args - 
        s: number of different model parameter initializations
        n: number of different model architectures
        lyr_array: array of layer-width sequences for the networks (input width first,
            number of classes last)
            *note* n = len(lyr_array)
        X: X training data
        y: y training data
    returns: sxn object array of trained models 
    '''
    return build_model_matrix(s,n,lyr_array,X,y,trainer=train_model_torch)

def accurc_score(model,X,y):
    '''
    Accuracy of a classifier whose predict() returns class labels directly.
    Args - 
        model: the classifier to test
        X: X testing data
        y: y testing data (true labels)
    Returns - 
        acc_sc: accuracy score 
    '''
    predicted_labels = model.predict(X)
    return accuracy_score(y, predicted_labels)

def accurc_score_torch(model,X,y):
    '''
    Accuracy of a classifier whose predict() returns one row of per-class scores per sample;
    the predicted label is the index of the largest score in each row.
    Args - 
        model: the classifier to test
        X: X testing data
        y: y testing data (true labels)
    Returns - 
        acc_sc: accuracy score 
    '''
    class_scores = np.asarray(model.predict(X))
    # argmax along the class axis picks the first maximum per row, like the other
    # prediction code in this notebook.
    predicted_labels = np.argmax(class_scores, axis=1)
    acc_sc = accuracy_score(y, predicted_labels)
    return acc_sc

def accuracy_matrix(models,X,y):
    '''
    Evaluates every model of a model matrix with accurc_score; entry i,j of the result
    is the accuracy of models[i,j].
    Args - 
        models: np ndarray of models to test
        X: X testing data
        y: y testing data
    Returns - 
        acc_matrix: accuracy score matrix  
    '''
    rows, cols = models.shape[0], models.shape[1]
    acc_matrix = np.zeros((rows, cols))
    # np.ndindex walks the cells in the same row-major order as two nested loops.
    for cell in np.ndindex(rows, cols):
        acc_matrix[cell] = accurc_score(models[cell], X, y)
    return acc_matrix 

def accuracy_matrix_torch(models,X,y):
    '''
    Evaluates every model of a model matrix with accurc_score_torch; entry i,j of the
    result is the accuracy of models[i,j].
    Args - 
        models: np ndarray of models to test
        X: X testing data
        y: y testing data
    Returns - 
        acc_matrix: accuracy score matrix  
    '''
    rows, cols = models.shape[0], models.shape[1]
    acc_matrix = np.zeros((rows, cols))
    # np.ndindex walks the cells in the same row-major order as two nested loops.
    for cell in np.ndindex(rows, cols):
        acc_matrix[cell] = accurc_score_torch(models[cell], X, y)
    return acc_matrix 

def make_exp_vec(exp, n_features=None):
    """
    Turns a LIME explanation into a dense weight vector. `exp` is a dictionary mapping a
    label to a list of (i, w(i)) pairs, where i is the feature id and w(i) is the weight of
    the ith feature; only the first label in the dictionary is used.
    Arg - 
        exp: LIME explanation (the `local_exp` mapping)
        n_features: optional minimum length of the returned vector; features that are
            not listed in the explanation keep weight 0
    Returns - 
        v: explanation vector where v[i] = w(i)
        y_pred: the label the explanation belongs to (the first key of exp)
    Raises - 
        IndexError: if exp is empty
    """
    y_pred = list(exp)[0]
    weights = exp[y_pred]
    # Size the vector by the largest feature id as well as by the number of entries, so an
    # explanation that lists only some of the features does not index past the end.
    size = max([len(weights)] + [i + 1 for (i, _) in weights])
    if n_features is not None:
        size = max(size, n_features)
    v = np.zeros(size)
    for (i,w) in weights:
        v[i] = w
    return v,y_pred

def exp_point(point,X,model,f_names,c_names):
    """
    LIME explanation vector for the top label of a single point, using the model's own
    predict_proba.
    Arg - 
        point: point to explain
        X: data handed to the LIME tabular explainer as its training data
        model: black box exposing predict_proba
        f_names: names of features
        c_names: names of classes
    Returns - 
        v: explanation vector (see make_exp_vec())
        y_pred: label returned by make_exp_vec() (the top label explained)
    """
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X,
        feature_names=f_names,
        class_names=c_names,
        discretize_continuous=False,
    )
    # Ask for a weight on every feature, for the single most likely label only.
    explanation = explainer.explain_instance(
        point,
        model.predict_proba,
        num_features=len(point),
        top_labels=1,
    )
    vec_and_label = make_exp_vec(explanation.local_exp)
    return vec_and_label[0], vec_and_label[1]

def exp_point_torch(point,X,model,f_names,c_names):
    """
    LIME explanation vector for the top label of a single point, for a model that has no
    predict_proba of its own: the model is wrapped in model_wrap first.
    Arg - 
        point: point to explain
        X: data handed to the LIME tabular explainer as its training data
        model: black box (wrapped with model_wrap to obtain predict_proba)
        f_names: names of features
        c_names: names of classes
    Returns - 
        v: explanation vector (see make_exp_vec())
        y_pred: label returned by make_exp_vec() (the top label explained)
    """
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X,
        feature_names=f_names,
        class_names=c_names,
        discretize_continuous=False,
    )
    predict_fn = model_wrap(model).predict_proba
    # Ask for a weight on every feature, for the single most likely label only.
    explanation = explainer.explain_instance(
        point,
        predict_fn,
        num_features=len(point),
        top_labels=1,
    )
    vec_and_label = make_exp_vec(explanation.local_exp)
    return vec_and_label[0], vec_and_label[1]

def similarity_matrix(models,X,point,f_names,c_names,mismatch=-2):
    '''
    computes a matrix where the i,j entry is the similarity of the i and j models 
    computed via cosine similarity of their explanation vectors for `point`.
    The model matrix is flattened column by column (ie all the models with the same
    architecture end up next to each other).
    Each model is explained exactly once and that explanation is reused for every pair, so
    the result is symmetric and a model compared with itself scores 1.
    Args - 
        models: matrix of models
        X: X training data
        point: x to explain (take from x_test)
        f_names: feature names
        c_names: class names
        mismatch: value stored when the two explanations are for different labels
    Returns - 
        sim_matrix: snxsn similarity matrix
    '''
    flat_models = models.flatten('F')
    # One LIME run per model; rerunning it for every pair multiplies the cost and, because
    # LIME samples randomly, makes the entries noisy.
    explained = [exp_point(point,X,m,f_names,c_names) for m in flat_models]
    total = len(explained)
    sim_matrix = np.empty((total,total))
    for a,(exp_a,y_a) in enumerate(explained):
        for b,(exp_b,y_b) in enumerate(explained):
            if y_a == y_b:
                sim_matrix[a,b] = np.dot(exp_a,exp_b)/(norm(exp_a)*norm(exp_b))
            else:
                sim_matrix[a,b] = mismatch
    return sim_matrix

def similarity_matrix_torch(models,X,point,f_names,c_names,mismatch=1.25):
    '''
    computes a matrix where the i,j entry is the similarity of the i and j models 
    computed via cosine similarity of their explanation vectors for `point`
    (explanations come from exp_point_torch).
    The model matrix is flattened column by column (ie all the models with the same
    architecture end up next to each other).
    Each model is explained exactly once and that explanation is reused for every pair, so
    the result is symmetric and a model compared with itself scores 1.
    Args - 
        models: matrix of models
        X: X training data
        point: x to explain (take from x_test)
        f_names: feature names
        c_names: class names
        mismatch: value stored when the two explanations are for different labels
    Returns - 
        sim_matrix: snxsn similarity matrix
    '''
    flat_models = models.flatten('F')
    # One LIME run per model; rerunning it for every pair multiplies the cost and, because
    # LIME samples randomly, makes the entries noisy.
    explained = [exp_point_torch(point,X,m,f_names,c_names) for m in flat_models]
    total = len(explained)
    sim_matrix = np.empty((total,total))
    for a,(exp_a,y_a) in enumerate(explained):
        for b,(exp_b,y_b) in enumerate(explained):
            if y_a == y_b:
                sim_matrix[a,b] = np.dot(exp_a,exp_b)/(norm(exp_a)*norm(exp_b))
            else:
                sim_matrix[a,b] = mismatch
    return sim_matrix

In [None]:
# load the training data 
# digits = load_digits()
# x_train, y_train, x_test, y_test = data(digits['data'],digits['target'],0.3,rnd_st=0)
# x_names = np.arange(0,64)
# y_names = digits['target_names']

# Load the two MNIST splits separately. On a dataset created with train=True the
# `test_data` / `test_labels` attributes are only deprecated aliases for that same
# training split, so the held-out set has to come from a second dataset with train=False.
# `.data` holds the raw image tensor (the `transform` is not applied to it).
dataset = MNIST('.', train=True, download=False,transform=transforms.ToTensor())
dataset_test = MNIST('.', train=False, download=False,transform=transforms.ToTensor())
x_train = dataset.data.reshape(len(dataset.data),28*28).double()
y_train = dataset.targets.double()
x_test = dataset_test.data.reshape(len(dataset_test.data),28*28).double()
y_test = dataset_test.targets.double()

x_names = np.arange(0,28*28) 
y_names = ['0','1','2','3','4','5','6','7','8','9']

In [None]:
# # create model matrix 
# model_matrix = build_model_matrix(4,3,[(64,128,256,128,32),(128,64,128,256,32),(64,96,64,32,16)],x_train,y_train)

# # accuracy matrix
# accuracy_matrix = accuracy_matrix(model_matrix,x_test,y_test)
# plt.figure(1,[5,5])
# plt.imshow(accuracy_matrix)
# plt.colorbar()
# plt.axis('off')
# plt.title('model accuracy scores')
# plt.show()

# # similarity matrix
# similarity_matrix = similarity_matrix(model_matrix,x_train,x_test[0],x_names,y_names)
# plt.figure(2,[5,5])
# plt.imshow(similarity_matrix)
# plt.colorbar()
# plt.axis('off')
# plt.title('model similarity')
# plt.show()

# NOTE: everything between the triple quotes below is a plain string literal, so none of
# the code inside it is executed; it is kept only as a worked illustration of how the
# flattened pairwise list is reshaped into the similarity matrix.
'''
# intuition as to how model sim matrix constructed 
def prod(val) :  
    res = 1 
    for ele in val:  
        res *= ele  
    return res 

matrix = np.array([['(0,0)','(0,1)'],['(1,0)','(1,1)'],['(2,0)','(2,1)']])
m_flat = matrix.flatten('F')
interactions = []
for m1 in m_flat:
    for m2 in m_flat:
        interactions.append(m1+m2)
interactions = np.array(interactions)
s = prod(list(matrix.shape))
np.reshape(interactions,(s,s),'F')[:,5]
'''

In [None]:
# # create model matrix 
# Layers = [(64,32,16,12,10),
#           (64,56,32,16,10),
#           (64,96,32,16,10),
#           (64,32,16,10),
#           (64,56,10),
#           (64,48,32,10),
#           (64,48,32,10,8,6,10)]
# model_matrix = build_model_matrix_torch(7,len(Layers),Layers,x_train,y_train)

# # accuracy matrix
# accuracy_matrix = accuracy_matrix_torch(model_matrix,x_test,y_test)
# plt.figure(1,[5,5])
# plt.imshow(accuracy_matrix)
# plt.colorbar()
# plt.axis('off')
# plt.title('model accuracy scores')
# plt.show()

# # similarity matrix
# similarity_matrix = similarity_matrix_torch(model_matrix,x_train,x_test[0],x_names,y_names)
# plt.figure(2,[5,5])
# plt.imshow(similarity_matrix)
# plt.colorbar()
# plt.axis('off')
# plt.title('model similarity')
# plt.show()

In [None]:
# create model matrix 
# each tuple lists the layer widths of one architecture: 784 inputs first, 10 classes last
Layers = [(784,636,484,256,64,10),
          (784,636,484,384,256,128,64,32,16,10),
          (784,800,600,400,200,10),
          (784,800,600,400,200,100,50,10)]
# 4 differently initialised models per architecture
model_matrix = build_model_matrix_torch(4,len(Layers),Layers,x_train,y_train)

# accuracy matrix
# stored under its own name so the function accuracy_matrix() defined above is not overwritten
accuracy_scores = accuracy_matrix_torch(model_matrix,x_test,y_test)
plt.figure(1,[5,5])
plt.imshow(accuracy_scores)
plt.colorbar()
plt.axis('off')
plt.title('model accuracy scores')
plt.show()

In [None]:
# similarity matrix
# stored under its own name so the function similarity_matrix() defined above is not overwritten
similarity_scores = similarity_matrix_torch(model_matrix,x_train.detach().numpy(),x_test[0].detach().numpy(),x_names,y_names)
plt.figure(2,[5,5])
plt.imshow(similarity_scores)
plt.colorbar()
plt.axis('off')
plt.title('model similarity')
plt.show()

In [None]:
# adversarial example generation 
# draw 250 distinct test points and attack every model with them
model_list = model_matrix.flatten()
ind = np.random.choice(len(x_test), 250, replace=False)
x_, y_ = x_test[ind], y_test[ind]

x_adv_exs, adv_acc = [], []
for model in model_list:
    # FGSM examples crafted against this particular model
    attack = FastGradientMethod(estimator=model, eps=3.)
    x_adv = attack.generate(x=x_)
    x_adv_exs.append(x_adv)
    # accuracy of the same model on its own adversarial examples
    adv_labels = np.argmax(model.predict(x_adv),axis=1)
    adv_acc.append(accuracy_score(adv_labels,y_))

x_adv_exs, adv_acc = np.array(x_adv_exs), np.array(adv_acc)
print(adv_acc)

In [None]:
# transfer[i][j]: accuracy of model i on the adversarial examples crafted against model j
transfer = []
for model in model_list:
    adv_rates = [
        accuracy_score(np.argmax(model.predict(adv_ex),axis=1),y_)
        for adv_ex in x_adv_exs
    ]
    transfer.append(adv_rates)
transfer = np.array(transfer)

In [None]:
# Rows of `transfer` are the models being evaluated, columns are the models the
# adversarial examples were crafted against; label the axes so the plot stands alone.
plt.figure()
plt.title('transferability')
plt.imshow(transfer)
plt.xlabel('model the adversarial examples were crafted against')
plt.ylabel('model being evaluated')
plt.colorbar(label='accuracy on adversarial examples')
plt.show()

In [None]:
# do the explanation vectors "point" in the direction of the adversarial example? 
model = model_list[0]
ind = np.random.choice(len(x_test), 500, replace=False)
x_ = x_test[ind]
y_ = y_test[ind]

#generate explanations
# The explainer and the prediction function do not depend on the point being explained,
# so they are built once here instead of once per point (same arguments as exp_point_torch).
explainer = lime.lime_tabular.LimeTabularExplainer(x_test.detach().numpy(), feature_names=x_names, class_names=y_names, discretize_continuous=False)
predict_fn = model_wrap(model).predict_proba
exp_vecs = []
preds = []
for x in x_:
    point = x.detach().numpy()
    exp = explainer.explain_instance(point, predict_fn, num_features=len(point), top_labels=1)
    v, y_pred = make_exp_vec(exp.local_exp)
    exp_vecs.append(v)
    preds.append(y_pred)
exp_vecs = np.array(exp_vecs)
preds = np.array(preds)

# generate attacks
attack = FastGradientMethod(estimator=model, eps=3.)
x_adv = attack.generate(x=x_)
adv_pred = np.argmax(model.predict(x_adv),axis=1)
adv_acc = accuracy_score(adv_pred,y_)
print('model acc on adv example: ', adv_acc)

In [None]:
# generate the perturbation: adversarial example minus the clean point, one row per sample
deltas = np.asarray(x_adv) - x_.detach().numpy()

# select the ones which are adversarial examples, i.e. where the prediction on the
# perturbed point differs from the true label (an integer index array, also when empty)
adv_indx = np.flatnonzero(y_.detach().numpy() != adv_pred)

# compare cosine similarity on actual adv examples: cosine between the negated
# explanation vector and the perturbation, computed row-wise
adv_exps = exp_vecs[adv_indx]
adv_deltas = deltas[adv_indx]
sims = -np.sum(adv_exps*adv_deltas,axis=1)/(norm(adv_deltas,axis=1)*norm(adv_exps,axis=1))

In [None]:
# distribution of the cosine similarities computed on the successful adversarial examples
plt.figure(1)
plt.hist(sims,bins=20)
plt.title('explanation vs. adversarial perturbation')
plt.xlabel('cosine similarity between -explanation and perturbation')
plt.ylabel('number of adversarial examples')
plt.show()