## Load Model 1

In [8]:
import tensorflow as tf
import numpy as np
import os
import scipy.io
import random
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
import cv2
from utilities import *

# Restrict TensorFlow to the first GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# cifar10 data: pixels scaled to [0, 1], labels one-hot encoded.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
x_train = x_train/255.
x_test = x_test/255.
nb_classes = 10
# Flatten the label arrays without hard-coding the split sizes.
y_train = np.reshape(y_train, [-1])
y_test = np.reshape(y_test, [-1])
targets = y_train
y_train = np.eye(nb_classes)[targets]   # row k of the identity is the one-hot code of class k
targets = y_test
y_test = np.eye(nb_classes)[targets]

# SVHN data: the .mat file stores images with the sample axis last,
# so move it to the front before scaling.
from scipy import io
data1 = io.loadmat('train_32x32.mat')
train_data = data1['X'].astype('float32')
train_data = np.transpose(train_data, (3, 0, 1, 2))
SVHN = train_data/255.

# LSUN data
# The scaled array used to be assigned to a misspelled name (`lSUN`), which
# left `LSUN` itself unscaled. Scale `LSUN` and keep `lSUN` as an alias so any
# code that referenced the old name keeps working.
LSUN = np.load("./LSUN.npy")
LSUN = LSUN/255.
lSUN = LSUN

# TIM data
# NOTE(review): unlike the other datasets TIM is not divided by 255 here --
# confirm the stored value range of TIM.npy.
TIM = np.load("./TIM.npy")

# Build the ten-network ensemble graph and restore the Model 1 checkpoint.
# The statement order is left untouched on purpose: ops and variables are
# created in this order, and the checkpoint restore below has to match them.
batch_size = 128  # not referenced again in this cell
tf.reset_default_graph()
networks = ['network1', 'network2', 'network3', 'network4', 'network5','network6', 'network7', 'network8', 'network9', 'network10']

# One image placeholder and one one-hot label placeholder per network,
# keyed by the network name and created inside that network's name scope.
XX_list={}
YY_list={}
for mm in range(len(networks)):
  with tf.name_scope(networks[mm]):
    XX_list[networks[mm]] = tf.placeholder(tf.float32, shape = [None, 32, 32, 3],name = networks[mm] +'x')
    YY_list[networks[mm]] = tf.placeholder(tf.float32, [None,10],name =networks[mm] +'y')
    
# Placeholders shared by all networks. `Dcon` is the input handed to vgg16_1
# below; `b` is fed (as False) by the evaluation loop further down.
# `X` and `Y` are not referenced again in this cell.
X = tf.placeholder(tf.float32, shape = [None, 32, 32, 3],name = 'x')
Dcon = tf.placeholder(tf.float32, shape = [None, 32, 32, 3],name = 'Dcon')
Y = tf.placeholder(tf.float32, [None,10],name ='y')
b = tf.placeholder(tf.bool,shape=(),name='b')
learning_rate = tf.placeholder(tf.float32,shape=(),name= 'learning_rate')
alpha = tf.placeholder(tf.float32,shape=(),name='alpha')
stop_grad = tf.placeholder(tf.bool,shape=(),name='stop')

# Per-network tensors collected for evaluation, plus running sums used for
# the mutual-information term of the loss.
x_list = []
y_list = []
output_list = []
Eent_noise = 0   # running sum of each network's entropy on the Dcon input
output1 = 0      # running sum of each network's output on the Dcon input
pp = 0           # running sum of the per-network losses
for i in range(len(networks)):
    # vgg16 / vgg16_1 come from `utilities`. The boolean argument is
    # presumably a training flag -- TODO confirm against utilities.
    x_image, y_label, output, probs,_,_= vgg16(networks[i],False,XX_list,YY_list,1e-6)
    _, output_noise = vgg16_1(Dcon,networks[i],False,stop_grad,XX_list,YY_list)
    _, _, output_train, probs_train,ent_train,loss= vgg16(networks[i],True,XX_list,YY_list,1e-6)
    x_list.append(x_image)
    y_list.append(y_label)
    output_list.append(output)
    pp+=loss
    # Entropy of this network's output; the 1e-30 keeps log() away from zero.
    Eent_noise += tf.reduce_sum(-tf.log(output_noise+1e-30)*output_noise,1)
    output1 += output_noise
    
# Mutual information = entropy of the averaged output minus the average of
# the individual entropies.
output1 = output1/len(networks)
ent_noise = tf.reduce_sum(-tf.log(output1+1e-30)*output1,1)
Eent_noise = Eent_noise/len(networks)
MI = ent_noise - Eent_noise
MI_mean=tf.reduce_mean(MI)
    
# Total objective: mean per-network loss minus alpha times the mean MI.
# `train_opt` is defined here but never run in this cell.
pp = pp/len(networks)
loss = pp-MI_mean*alpha
train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Create the session, initialise every variable, then overwrite them with
# the values stored in the checkpoint.
saver = tf.train.Saver(max_to_keep = 10)
config = tf.ConfigProto(allow_soft_placement = True)
sess = tf.Session(config = config)
sess.run(tf.global_variables_initializer())
model_id=0
save_path = saver.restore(sess, "./density_estimation_cifar10_A/classification_mnist126/model.ckpt" )
# `epoch` and `num_iter` are not used by the evaluation code below.
epoch = 391
num_iter = epoch*200
# Evaluate the restored ensemble once: accuracy on the CIFAR-10 test set,
# uncertainty statistics on in-distribution vs. TIM (OOD) images, and OOD
# detection metrics for three scores (max probability, entropy, MI).
#
# Changes from the earlier version of this cell:
#   * the one-iteration `for iter ...` wrapper is gone, so the builtin `iter`
#     is no longer shadowed in the notebook namespace;
#   * batch loops derive their bounds from the data, not from 10 x 1000;
#   * the MI statistics line is labelled "MI_in:" (it used to say "ent_in:");
#   * the three copy-pasted metric sections are one loop.
eval_chunk = 1000   # images per sess.run call
eval_step = 0       # printed in front of the accuracy line, as before

# ---- in-distribution predictions -------------------------------------------
outputs=[]
x_test_batch = stand(np.copy(x_test))   # `stand` comes from utilities
y_test_batch = np.copy(y_test)
for i in range(len(networks)):
    probs = []
    for start in range(0, len(x_test_batch), eval_chunk):
        stop = start + eval_chunk
        prob = sess.run(output_list[i], {x_list[i]: x_test_batch[start:stop],
                                         y_list[i]: y_test_batch[start:stop],
                                         b: False})
        probs.extend(prob)
    outputs.append(probs)
outputs = np.array(outputs)   # indexed as [network, sample, class]

# Accuracy of each individual network.
accs = np.mean(np.argmax(outputs,2)==np.argmax(y_test_batch,1),1)
print("accs: ",accs)

# Ensemble prediction = mean of the individual network outputs.
softmax = np.mean(outputs,0)
maxp_in = np.max(softmax,1)
acc = np.mean(np.argmax(softmax,1) ==  np.argmax(y_test_batch,1))
print( eval_step, 'Final Testing Accuracy: ', acc)

# Entropy of the ensemble output, mean entropy of the members, and their
# difference (mutual information). 1e-11 keeps log() away from zero.
ent_in = np.sum(-np.log(softmax+1e-11)*softmax,1)
Eent_in = np.mean(np.sum(-np.log(outputs+1e-11)*outputs,2),0)
MI_in = ent_in - Eent_in

# Split the test set by ensemble correctness. Both subsets are resampled
# with replacement to 10000 images.
right = np.argmax(softmax,1) ==  np.argmax(y_test_batch,1)
wrong_data0 = x_test_batch[~right]
wrong_data = wrong_data0[np.random.randint(0,len(wrong_data0),10000)]
right_data = x_test_batch[right]
right_data = right_data[np.random.randint(0,len(right_data),10000)]

outputs_right=[]
for i in range(len(networks)):
    prob = sess.run(output_list[i], {x_list[i]:right_data, b:False})
    outputs_right.append(prob)
outputs_right = np.array(outputs_right)
softmax_right = np.mean(outputs_right,0)
maxp_in_right = np.max(softmax_right,1)
ent_in_right = np.sum(-np.log(softmax_right+1e-11)*softmax_right,1)
Eent_in_right = np.mean(np.sum(-np.log(outputs_right+1e-11)*outputs_right,2),0)
MI_in_right = ent_in_right - Eent_in_right

# ---- out-of-distribution predictions (TIM) ---------------------------------
# 10000 indices drawn with replacement from the first 10000 TIM images.
safe_images = TIM[np.random.randint(0,10000,10000)]
print("TIM:")
safe_images = stand(safe_images)

outputs_OOD=[]
for j in range(len(networks)):
    probs_OOD = []
    for start in range(0, len(safe_images), eval_chunk):
        prob_OOD = sess.run(output_list[j], {x_list[j]: safe_images[start:start + eval_chunk], b: False})
        probs_OOD.extend(prob_OOD)
    outputs_OOD.append(probs_OOD)
outputs_OOD = np.array(outputs_OOD)

softmax_OOD = np.mean(outputs_OOD,0)
maxp_OOD = np.max(softmax_OOD,1)
ent_OOD = np.sum(-np.log(softmax_OOD+1e-11)*softmax_OOD,1)
Eent_OOD = np.mean(np.sum(-np.log(outputs_OOD+1e-11)*outputs_OOD,2),0)
MI_OOD = ent_OOD - Eent_OOD

print("maxp_OOD:",np.mean(maxp_OOD>0.99),np.mean(maxp_OOD),np.std(maxp_OOD))
print("maxp_inD:",np.mean(maxp_in>0.99),np.mean(maxp_in),np.std(maxp_in))
print("ent_OOD:",np.mean(ent_OOD),np.std(ent_OOD), "ent_in:", np.mean(ent_in),np.std(ent_in))
print("MI_OOD:",np.mean(MI_OOD),np.std(MI_OOD), "MI_in:", np.mean(MI_in),np.std(MI_in))

# ---- OOD detection metrics -------------------------------------------------
# Label 0 = in-distribution, 1 = OOD; a larger score must mean "more likely
# OOD", hence the sign flip on the max-probability score.
for tag, score_in, score_ood in (('p', -maxp_in, -maxp_OOD),
                                 ('entropy', ent_in, ent_OOD),
                                 ('MI', MI_in, MI_OOD)):
    safe, risky = np.reshape(score_in,[-1,1]), np.reshape(score_ood,[-1,1])
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR_' + tag + ':', round(100*average_precision_score(labels, examples), 2))
    print('AUROC_' + tag + ':', round(100*roc_auc_score(labels, examples), 2))
    print("FPR95:",ErrorRateAt95Recall1(labels, examples)*100)
print("############################################")

INFO:tensorflow:Restoring parameters from ./density_estimation_cifar10_A/classification_mnist126/model.ckpt
accs:  [0.923  0.9211 0.9199 0.9195 0.9244 0.9207 0.9203 0.922  0.9236 0.9182]
0 Final Testing Accuracy:  0.9415
TIM:
maxp_OOD: 0.0667 0.6330205 0.20062934
maxp_inD: 0.7765 0.94906837 0.12320748
ent_OOD: 0.8959903 0.43462014 ent_in: 0.12493621 0.2760778
MI_OOD: 0.6842741 0.3456992 ent_in: 0.096149355 0.2173931
AUPR_p: 90.73
AUROC_p: 92.66
FPR95: 24.97
AUPR_entropy: 92.27
AUROC_entropy: 93.59
FPR95: 24.77
AUPR_MI: 91.77
AUROC_MI: 93.26
FPR95: 24.959999999999997
############################################


## Evaluate Model 1

In [9]:
# Entropy-threshold OOD detector for Model 1.
# Inputs from the previous cell: `ent_in`, `ent_OOD` (ensemble entropies on
# in-distribution / OOD images) and `right` (per-image ensemble correctness).
safe, risky = np.reshape(ent_in,[-1,1]), np.reshape(ent_OOD,[-1,1])
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1          # 0 = in-distribution, 1 = OOD
examples = np.squeeze(np.vstack((safe, risky)))

# Pick the threshold with the best in-distribution/OOD accuracy; on ties the
# largest threshold wins (`>=`).
# NOTE(review): the grid stops at 1.0, but the entropy of a 10-class output
# can reach ln(10) ~= 2.30, so thresholds above 1 are never tried -- confirm
# this is intended.
best = 0
threshhold = 0
for thresh in np.linspace(0,1,1000):
    tmp = np.mean((1-(examples<thresh))==labels)
    if tmp>=best:
        best = tmp
        threshhold = thresh
print("We choose the threshold to be",threshhold,"which yields the best OOD detection accuracy: ",best)

# ioo[k] = 1 when sample k is accepted as in-distribution.
# row[k] = 1 when sample k is an in-distribution image the ensemble classified
# correctly; every OOD sample is 0.
ioo = (examples<threshhold)*1.
row = np.concatenate((right.astype(int), np.zeros(risky.shape[0], dtype=int)))
n_accepted_correct = np.sum(ioo*row)
n_accepted = np.sum(ioo)
# precision: share of accepted samples that are correctly classified
# recall:    share of all in-distribution samples accepted and correct
p = n_accepted_correct/n_accepted if n_accepted > 0 else 0.0
r = n_accepted_correct/safe.shape[0]
f1 = 2*p*r/(p+r) if (p+r) > 0 else 0.0
print("The precision, recall and F1 score are ",p,r,f1,"respectively.")
print("The model thinks there are ",np.sum(ioo),"in distributional data out of the 20000 data points.")

# Keep Model 1's correctness mask for the combined-system cell.
right1 = right

We choose the threshold to be 0.24024024024024024 which yields the best OOD detection accuracy:  0.86035
The precision, recall and F1 score are  0.8924170089930055 0.8038 0.845793654969222 respectively.
The model thinks there are  9007.0 in distributional data out of the 20000 data points.


## Load Model 2

In [10]:
import tensorflow as tf
import numpy as np
import os
import scipy.io
import random
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
import cv2
from utilities import *

# Restrict TensorFlow to the first GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# cifar10 data: pixels scaled to [0, 1], labels one-hot encoded.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
x_train = x_train/255.
x_test = x_test/255.
nb_classes = 10
# Flatten the label arrays without hard-coding the split sizes.
y_train = np.reshape(y_train, [-1])
y_test = np.reshape(y_test, [-1])
targets = y_train
y_train = np.eye(nb_classes)[targets]   # row k of the identity is the one-hot code of class k
targets = y_test
y_test = np.eye(nb_classes)[targets]

# SVHN data: the .mat file stores images with the sample axis last,
# so move it to the front before scaling.
from scipy import io
data1 = io.loadmat('train_32x32.mat')
train_data = data1['X'].astype('float32')
train_data = np.transpose(train_data, (3, 0, 1, 2))
SVHN = train_data/255.

# LSUN data
# The scaled array used to be assigned to a misspelled name (`lSUN`), which
# left `LSUN` itself unscaled. Scale `LSUN` and keep `lSUN` as an alias so any
# code that referenced the old name keeps working.
LSUN = np.load("./LSUN.npy")
LSUN = LSUN/255.
lSUN = LSUN

# TIM data
# NOTE(review): unlike the other datasets TIM is not divided by 255 here --
# confirm the stored value range of TIM.npy.
TIM = np.load("./TIM.npy")

# Rebuild the ten-network ensemble graph from scratch and restore the Model 2
# checkpoint. The statement order is left untouched on purpose: ops and
# variables are created in this order, and the checkpoint restore below has
# to match them.
batch_size = 128  # not referenced again in this cell
tf.reset_default_graph()
networks = ['network1', 'network2', 'network3', 'network4', 'network5','network6', 'network7', 'network8', 'network9', 'network10']

# One image placeholder and one one-hot label placeholder per network,
# keyed by the network name and created inside that network's name scope.
XX_list={}
YY_list={}
for mm in range(len(networks)):
  with tf.name_scope(networks[mm]):
    XX_list[networks[mm]] = tf.placeholder(tf.float32, shape = [None, 32, 32, 3],name = networks[mm] +'x')
    YY_list[networks[mm]] = tf.placeholder(tf.float32, [None,10],name =networks[mm] +'y')
    
# Placeholders shared by all networks. `Dcon` is the input handed to vgg16_1
# below; `b` is fed (as False) by the evaluation loop further down.
# `X` and `Y` are not referenced again in this cell.
X = tf.placeholder(tf.float32, shape = [None, 32, 32, 3],name = 'x')
Dcon = tf.placeholder(tf.float32, shape = [None, 32, 32, 3],name = 'Dcon')
Y = tf.placeholder(tf.float32, [None,10],name ='y')
b = tf.placeholder(tf.bool,shape=(),name='b')
learning_rate = tf.placeholder(tf.float32,shape=(),name= 'learning_rate')
alpha = tf.placeholder(tf.float32,shape=(),name='alpha')
stop_grad = tf.placeholder(tf.bool,shape=(),name='stop')

# Per-network tensors collected for evaluation, plus running sums used for
# the mutual-information term of the loss.
x_list = []
y_list = []
output_list = []
Eent_noise = 0   # running sum of each network's entropy on the Dcon input
output1 = 0      # running sum of each network's output on the Dcon input
pp = 0           # running sum of the per-network losses
for i in range(len(networks)):
    # vgg16 / vgg16_1 come from `utilities`. The boolean argument is
    # presumably a training flag -- TODO confirm against utilities.
    x_image, y_label, output, probs,_,_= vgg16(networks[i],False,XX_list,YY_list,1e-6)
    _, output_noise = vgg16_1(Dcon,networks[i],False,stop_grad,XX_list,YY_list)
    _, _, output_train, probs_train,ent_train,loss= vgg16(networks[i],True,XX_list,YY_list,1e-6)
    x_list.append(x_image)
    y_list.append(y_label)
    output_list.append(output)
    pp+=loss
    # Entropy of this network's output; the 1e-30 keeps log() away from zero.
    Eent_noise += tf.reduce_sum(-tf.log(output_noise+1e-30)*output_noise,1)
    output1 += output_noise
    
# Mutual information = entropy of the averaged output minus the average of
# the individual entropies.
output1 = output1/len(networks)
ent_noise = tf.reduce_sum(-tf.log(output1+1e-30)*output1,1)
Eent_noise = Eent_noise/len(networks)
MI = ent_noise - Eent_noise
MI_mean=tf.reduce_mean(MI)
    
# Total objective: mean per-network loss minus alpha times the mean MI.
# `train_opt` is defined here but never run in this cell.
pp = pp/len(networks)
loss = pp-MI_mean*alpha
train_opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Create the session, initialise every variable, then overwrite them with
# the values stored in the checkpoint.
saver = tf.train.Saver(max_to_keep = 10)
config = tf.ConfigProto(allow_soft_placement = True)
sess = tf.Session(config = config)
sess.run(tf.global_variables_initializer())
model_id=0
save_path = saver.restore(sess,"./density_estimation_cifar10_A/classification_mnist666097/model.ckpt" )
# `epoch` and `num_iter` are not used by the evaluation code below.
epoch = 391
num_iter = epoch*200
# Evaluate the restored ensemble once: accuracy on the CIFAR-10 test set,
# uncertainty statistics on in-distribution vs. TIM (OOD) images, and OOD
# detection metrics for three scores (max probability, entropy, MI).
#
# Changes from the earlier version of this cell:
#   * the one-iteration `for iter ...` wrapper is gone, so the builtin `iter`
#     is no longer shadowed in the notebook namespace;
#   * batch loops derive their bounds from the data, not from 10 x 1000;
#   * the MI statistics line is labelled "MI_in:" (it used to say "ent_in:");
#   * the three copy-pasted metric sections are one loop.
eval_chunk = 1000   # images per sess.run call
eval_step = 0       # printed in front of the accuracy line, as before

# ---- in-distribution predictions -------------------------------------------
outputs=[]
x_test_batch = stand(np.copy(x_test))   # `stand` comes from utilities
y_test_batch = np.copy(y_test)
for i in range(len(networks)):
    probs = []
    for start in range(0, len(x_test_batch), eval_chunk):
        stop = start + eval_chunk
        prob = sess.run(output_list[i], {x_list[i]: x_test_batch[start:stop],
                                         y_list[i]: y_test_batch[start:stop],
                                         b: False})
        probs.extend(prob)
    outputs.append(probs)
outputs = np.array(outputs)   # indexed as [network, sample, class]

# Accuracy of each individual network.
accs = np.mean(np.argmax(outputs,2)==np.argmax(y_test_batch,1),1)
print("accs: ",accs)

# Ensemble prediction = mean of the individual network outputs.
softmax = np.mean(outputs,0)
maxp_in = np.max(softmax,1)
acc = np.mean(np.argmax(softmax,1) ==  np.argmax(y_test_batch,1))
print( eval_step, 'Final Testing Accuracy: ', acc)

# Entropy of the ensemble output, mean entropy of the members, and their
# difference (mutual information). 1e-11 keeps log() away from zero.
ent_in = np.sum(-np.log(softmax+1e-11)*softmax,1)
Eent_in = np.mean(np.sum(-np.log(outputs+1e-11)*outputs,2),0)
MI_in = ent_in - Eent_in

# Split the test set by ensemble correctness. Both subsets are resampled
# with replacement to 10000 images.
right = np.argmax(softmax,1) ==  np.argmax(y_test_batch,1)
wrong_data0 = x_test_batch[~right]
wrong_data = wrong_data0[np.random.randint(0,len(wrong_data0),10000)]
right_data = x_test_batch[right]
right_data = right_data[np.random.randint(0,len(right_data),10000)]

outputs_right=[]
for i in range(len(networks)):
    prob = sess.run(output_list[i], {x_list[i]:right_data, b:False})
    outputs_right.append(prob)
outputs_right = np.array(outputs_right)
softmax_right = np.mean(outputs_right,0)
maxp_in_right = np.max(softmax_right,1)
ent_in_right = np.sum(-np.log(softmax_right+1e-11)*softmax_right,1)
Eent_in_right = np.mean(np.sum(-np.log(outputs_right+1e-11)*outputs_right,2),0)
MI_in_right = ent_in_right - Eent_in_right

# ---- out-of-distribution predictions (TIM) ---------------------------------
# 10000 indices drawn with replacement from the first 10000 TIM images.
safe_images = TIM[np.random.randint(0,10000,10000)]
print("TIM:")
safe_images = stand(safe_images)

outputs_OOD=[]
for j in range(len(networks)):
    probs_OOD = []
    for start in range(0, len(safe_images), eval_chunk):
        prob_OOD = sess.run(output_list[j], {x_list[j]: safe_images[start:start + eval_chunk], b: False})
        probs_OOD.extend(prob_OOD)
    outputs_OOD.append(probs_OOD)
outputs_OOD = np.array(outputs_OOD)

softmax_OOD = np.mean(outputs_OOD,0)
maxp_OOD = np.max(softmax_OOD,1)
ent_OOD = np.sum(-np.log(softmax_OOD+1e-11)*softmax_OOD,1)
Eent_OOD = np.mean(np.sum(-np.log(outputs_OOD+1e-11)*outputs_OOD,2),0)
MI_OOD = ent_OOD - Eent_OOD

print("maxp_OOD:",np.mean(maxp_OOD>0.99),np.mean(maxp_OOD),np.std(maxp_OOD))
print("maxp_inD:",np.mean(maxp_in>0.99),np.mean(maxp_in),np.std(maxp_in))
print("ent_OOD:",np.mean(ent_OOD),np.std(ent_OOD), "ent_in:", np.mean(ent_in),np.std(ent_in))
print("MI_OOD:",np.mean(MI_OOD),np.std(MI_OOD), "MI_in:", np.mean(MI_in),np.std(MI_in))

# ---- OOD detection metrics -------------------------------------------------
# Label 0 = in-distribution, 1 = OOD; a larger score must mean "more likely
# OOD", hence the sign flip on the max-probability score.
for tag, score_in, score_ood in (('p', -maxp_in, -maxp_OOD),
                                 ('entropy', ent_in, ent_OOD),
                                 ('MI', MI_in, MI_OOD)):
    safe, risky = np.reshape(score_in,[-1,1]), np.reshape(score_ood,[-1,1])
    labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
    labels[safe.shape[0]:] += 1
    examples = np.squeeze(np.vstack((safe, risky)))
    print('AUPR_' + tag + ':', round(100*average_precision_score(labels, examples), 2))
    print('AUROC_' + tag + ':', round(100*roc_auc_score(labels, examples), 2))
    print("FPR95:",ErrorRateAt95Recall1(labels, examples)*100)
print("############################################")


INFO:tensorflow:Restoring parameters from ./density_estimation_cifar10_A/classification_mnist666097/model.ckpt
accs:  [0.8833 0.8398 0.8752 0.8933 0.881  0.8818 0.8734 0.8327 0.9094 0.9055]
0 Final Testing Accuracy:  0.9274
TIM:
maxp_OOD: 0.0032 0.476109 0.1873872
maxp_inD: 0.4436 0.8900021 0.16599825
ent_OOD: 1.40956 0.43491194 ent_in: 0.3204154 0.40812796
MI_OOD: 0.8702489 0.3754089 ent_in: 0.16668713 0.23101398
AUPR_p: 92.58
AUROC_p: 93.39
FPR95: 24.46
AUPR_entropy: 94.48
AUROC_entropy: 94.88
FPR95: 22.39
AUPR_MI: 94.6
AUROC_MI: 94.82
FPR95: 23.93
############################################


## Evaluate Model 2

In [11]:
# Entropy-threshold OOD detector for Model 2.
# Inputs from the previous cell: `ent_in`, `ent_OOD` (ensemble entropies on
# in-distribution / OOD images) and `right` (per-image ensemble correctness).
safe, risky = np.reshape(ent_in,[-1,1]), np.reshape(ent_OOD,[-1,1])
labels = np.zeros((safe.shape[0] + risky.shape[0]), dtype=np.int32)
labels[safe.shape[0]:] += 1          # 0 = in-distribution, 1 = OOD
examples = np.squeeze(np.vstack((safe, risky)))

# Pick the threshold with the best in-distribution/OOD accuracy; on ties the
# largest threshold wins (`>=`).
# NOTE(review): the grid stops at 1.0, but the entropy of a 10-class output
# can reach ln(10) ~= 2.30, so thresholds above 1 are never tried -- confirm
# this is intended.
best = 0
threshhold = 0
for thresh in np.linspace(0,1,1000):
    tmp = np.mean((1-(examples<thresh))==labels)
    if tmp>=best:
        best = tmp
        threshhold = thresh
print("We choose the threshold to be",threshhold,"which yields the best OOD detection accuracy: ",best)

# ioo[k] = 1 when sample k is accepted as in-distribution.
# row[k] = 1 when sample k is an in-distribution image the ensemble classified
# correctly; every OOD sample is 0.
ioo = (examples<threshhold)*1.
row = np.concatenate((right.astype(int), np.zeros(risky.shape[0], dtype=int)))
n_accepted_correct = np.sum(ioo*row)
n_accepted = np.sum(ioo)
# precision: share of accepted samples that are correctly classified
# recall:    share of all in-distribution samples accepted and correct
p = n_accepted_correct/n_accepted if n_accepted > 0 else 0.0
r = n_accepted_correct/safe.shape[0]
f1 = 2*p*r/(p+r) if (p+r) > 0 else 0.0
print("The precision, recall and F1 score are ",p,r,f1,"respectively.")
print("The model thinks there are ",np.sum(ioo),"in distributional data out of the 20000 data points.")

We choose the threshold to be 0.7797797797797797 which yields the best OOD detection accuracy:  0.87885
The precision, recall and F1 score are  0.8827180310326378 0.8249 0.8528301886792452 respectively.
The model thinks there are  9345.0 in distributional data out of the 20000 data points.


## Evaluate the Combined System

In [12]:
# Combined system: the accept/reject decision reuses `examples` and
# `threshhold` left behind by the Model 2 threshold cell, while classification
# correctness comes from Model 1 (`right1`, saved by the Model 1 threshold
# cell). Both of those cells must have been run, in order, before this one.
ioo = (examples<threshhold)*1.                    # 1 = accepted as in-distribution
n_in = right1.shape[0]                            # in-distribution samples come first in `examples`
n_ood = examples.shape[0] - n_in
# row[k] = 1 for in-distribution images Model 1 classified correctly; OOD = 0.
row = np.concatenate((right1.astype(int), np.zeros(n_ood, dtype=int)))
n_accepted_correct = np.sum(ioo*row)
n_accepted = np.sum(ioo)
# precision: share of accepted samples that are correctly classified
# recall:    share of all in-distribution samples accepted and correct
p = n_accepted_correct/n_accepted if n_accepted > 0 else 0.0
r = n_accepted_correct/n_in
f1 = 2*p*r/(p+r) if (p+r) > 0 else 0.0
print("The precision, recall and F1 score are ",p,r,f1,"respectively.")
print("The model thinks there are ",np.sum(ioo),"in distributional data out of the 20000 data points.")

The precision, recall and F1 score are  0.8868913857677903 0.8288 0.8568622383044714 respectively.
The model thinks there are  9345.0 in distributional data out of the 20000 data points.


### Model 1 achieves high precision but low recall because it filters out too many in-distribution samples, which lowers its F1 score. Model 2 achieves higher recall and a better F1 score because it separates in-distribution data from OOD data more accurately. The Combined System achieves the best F1 score because it benefits from both Model 1 and Model 2.
