# Reproduction of the `epsilon` metric

If you wish to reproduce the results presented in our paper from scratch, feel free to use the code below.
In this notebook, we provide the code to reproduce the results for the NAS-Bench-201 search space on the CIFAR10, CIFAR100 and ImageNet16-120 datasets.

In [1]:
import os
import json
import time
import itertools

import numpy as np
import pandas as pd
import pickle as pkl
import seaborn as sns
from scipy import stats
from tqdm import trange
from dotmap import DotMap
from statistics import mean
import matplotlib.pyplot as plt

import torch

import nasspace
from datasets import data
from epsilon_utils import prepare_seed, compute_epsilon

In [72]:
# Dataset to evaluate on. Valid choices: 'cifar10', 'cifar100', 'ImageNet16-120'
dataset = 'cifar100'

# ImageNet16-120 is read from its own folder; every other choice uses the CIFAR folder.
data_loc = './datasets/ImageNet16' if dataset == 'ImageNet16-120' else './datasets/cifardata'

In [63]:
# Data-loading settings; they are handed to data.get_data(...) in a later cell.
batch_size, repeat = 256, 1
augtype, trainval = 'none', True

# Value exported as CUDA_VISIBLE_DEVICES when the device is set up.
GPU = '1'

In [64]:
# Settings handed to nasspace.get_search_space() to initialise NAS-Bench-201
args = DotMap()
args.nasspace = 'nasbench201'
args.dataset = dataset
args.api_loc = './api/NAS-Bench-201-v1_1-096897.pth'

# A 'fake' marker anywhere in the name collapses `dataset` to plain 'fake',
# while `savedataset` keeps the name with that marker stripped out.
is_fake = 'fake' in dataset
savedataset = dataset.replace('fake', '')
if is_fake:
    dataset = 'fake'

# cifar10 is tagged with a '-valid' suffix at this stage.
if savedataset == 'cifar10':
    savedataset = f'{savedataset}-valid'

In [65]:
# Load the search space (it takes some time)
# Builds the search-space object described by `args` (args.nasspace is 'nasbench201').
# Later cells take len() of it, index it to obtain architecture ids, and query it
# through get_network() and get_final_accuracy().
searchspace = nasspace.get_search_space(args)

./api/NAS-Bench-201-v1_1-096897.pth


In [66]:
# Drop any '-valid' suffix from the saved dataset name (no-op when it is absent).
savedataset = savedataset.replace('-valid', '')

# Keys used when querying benchmark accuracies: cifar10 reads its test accuracy
# under 'ori-test', every other dataset under 'x-test'. Validation is always 'x-valid'.
acc_type = 'ori-test' if dataset == 'cifar10' else 'x-test'
val_acc_type = 'x-valid'

In [67]:
# Expose only the configured GPU to CUDA, then pick it if CUDA is usable (CPU otherwise).
os.environ['CUDA_VISIBLE_DEVICES'] = GPU
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Reproducibility: switch off cuDNN autotuning, request deterministic kernels,
# and seed through the project's prepare_seed helper.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
prepare_seed(21)

In [68]:
# Build the training data loader.
# NOTE(review): on first use the data is presumably downloaded into `data_loc` - confirm.
train_loader = data.get_data(dataset, data_loc, trainval, batch_size, augtype, repeat, args)

# Take a single batch; only the inputs are needed, the labels are discarded.
data_iterator = iter(train_loader)
inputs, _ = next(data_iterator)
x = inputs.to(device)

Finally, run the computation of the `epsilon` metric for all the architectures within the search space. There are 15,625 architectures in NAS-Bench-201.

In [69]:
# Score every architecture in the search space with the `epsilon` metric and record,
# per architecture: the score, the parameter count, the benchmark test/validation
# accuracies and the wall-clock time spent. Everything is pickled to `<save_dir>/Data`.
save_dir = './release_results/{}/'.format(dataset.upper())
os.makedirs(save_dir, exist_ok=True)

weights = [1e-7, 1]  # passed unchanged to compute_epsilon
test_accs = []
val_accs = []
nparams = []
scores = []
# `times` must be created here: without it a fresh kernel (Restart & Run All)
# fails with NameError on the first `times.append(...)` below.
times = []

# Third positional argument of the validation lookup: True for cifar10, False otherwise.
val_flag = dataset == 'cifar10'

for i in trange(len(searchspace)):
    start = time.time()
    uid = searchspace[i]
    network = searchspace.get_network(uid)
    network = network.to(device)
    score = compute_epsilon(x, network, weights)
    scores.append(score)
    nparams.append(sum(p.numel() for p in network.parameters()))
    test_accs.append(searchspace.get_final_accuracy(uid, acc_type, False))
    val_accs.append(searchspace.get_final_accuracy(uid, val_acc_type, val_flag))
    times.append(time.time() - start)

# Save your results
save_dic = {
    "scores": scores,
    "nparams": nparams,
    "test_accs": test_accs,
    "val_accs": val_accs,
    "times": times,
}

# Context manager so the file is flushed and closed even if pickling fails.
with open(save_dir + "Data", "wb") as out_file:
    pkl.dump(save_dic, out_file)

100%|████████████████████████████████████████████████████████████████████████████████████████| 15625/15625 [30:40<00:00,  8.49it/s]


In [20]:
# Repeated-sampling experiment: for each of `n_runs` seeded runs, draw `n_samples`
# architecture indices (with replacement), score each with `epsilon`, pick the
# top-scoring one and log its benchmark accuracy and its rank within the sample.
save_dir = './release_results/{}/avrg_perform/'.format(dataset.upper())
os.makedirs(save_dir, exist_ok=True)

weights = [1e-7, 1]  # passed unchanged to compute_epsilon
n_runs = 500
n_samples = 100
ind_actual_best_mean = 0  # running sum of the chosen architecture's rank over runs

times = []
chosen = []
accs = []
val_accs = []
topscores = []

# Set up the log file. The sample count comes from `n_samples` so the name always
# matches the run; for n_samples = 100 it equals the previous hard-coded name.
logs_filename = '{}logs_{}{}_{}samples.txt'.format(
    save_dir, dataset.upper(), 'val' if trainval else '', n_samples)

with open(logs_filename, 'w') as logs:
    runs = trange(n_runs, desc='acc: ')
    for N in runs:
        start = time.time()
        scores = []
        accs_run = []
        np.random.seed(N)  # one seed per run keeps the sampled indices reproducible
        indices = np.random.randint(0, len(searchspace), n_samples)
        for i in indices:
            uid = searchspace[i]
            network = searchspace.get_network(uid)
            network = network.to(device)
            scores.append(compute_epsilon(x, network, weights))
            accs_run.append(searchspace.get_final_accuracy(uid, acc_type, trainval))

        # Position of the highest (non-NaN) score; computed once and reused below.
        best_pos = np.nanargmax(scores)
        best_arch = indices[best_pos]
        uid_best = searchspace[best_arch]
        best_acc = searchspace.get_final_accuracy(uid_best, acc_type, trainval)

        # Rank of the chosen architecture = its position in the descending accuracy list.
        accs_run.sort(reverse=True)
        ind_actual_best = accs_run.index(best_acc)
        ind_actual_best_mean += ind_actual_best

        topscores.append(scores[best_pos])
        chosen.append(best_arch)
        accs.append(best_acc)

        if not dataset == 'cifar10' or trainval:
            val_accs.append(searchspace.get_final_accuracy(uid_best, val_acc_type, trainval))

        # The reported running mean uses validation accuracies when `trainval` is set.
        running_mean = mean(accs if not trainval else val_accs)
        logs.write(f"Mean acc: {running_mean:.2f}% ")
        logs.write(f"Actual ranking: {ind_actual_best} \n")
        times.append(time.time() - start)
        runs.set_description(f"mean acc: {running_mean:.2f}%, mean rank: {ind_actual_best_mean/(N+1):.2f}")

    logs.write(f"Average chosen architecure's rank: {ind_actual_best_mean/n_runs} \n")
    logs.write(f"Final mean test accuracy: {np.mean(accs)} +- {np.std(accs)} \n")
    logs.write(f"Median duration: {np.median(times)} \n")
    if len(val_accs) > 1:
        logs.write(f"Final mean validation accuracy: {np.mean(val_accs)} +- {np.std(val_accs)} \n")

# Save your results
save_dic = {
    "accs": accs,
    "val_accs": val_accs,
    "chosen": chosen,
    "times": times,
    "topscores": topscores,
}

# Run/sample counts in the file name follow the settings above
# ("Data_500runs_100samples_TRAINVAL" for the defaults). The context manager
# guarantees the pickle file is closed.
results_path = '{}Data_{}runs_{}samples_TRAINVAL'.format(save_dir, n_runs, n_samples)
with open(results_path, "wb") as out_file:
    pkl.dump(save_dic, out_file)

mean acc: 90.45%, mean rank: 6.19: 100%|███████████████████████████████████████████████████████| 500/500 [1:26:40<00:00, 10.40s/it]


In [74]:
# Repeated-sampling experiment on the cached results: instead of re-scoring networks,
# reuse the scores/accuracies/timings pickled by the full-search-space cell and, for
# each seeded run, compare the epsilon-selected architecture with the best ("optimal")
# and the first-drawn ("random") architecture of the sample.
n_runs = 500
n_samples = 1000
ind_actual_best_mean = 0  # running sum of the chosen architecture's rank over runs

times_run = []
chosen = []
accs = []
val_accs = []
topscores = []
opt_test = []
opt_val = []
rand_test = []
rand_val = []

# Read the data from the location the full-search-space cell writes to
# ('./release_results/<DATASET>/Data'), rather than a machine-specific absolute path.
# NOTE: pickle can execute arbitrary code on load - only read files produced by this notebook.
data_path = './release_results/{}/Data'.format(dataset.upper())
with open(data_path, 'rb') as f:
    d = pkl.load(f)
scores_all = d['scores']
accs_all = d['test_accs']
val_accs_all = d['val_accs']
times = d['times']

# Set up the log file
save_dir = './release_results/average_performance/{}/epsilon/'.format(dataset.upper())
os.makedirs(save_dir, exist_ok=True)
logs_filename = '{}logs_{}_{}samples.txt'.format(save_dir, dataset.upper(), n_samples)

with open(logs_filename, 'w') as logs:
    runs = trange(n_runs, desc='acc: ')
    for N in runs:
        np.random.seed(N)  # one seed per run keeps the sampled indices reproducible
        indices = np.random.randint(0, len(accs_all), n_samples)
        scores_run = [scores_all[i] for i in indices]
        accs_run = [accs_all[i] for i in indices]
        # Cost of a run = summed cached scoring time of the sampled architectures.
        time_run = sum(times[i] for i in indices)

        # Computing optimal, random accuracies (must happen before accs_run is sorted,
        # while its positions still line up with `indices`).
        opt_test.append(np.nanmax(accs_run))
        opt_val.append(val_accs_all[indices[np.nanargmax(accs_run)]])
        rand_test.append(accs_run[0])
        rand_val.append(val_accs_all[indices[0]])

        # Rank of the chosen architecture = its position in the descending accuracy list.
        accs_run.sort(reverse=True)
        best_arch = indices[np.nanargmax(scores_run)]
        ind_actual_best = accs_run.index(accs_all[best_arch])
        ind_actual_best_mean += ind_actual_best

        topscores.append(np.nanmax(scores_run))
        chosen.append(best_arch)
        accs.append(accs_all[best_arch])
        val_accs.append(val_accs_all[best_arch])

        logs.write(f"Mean acc: {mean(accs):.2f}% ")
        logs.write(f"Actual ranking: {ind_actual_best} \n")
        times_run.append(time_run)
        runs.set_description(f"mean acc: {mean(accs):.2f}%, mean rank: {ind_actual_best_mean/(N+1):.2f}")

    logs.write(f"Average chosen architecure's rank: {ind_actual_best_mean/n_runs} \n")
    logs.write(f"Final mean test accuracy: {np.mean(accs)} +- {np.std(accs)} \n")
    logs.write(f"Median duration: {np.median(times_run)} \n")
    logs.write(f"Final mean validation accuracy: {np.mean(val_accs)} +- {np.std(val_accs)} \n\n")
    logs.write(f"Final mean optimal test accuracy: {np.mean(opt_test)} +- {np.std(opt_test)} \n")
    logs.write(f"Final mean optimal validation accuracy: {np.mean(opt_val)} +- {np.std(opt_val)} \n")

    logs.write(f"Final mean random test accuracy: {np.mean(rand_test)} +- {np.std(rand_test)} \n")
    logs.write(f"Final mean random validation accuracy: {np.mean(rand_val)} +- {np.std(rand_val)} \n")

mean acc: 71.79%, mean rank: 8.19: 100%|████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 279.19it/s]


In [44]:
# Ad-hoc inspection of the last run: looks up the sampled index at the position of the
# largest (non-NaN) entry of `accs_run`.
# NOTE(review): the preceding cell sorts `accs_run` in place (descending), so this
# position no longer lines up with `indices`. Looks like a leftover debugging cell; confirm.
indices[np.nanargmax(accs_run)]

2732

Feel free to run the statistics over these results in the `NAS-Bench-201 Stats.ipynb` notebook.