# Epsilon metric reproduction

If you wish to reproduce the results presented in our paper from scratch, feel free to use the code below. Since\
the version of torch used by the authors of NAS-Bench-NLP is too old, we ran all of our tests on CPU. \
Although the procedure is quite light, it still took us several hours to evaluate the whole benchmark dataset of over\
14k architectures.

We provide code to reproduce the results for both single-run and multiple-run architectures on the Penn Treebank data.

In [1]:
import os
import json
import data

import numpy as np
import pickle as pkl
import seaborn as sns
from tqdm import trange

import torch
import torch.nn as nn

from utils import batchify
from argparse import Namespace
from model import AWDRNNModel

from utils import get_batch

from epsinas_utils import compute_epsinas, prepare_seed, prepare_recepies

## Single run with Penn Tree Bank

14322 randomly created architectures are trained with a single seed.

It takes about 2 GPU hours to reproduce our results from scratch.

In [2]:
# Prepare a fixed batch of data
file_list = os.listdir("train_logs_single_run/")

# The fields of the first training log are used as the default arguments.
# Read it inside a `with` block so the file handle is closed right away
# instead of being left for the garbage collector.
with open('train_logs_single_run/' + file_list[0], 'r') as log_file:
    log_dflt = json.load(log_file)
args = Namespace(**log_dflt)
corpus = data.Corpus(args.data)

if torch.cuda.is_available():
    args.cuda = True
    device = "cuda"
    # NOTE(review): this variable is set only after CUDA has already been
    # queried above, so it may not restrict the visible devices for this
    # process -- confirm, or export it in the shell before starting Jupyter.
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
else:
    args.cuda = False
    device = "cpu"

device = torch.device(device)

ntokens = len(corpus.dictionary)
batch_size = 256

# Only the first element returned by get_batch (requested at position 0 of the
# batchified training data) is kept; the second element is discarded.
train_eval_data = batchify(corpus.train, batch_size, args, device)
x, _ = get_batch(train_eval_data, 0, args, evaluation=True)

/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help` for usage information.


/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help` for usage information.




In [3]:
# Reproducibility
# Ask cuDNN for deterministic algorithms and switch off its benchmark
# auto-tuner before seeding.
# NOTE(review): prepare_seed comes from epsinas_utils; presumably it seeds the
# random number generators with the value 21 -- confirm against its definition.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
prepare_seed(21)

In [4]:
# Weight pair (1e-5 and 1e-3); the next cell spells the same two values out
# again in its `weights` list.
weight_l, weight_h = 1e-5, 1e-3

In [5]:
save_dir = '../epsinas-release-data/NAS-Bench-NLP/evaluation/single_seed/'
os.makedirs(save_dir, exist_ok=True)

# Cache file: if it already exists the expensive loop below is skipped.
datafile_name = f'{save_dir}/data_NAS-Bench-NLP_single_06032025'

# Setting up the weights (passed unchanged to compute_epsinas)
weights = [1e-5, 1e-3]

if os.path.exists(datafile_name):
    # Load precomputed results.
    # NOTE: unpickling can execute arbitrary code -- only load cache files
    # that were produced by this notebook.
    with open(datafile_name, 'rb') as datafile:
        input_data = pkl.load(datafile)
    scores = input_data["scores"]
    accs = input_data["accs"]
    nparams = input_data["nparams"]
else:
    accs = []
    nparams = []
    scores = []

    for i in trange(len(file_list)):
        file = file_list[i]
        # Close each log right after parsing; the loop visits every file in
        # file_list, so leaked handles would otherwise pile up.
        with open('train_logs_single_run/' + file, 'r') as log_file:
            log = json.load(log_file)
        # NOTE: this rebinds the notebook-level `args` to the current log.
        args = Namespace(**log)

        # Build the model
        network = AWDRNNModel(args.model,
                              ntokens,
                              args.emsize,
                              args.nhid,
                              args.nlayers,
                              args.dropout,
                              args.dropouth,
                              args.dropouti,
                              args.dropoute,
                              args.wdrop,
                              args.tied,
                              args.recepie,
                              verbose=False)
        network = network.to(device)
        score = compute_epsinas(x, network, weights, batch_size)
        scores.append(score)
        try:
            accs.append(log['test_losses'][-1])
        except (KeyError, IndexError, TypeError):
            # Some architectures have no reported test perplexity.
            # Only the "missing / empty / null entry" cases are caught so
            # that KeyboardInterrupt and genuine bugs are not swallowed.
            accs.append(np.nan)
        nparams.append(args.num_params)

    # Save the results
    save_dic = {
        "scores": scores,
        "accs": accs,
        "nparams": nparams,
    }

    # Use a context manager so the cache is flushed and closed before any
    # later cell (or a re-run of this one) tries to read it.
    with open(datafile_name, "wb") as datafile:
        pkl.dump(save_dic, datafile)

  pred_norm = (pred - pred_min)/(pred_max - pred_min)
  mae = np.nanmean(np.abs(preds[0,:]-preds[1,:]))
  score = np.nanmean(mae)/np.nanmean(preds)
  pred_norm = (pred - pred_min)/(pred_max - pred_min)
100%|███████████████████████████████████| 14322/14322 [2:09:01<00:00,  1.85it/s]


## Multiple runs with Penn Tree Bank

4114 randomly created architectures trained with 3 random seeds.

In [2]:
# Prepare a fixed batch of data
file_list = os.listdir("train_logs_multi_runs/")

# The fields of the first training log are used as the default arguments.
# Read it inside a `with` block so the file handle is closed right away
# instead of being left for the garbage collector.
with open('train_logs_multi_runs/' + file_list[0], 'r') as log_file:
    log_dflt = json.load(log_file)
args = Namespace(**log_dflt)
corpus = data.Corpus(args.data)

if torch.cuda.is_available():
    args.cuda = True
    device = "cuda"
    # NOTE(review): this variable is set only after CUDA has already been
    # queried above, so it may not restrict the visible devices for this
    # process -- confirm, or export it in the shell before starting Jupyter.
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
else:
    args.cuda = False
    device = "cpu"

device = torch.device(device)

ntokens = len(corpus.dictionary)
batch_size = 256

# Only the first element returned by get_batch (requested at position 0 of the
# batchified training data) is kept; the second element is discarded.
train_eval_data = batchify(corpus.train, batch_size, args, device)
x, _ = get_batch(train_eval_data, 0, args, evaluation=True)

/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help` for usage information.


/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help` for usage information.




In [3]:
# Reproducibility
# Ask cuDNN for deterministic algorithms and switch off its benchmark
# auto-tuner before seeding.
# NOTE(review): prepare_seed comes from epsinas_utils; presumably it seeds the
# random number generators with the value 21 -- confirm against its definition.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
prepare_seed(21)

In [4]:
# Build the recipe lookup once, then keep its keys in a list so they can be
# addressed by integer position in the evaluation loop.
recepie_dic = prepare_recepies()
recepies = [*recepie_dic.keys()]

In [5]:
save_dir = '../epsinas-release-data/NAS-Bench-NLP/evaluation/three_seeds/'
os.makedirs(save_dir, exist_ok=True)

# Cache file: if it already exists the expensive loop below is skipped.
datafile_name = f'{save_dir}/data_NAS-Bench-NLP_three_n06032025'

# Setting up the weights (passed unchanged to compute_epsinas)
weights = [1e-5, 1e-3]

if os.path.exists(datafile_name):
    # Load precomputed results.
    # NOTE: unpickling can execute arbitrary code -- only load cache files
    # that were produced by this notebook.
    with open(datafile_name, 'rb') as datafile:
        input_data = pkl.load(datafile)
    scores = input_data["scores"]
    accs_mean = input_data["accs_mean"]
    accs_min = input_data["accs_min"]
    accs_max = input_data["accs_max"]
    nparams = input_data["nparams"]
else:

    accs_mean = []
    accs_min = []
    accs_max = []
    nparams = []
    scores = []

    for i in trange(len(recepie_dic)):
        rec = recepies[i]
        # Positions in file_list of the logs that belong to this recipe.
        indices = recepie_dic[rec]
        # As for the same recepie the metric performance does not change,
        # we only need to compute it once
        file = file_list[indices[0]]
        with open('train_logs_multi_runs/' + file, 'r') as log_file:
            log = json.load(log_file)
        args = Namespace(**log)

        # Build the model
        network = AWDRNNModel(args.model,
                              ntokens,
                              args.emsize,
                              args.nhid,
                              args.nlayers,
                              args.dropout,
                              args.dropouth,
                              args.dropouti,
                              args.dropoute,
                              args.wdrop,
                              args.tied,
                              args.recepie,
                              verbose=False)
        network = network.to(device)
        score = compute_epsinas(x, network, weights, batch_size)
        scores.append(score)

        # Retrieve the test errors of every run listed for this recipe
        acc_run = []
        for ind in indices:
            file = file_list[ind]
            # Close each log right after parsing so handles do not pile up.
            with open('train_logs_multi_runs/' + file, 'r') as log_file:
                log = json.load(log_file)
            # `args` is rebound here, so num_params below is read from the
            # last log of this recipe.
            args = Namespace(**log)
            try:
                acc_run.append(log['test_losses'][-1])
            except (KeyError, IndexError, TypeError):
                # No usable test loss in this log. Only the "missing / empty /
                # null entry" cases are caught so that KeyboardInterrupt and
                # genuine bugs are not swallowed.
                acc_run.append(np.nan)

        accs_mean.append(np.nanmean(acc_run))
        accs_min.append(np.nanmin(acc_run))
        accs_max.append(np.nanmax(acc_run))
        nparams.append(args.num_params)

    # Save the results
    save_dic = {
        "scores": scores,
        "accs_mean": accs_mean,
        "accs_min": accs_min,
        "accs_max": accs_max,
        "nparams": nparams,
    }

    # Use a context manager so the cache is flushed and closed before any
    # later cell (or a re-run of this one) tries to read it.
    with open(datafile_name, "wb") as datafile:
        pkl.dump(save_dic, datafile)

  pred_norm = (pred - pred_min)/(pred_max - pred_min)
  mae = np.nanmean(np.abs(preds[0,:]-preds[1,:]))
  score = np.nanmean(mae)/np.nanmean(preds)
  pred_norm = (pred - pred_min)/(pred_max - pred_min)
100%|███████████████████████████████████████| 4114/4114 [35:05<00:00,  1.95it/s]
