In [1]:
import argparse

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd

In [8]:
# Command-line configuration. Every option has a default, so the code below can
# run without any arguments being supplied.
parser = argparse.ArgumentParser()

# Both paths are concatenated directly with a file name further down
# (get_dataframe / save_results), so they must end with a path separator.
parser.add_argument('--preds_path', type=str, default='/home/gean/nns_performance_prediction/results/fast/test12/',
                    help='dataset location')
parser.add_argument('--results_path', type=str, default='/home/gean/nns_performance_prediction/results/fast/test12/',
                    help='results dir location')
parser.add_argument('--k', type=int, default=5000,
                    help='number of k best archs to extract accs')

# nargs: '+' == 1 or more, '*' == 0 or more, '?' == 0 or 1.
# Every option that main() iterates over uses nargs='+' so that a value given on
# the command line is a list, exactly like its default.
parser.add_argument('--dataset', type=str, default=['cifar10valid', 'cifar100', 'imagenet16_120'], nargs='+',
                    help='one of the datasets from nasbench201, being cifar10valid, cifar100, or imagenet16_120')
# NOTE(review): the help text mentions nasbench101, while the file names built
# from this value start with 'nasbench201_' -- confirm which one is intended.
parser.add_argument('--data_subset', type=int, default=[4, 108, 200], nargs='+',
                    help='one of the subsets from nasbench101 with 4, 12, 36, or 108 epochs')
parser.add_argument('--seed', type=int, default=[0, 1, 10, 42, 100, 123, 666, 1000, 1234, 12345], nargs='+',
                    help='seeds used for all the random procedures')

# For a minimal run pass a single size on the command line, e.g. `--train_size 172`.
parser.add_argument('--train_size', type=int, default=[43, 86, 129, 344, 860], nargs='+',
                    help='training split size')
# all except mlp
parser.add_argument('--estimators', type=str, default=['LinearRegression', 'SGDRegressor', 'Lasso', 'BayesianRidge',
                                                       'KNeighborsRegressor', 'DecisionTreeRegressor', 'SVR',
                                                       'RandomForestRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor',
                                                       'DummyRegressor'], nargs='+',
                    help='list of sklearn estimators used for training')
parser.add_argument('--verbose', type=int, default=0,
                    help='control the logging prints. 0 for deactivate and 1 for activate')

# parse_known_args returns unrecognised arguments in `unknown` instead of exiting
# with an error (presumably needed because a notebook kernel is started with its
# own argv -- confirm).
args, unknown = parser.parse_known_args()

In [9]:
def get_dataframe(dataset, subset, n):
    """Read the predictions CSV for one dataset / epoch subset / training size.

    The file is looked up directly under ``args.preds_path`` with the name
    ``nasbench201_<dataset>_<subset>epochs_n<n>_predictions.csv``.

    Returns the DataFrame produced by ``pd.read_csv``; a missing file surfaces
    as the exception raised by ``pd.read_csv``.
    """
    csv_name = 'nasbench201_{}_{}epochs_n{}_predictions.csv'.format(dataset, subset, n)
    return pd.read_csv(args.preds_path + csv_name)

In [10]:
def save_results(performance_dict, dataset, subset, n, file_name):
    """Write a results dictionary to a CSV file under ``args.results_path``.

    ``performance_dict`` maps column names to equally long lists. The output
    file is named
    ``nasbench201_<dataset>_<subset>epochs_n<n>_k<args.k>_<file_name>_accs.csv``
    (e.g. nasbench201_cifar100_200epochs_n172_k5000_sorted_pred_accs.csv).
    Floats are written with six decimals and the index is omitted.
    """
    csv_name = 'nasbench201_{}_{}epochs_n{}_k{}_{}_accs.csv'.format(dataset, subset, n, args.k, file_name)
    out_path = args.results_path + csv_name

    results_frame = pd.DataFrame.from_dict(performance_dict)
    results_frame.to_csv(out_path, index=False, float_format='%.6f')

In [11]:
def get_results_k_best_archs(sorted_pred_accs, top1_val_test_accs, df_preds, dataset, subset, n, seed, reg_model,
                             k=None, verbose=None):
    """Append the k best-predicted architectures of one (seed, model) run to both result tables.

    Rows of ``df_preds`` whose ``model`` equals ``str(reg_model)`` and whose
    ``seed`` equals ``int(seed)`` are sorted by ``acc_valid_pred`` (descending)
    and the first ``k`` of them are used.

    Parameters
    ----------
    sorted_pred_accs : dict of lists
        Gets one row per selected architecture (rank ``K``, predicted validation
        accuracy, true validation and test accuracy). Mutated in place.
    top1_val_test_accs : dict of lists
        Gets one row per ``K`` in ``1..k`` holding the best true validation and
        the best true test accuracy among the ``K`` best-predicted
        architectures. Mutated in place.
    df_preds : pandas.DataFrame
        Must provide the columns ``model``, ``seed``, ``acc_valid_pred``,
        ``acc_valid_true`` and ``acc_test_true``.
    dataset, subset, n, seed, reg_model
        Copied verbatim into the identifying columns of both tables.
    k : int, optional
        Number of best-predicted rows to use; defaults to ``args.k``.
    verbose : optional
        Truthy enables progress prints; defaults to ``args.verbose``.

    Returns
    -------
    tuple
        ``(sorted_pred_accs, top1_val_test_accs)``, the same objects passed in.

    Notes
    -----
    If fewer than ``k`` rows match, ``sorted_pred_accs`` only receives the rows
    that exist, so all of its columns keep the same length. ``top1_val_test_accs``
    still receives ``k`` rows: beyond the last available row the best-so-far
    value simply stops changing. If no row matches at all, nothing is appended.
    """
    if k is None:
        k = args.k
    if verbose is None:
        verbose = args.verbose

    run_mask = (df_preds['model'] == str(reg_model)) & (df_preds['seed'] == int(seed))
    df_aux = df_preds.loc[run_mask].sort_values(by=['acc_valid_pred'], ascending=False)

    # max(k, 0) keeps a negative k from being read as "all but the last rows".
    rows_best_preds = df_aux.iloc[:max(k, 0)]
    n_rows = len(rows_best_preds)

    if n_rows == 0:
        # Without any row there is no top-1 value to report for this run.
        print("### no rows to extract for {} (seed {}), skipping".format(reg_model, seed))
        return sorted_pred_accs, top1_val_test_accs

    # Identifying columns are sized by the rows actually selected (not by k), so
    # every column of the table stays the same length.
    sorted_pred_accs['Dataset'].extend([dataset] * n_rows)
    sorted_pred_accs['Epoch'].extend([subset] * n_rows)
    sorted_pred_accs['Train_Size'].extend([n] * n_rows)
    sorted_pred_accs['Seed'].extend([seed] * n_rows)
    sorted_pred_accs['Model'].extend([reg_model] * n_rows)
    sorted_pred_accs['K'].extend(range(1, n_rows + 1))
    sorted_pred_accs['Pred_Val_Acc'].extend(rows_best_preds['acc_valid_pred'].tolist())
    sorted_pred_accs['True_Val_Acc'].extend(rows_best_preds['acc_valid_true'].tolist())
    sorted_pred_accs['True_Test_Acc'].extend(rows_best_preds['acc_test_true'].tolist())

    if verbose: print("sorted_pred_accs GENERATED")

    # The best true accuracy among the first K rows is a running maximum, so it
    # is computed once instead of re-sorting a growing slice for every K.
    best_val = rows_best_preds['acc_valid_true'].cummax().tolist()
    best_test = rows_best_preds['acc_test_true'].cummax().tolist()

    # With fewer than k rows the maximum cannot change any more: repeat it.
    missing = k - n_rows
    if missing > 0:
        best_val.extend([best_val[-1]] * missing)
        best_test.extend([best_test[-1]] * missing)

    for rank, (top_val, top_test) in enumerate(zip(best_val, best_test), start=1):
        if verbose: print("### extracting top1 accs from k{} models".format(rank))

        top1_val_test_accs['Dataset'].append(dataset)
        top1_val_test_accs['Epoch'].append(subset)
        top1_val_test_accs['Train_Size'].append(n)
        top1_val_test_accs['Seed'].append(seed)
        top1_val_test_accs['Model'].append(reg_model)
        top1_val_test_accs['K'].append(rank)
        top1_val_test_accs['True_Val_Acc'].append(top_val)  # top1 acc
        top1_val_test_accs['True_Test_Acc'].append(top_test)  # top1 acc

    if verbose: print("top1_val_test_accs GENERATED\n")

    return sorted_pred_accs, top1_val_test_accs

In [12]:
def main():
    """Extract the k-best results for every dataset / epoch subset / training size.

    For each combination the predictions CSV is loaded (a missing file is
    reported and that combination is skipped), every seed and estimator is
    processed with ``get_results_k_best_archs``, and the two accumulated tables
    are written with ``save_results``.
    """
    sorted_columns = ('Dataset', 'Epoch', 'Train_Size', 'Seed', 'Model', 'K',
                      'Pred_Val_Acc', 'True_Val_Acc', 'True_Test_Acc')
    top1_columns = ('Dataset', 'Epoch', 'Train_Size', 'Seed', 'Model', 'K',
                    'True_Val_Acc', 'True_Test_Acc')
    banner = "\n\n\n########### {}, Epoch{}, N{}, Seed{} ###########"

    for dataset_name in args.dataset:
        for epochs in args.data_subset:
            for size in args.train_size:
                try:
                    predictions = get_dataframe(dataset_name, epochs, size)
                except FileNotFoundError as missing:
                    print(missing)
                    continue

                # Fresh accumulators: one pair of output files per dataset_epoch_n.
                sorted_table = {column: [] for column in sorted_columns}
                top1_table = {column: [] for column in top1_columns}

                for run_seed in args.seed:
                    print(banner.format(dataset_name, epochs, size, run_seed))

                    for estimator in args.estimators:
                        print(estimator)

                        sorted_table, top1_table = get_results_k_best_archs(
                            sorted_table, top1_table, predictions,
                            dataset_name, epochs, size, run_seed, estimator)

                save_results(sorted_table, dataset_name, epochs, size, 'sorted_pred')
                save_results(top1_table, dataset_name, epochs, size, 'top1_val_test')
                print("\n### CSV GENERATED ###")

In [13]:
if __name__ == '__main__':
    # Echo the effective configuration, one option per line, before running.
    for option in ('preds_path', 'results_path', 'k', 'dataset', 'data_subset',
                   'seed', 'train_size', 'estimators', 'verbose'):
        print(option + ": ", getattr(args, option))

    main()

preds_path:  /home/gean/nns_performance_prediction/results/fast/test12/
results_path:  /home/gean/nns_performance_prediction/results/fast/test12/
k:  5000
dataset:  ['cifar10valid', 'cifar100', 'imagenet16_120']
data_subset:  [4, 108, 200]
seed:  [0, 1, 10, 42, 100, 123, 666, 1000, 1234, 12345]
train_size:  [43, 86, 129, 344, 860]
estimators:  ['LinearRegression', 'SGDRegressor', 'Lasso', 'BayesianRidge', 'KNeighborsRegressor', 'DecisionTreeRegressor', 'SVR', 'RandomForestRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor', 'DummyRegressor']
verbose:  0



########### cifar10valid, Epoch4, N43, Seed0 ###########
LinearRegression
SGDRegressor
Lasso
BayesianRidge
KNeighborsRegressor
DecisionTreeRegressor
SVR
RandomForestRegressor
AdaBoostRegressor
GradientBoostingRegressor
DummyRegressor



########### cifar10valid, Epoch4, N43, Seed1 ###########
LinearRegression
SGDRegressor
Lasso
BayesianRidge
KNeighborsRegressor
DecisionTreeRegressor
SVR
RandomForestRegressor
AdaBoostRegresso