In [9]:
import argparse

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Notebook configuration expressed as an argparse parser, so the same cell
# works when the notebook is exported and run as a script.
parser = argparse.ArgumentParser()

# NOTE(review): both defaults are absolute paths on one developer's machine;
# override them with --data_path / --results_path when running elsewhere.
parser.add_argument('--data_path', type=str, default='/home/gean/Code/nns_performance_prediction/results/', help='location of the dataset')
parser.add_argument('--results_path', type=str, default='/home/gean/Code/nns_performance_prediction/results/', help='location of the results directory')

# Every option below is iterated over as a list further down, so each one
# needs nargs='+'. Without it, a value given on the command line is parsed as
# a single scalar (not iterable) and several values are rejected outright.
parser.add_argument('--data_subset', type=int, default=[4, 12], nargs='+', help='one or more of the nasbench101 subsets with 4, 12, 36, or 108 epochs')
parser.add_argument('--seed', type=int, default=[0, 1], nargs='+', help='seeds used for all the random procedures')
parser.add_argument('--train_size', type=int, default=[43], nargs='+', help='[Int, Int...] representing the total number of train samples')
parser.add_argument('--estimators', type=str, default=['LinearRegression', 'SGDRegressor', 'Lasso', 'BayesianRidge', 'KNeighborsRegressor', 'DecisionTreeRegressor',
                                                       'SVR', 'MLPRegressor', 'RandomForestRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor',
                                                       'DummyRegressor'], nargs='+', help='list of sklearn estimators to be used for training')

# parse_known_args (rather than parse_args) tolerates the extra arguments the
# Jupyter kernel launcher puts in sys.argv; they end up in `unknown`.
args, unknown = parser.parse_known_args()

In [3]:
def set_default_seed(seed):
    """Seed the random sources used by this notebook with ``seed``.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and seeds
    both the standard-library ``random`` module and NumPy's legacy global
    generator.

    NOTE(review): the interpreter reads ``PYTHONHASHSEED`` at start-up, so
    setting it here presumably only influences processes launched afterwards,
    not hashing in the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same value for every generator, applied in a fixed order.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

In [4]:
def save_results(performance_dict, file_name):
    """Write a dictionary of metric lists to ``<results_path>/<file_name>.csv``.

    Parameters
    ----------
    performance_dict : dict
        Maps a column name to a list of values; it is turned into a DataFrame
        with ``pd.DataFrame.from_dict``, so all lists must have equal length.
    file_name : str
        Output file name without the ``.csv`` extension.

    The target directory is read from the module-level ``args.results_path``.
    The file is written without the index and with floats formatted to six
    decimal places.
    """
    df_results = pd.DataFrame.from_dict(performance_dict)

    # os.path.join inserts the separator only when it is missing, so the
    # result is the same for a path with a trailing slash and correct for one
    # without (plain concatenation produced ".../resultsN43_...csv").
    output_file = os.path.join(args.results_path, file_name + '.csv')

    df_results.to_csv(output_file, index=False, float_format='%.6f')

In [26]:
def extract_statistics():
    """Compute MAE, MSE and R2 for every stored prediction column and save them.

    For each train size ``n`` in ``args.train_size`` the file
    ``N<n>_nasbench_predictions.csv`` is read from ``args.data_path``. For
    every (seed, subset) pair taken from ``args.seed`` and ``args.data_subset``
    the column ``Seed<seed>_Subset<subset>_N<n>_Ytrue`` is compared against
    ``Seed<seed>_Subset<subset>_N<n>_<estimator>_Ypred`` for each estimator in
    ``args.estimators``. The three metrics are appended to per-estimator
    lists, so each output column ends up with one value per (seed, subset)
    pair, in iteration order (seeds outermost).

    One CSV per ``n`` is written through ``save_results`` under the name
    ``N<n>_nasbench_metrics``.

    Raises
    ------
    KeyError
        If an expected ``Ytrue``/``Ypred`` column is missing from the CSV.
    """
    # Drop repeated estimator names while keeping their order; a duplicate
    # would append twice to one list and make the columns unequal in length.
    estimators = list(dict.fromkeys(args.estimators))

    for n in args.train_size:
        # Build the accumulators from the requested estimators instead of a
        # hard-coded name list: a subset of estimators used to leave empty
        # lists behind (breaking the DataFrame construction on save) and an
        # unlisted estimator raised KeyError.
        mae_values = {reg + '_MAE': [] for reg in estimators}
        mse_values = {reg + '_MSE': [] for reg in estimators}
        r2_values = {reg + '_R2': [] for reg in estimators}

        # os.path.join tolerates a data_path with or without a trailing slash.
        df_whole = pd.read_csv(os.path.join(args.data_path, 'N' + str(n) + '_nasbench_predictions.csv'))

        for seed in args.seed:
            set_default_seed(seed)

            for subset in args.data_subset:
                column_prefix = 'Seed' + str(seed) + '_Subset' + str(subset) + '_N' + str(n)
                y_true_column = column_prefix + '_Ytrue'
                y_true = df_whole[y_true_column]

                print("\nN{}, Seed{}, Subset{}: \n".format(n, seed, subset))

                for reg in estimators:
                    y_pred_column = column_prefix + '_' + str(reg) + '_Ypred'
                    y_pred = df_whole[y_pred_column]

                    mae_values[reg + '_MAE'].append(mean_absolute_error(y_true, y_pred))
                    mse_values[reg + '_MSE'].append(mean_squared_error(y_true, y_pred))
                    r2_values[reg + '_R2'].append(r2_score(y_true, y_pred))

                    print(reg)
                    print("y_true column: {}".format(y_true_column))
                    print("y_pred column: {} \n".format(y_pred_column))

        # Column order in the output: all MAE columns, then MSE, then R2.
        performance_metrics = {**mae_values, **mse_values, **r2_values}

        print("performance_metrics dict: \n{}".format(performance_metrics))

        # one file per n
        save_results(performance_metrics, "N" + str(n) + "_nasbench_metrics")

In [27]:
if __name__ == '__main__':

    # Echo the effective configuration, one option per line, before running.
    for option in ('data_path', 'results_path', 'data_subset',
                   'seed', 'train_size', 'estimators'):
        print(option + ": ", getattr(args, option))

    # Compute the metrics and write one CSV per train size.
    extract_statistics()

data_path:  /home/gean/Code/nns_performance_prediction/results/
results_path:  /home/gean/Code/nns_performance_prediction/results/
data_subset:  [4, 12]
seed:  [0, 1]
train_size:  [43]
estimators:  ['LinearRegression', 'SGDRegressor', 'Lasso', 'BayesianRidge', 'KNeighborsRegressor', 'DecisionTreeRegressor', 'SVR', 'MLPRegressor', 'RandomForestRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor', 'DummyRegressor']

N43, Seed0, Subset4: 

LinearRegression
y_true column: Seed0_Subset4_N43_Ytrue
y_pred column: Seed0_Subset4_N43_LinearRegression_Ypred 

SGDRegressor
y_true column: Seed0_Subset4_N43_Ytrue
y_pred column: Seed0_Subset4_N43_SGDRegressor_Ypred 

Lasso
y_true column: Seed0_Subset4_N43_Ytrue
y_pred column: Seed0_Subset4_N43_Lasso_Ypred 

BayesianRidge
y_true column: Seed0_Subset4_N43_Ytrue
y_pred column: Seed0_Subset4_N43_BayesianRidge_Ypred 

KNeighborsRegressor
y_true column: Seed0_Subset4_N43_Ytrue
y_pred column: Seed0_Subset4_N43_KNeighborsRegressor_Ypred 

DecisionTre