In [2]:
import argparse

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import os
import random
import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split

from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.dummy import DummyRegressor

from joblib import dump, load

In [15]:
# Command-line / notebook configuration.
parser = argparse.ArgumentParser()

# NOTE(review): the three path defaults are absolute paths on one machine;
# override them on the command line when running elsewhere.
parser.add_argument('--data_path', type=str, default='/home/gean/Code/nns_performance_prediction/meta_datasets/', help='location of the dataset')
parser.add_argument('--model_path', type=str, default='/home/gean/Code/nns_performance_prediction/saved_models/fast/test1/', help='path to save the trained models')
parser.add_argument('--results_path', type=str, default='/home/gean/Code/nns_performance_prediction/results/fast/test1/', help='location of the results directory')
parser.add_argument('--target', type=str, default='final_validation_accuracy', help='target of the training/test')

# nargs: '+' == 1 or more, '*' == 0 or more, '?' == 0 or 1.
# Every option that run() iterates over is declared with nargs='+' so a value
# given on the command line is parsed into a list, like the list defaults.
# Without nargs, "--data_subset 36" would yield the bare int 36 and the
# "for subset in args.data_subset" loop would fail.
parser.add_argument('--data_subset', type=int, default=[4, 12], nargs='+', help='one of the subsets from nasbench101 with 4, 12, 36, or 108 epochs')
parser.add_argument('--seed', type=int, default=[0, 42], nargs='+', help='seeds used for all the random procedures')
parser.add_argument('--train_size', type=int, default=[43, 86], nargs='+', help='[Int, Int...] representing the total number of train samples')

parser.add_argument('--estimators', type=str, default=['LinearRegression', 'SGDRegressor', 'Lasso', 'BayesianRidge', 'KNeighborsRegressor', 'DecisionTreeRegressor',
                                                       'SVR', 'MLPRegressor', 'RandomForestRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor',
                                                       'DummyRegressor'], nargs='+', help='list of sklearn estimators to be used for training')

# final_test_accuracy included for extraction purposes
parser.add_argument('--features_drop', type=str, default=[], nargs='+', help='list of features to drop from nasbench101')

# parse_known_args() tolerates unrecognised argv entries instead of exiting
# (presumably so the notebook kernel's own arguments are ignored - confirm).
args, unknown = parser.parse_known_args()

In [4]:
def set_default_seed(seed):
    """Seed the random number sources used by this notebook.

    Seeds NumPy's legacy global generator and Python's ``random`` module, and
    exports the seed as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: value passed to ``np.random.seed`` and ``random.seed``; its
            ``str()`` form is stored in the environment.
    """
    np.random.seed(seed)
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so this assignment likely only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [17]:
def get_preprocessed_numpy_dataset(data_subset):
    """Load one tabular NAS-Bench-101 CSV and return it as NumPy arrays.

    Reads ``nasbench101_<data_subset>epochs_tabular.csv`` from
    ``args.data_path``, drops the columns listed in ``args.features_drop``,
    one-hot encodes the remaining object columns with ``pd.get_dummies`` and
    separates the ``args.target`` column from the rest.

    Args:
        data_subset: value inserted into the file name (the epoch budget,
            according to the ``--data_subset`` help text).

    Returns:
        tuple: ``(X, y)`` where ``X`` holds every column except the target
        and ``y`` holds the target column.
    """
    # Column -> dtype map; its keys double as the list of columns to read.
    column_dtypes = {"module_adjacency": 'object', "module_operations": 'object', "trainable_parameters": 'uint8',
                     "conv_num_layers": 'uint8', "conv_kernel_min": 'uint8', "conv_kernel_max": 'uint8',
                     "conv_kernel_mode": 'uint8', "maxpool_num_layers": 'uint8',
                     "final_validation_accuracy": 'float16', "final_test_accuracy": 'float16'}

    # os.path.join works whether or not data_path ends with a separator.
    csv_path = os.path.join(args.data_path, 'nasbench101_' + str(data_subset) + 'epochs_tabular.csv')
    df_whole = pd.read_csv(csv_path, dtype=column_dtypes, usecols=list(column_dtypes))
    df_whole = df_whole.drop(columns=args.features_drop)
    print(df_whole.dtypes)

    # Pin the indicator dtype: pandas >= 2.0 defaults get_dummies to bool
    # columns, and mixing bool with numeric columns makes to_numpy() fall back
    # to an object array. uint8 was the default in earlier pandas versions.
    df_encoded = pd.get_dummies(df_whole, dtype=np.uint8)

    y = df_encoded[args.target].to_numpy()
    # NOTE(review): run() picks final_test_accuracy out of X by column
    # position, so the column order produced here matters to that caller.
    X = df_encoded.drop(columns=[args.target]).to_numpy()

    return X, y

In [6]:
def preds_time_extraction(estimator, X_train, y_train, X_test, y_test):
    """Fit an estimator, time the fit, and predict on the test features.

    Args:
        estimator: object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: training features passed to ``fit``.
        y_train: training targets passed to ``fit``.
        X_test: features passed to ``predict``.
        y_test: unused; kept so existing callers keep working.

    Returns:
        tuple: ``(y_pred, fit_seconds)`` - the predictions for ``X_test`` and
        the time spent inside ``fit`` in seconds.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # duration cannot be skewed by system clock adjustments as with time.time().
    fit_start = time.perf_counter()
    estimator.fit(X_train, y_train)
    fit_seconds = time.perf_counter() - fit_start
    print("Training DONE")

    y_pred = estimator.predict(X_test)
    print("Testing DONE\n")

    return y_pred, fit_seconds

In [7]:
def load_estimator(file_name: str):
    """Load a saved estimator from ``args.model_path``.

    Args:
        file_name: file name without the ``.joblib`` extension.

    Returns:
        The object deserialised by ``joblib.load``.
    """
    # os.path.join works whether or not model_path ends with a separator.
    # NOTE: joblib files are pickle-based; only load files from a trusted source.
    model_file = os.path.join(args.model_path, file_name + '.joblib')

    return load(model_file)

In [8]:
def save_results(performance_dict, subset, file_name):
    """Write a dict of result columns to ``Subset<subset>_<file_name>.csv``.

    The file is created inside ``args.results_path`` without an index column
    and with floats formatted to six decimals.

    Args:
        performance_dict: mapping of column name -> sequence of values. The
            sequences may differ in length; shorter columns are padded with
            NaN (written as empty cells).
        subset: value inserted into the file name after ``Subset``.
        file_name: suffix of the file name, without extension.
    """
    # Wrapping every column in a Series lets pandas align them on a shared
    # index. A plain DataFrame.from_dict() raises ValueError when the columns
    # have different lengths, which happens for the predictions collected in
    # run(): the test split size changes with the train size.
    df_results = pd.DataFrame({column: pd.Series(values) for column, values in performance_dict.items()})

    # os.path.join works whether or not results_path ends with a separator.
    csv_path = os.path.join(args.results_path, "Subset" + str(subset) + "_" + file_name + '.csv')
    df_results.to_csv(csv_path, index=False, float_format='%.6f')
    

In [9]:
def run():
    """Re-fit every saved estimator on each split and export the results.

    For each entry of ``args.data_subset`` the dataset is loaded once. For
    every combination of ``args.train_size`` and ``args.seed`` the data is
    split, min-max scaled (scaler fitted on the training part only), and each
    estimator named in ``args.estimators`` is loaded from disk, fitted and
    evaluated. Per subset, three CSV files are written through
    ``save_results``: predictions, fit times and MAE/MSE/R2 metrics.
    """
    for subset in args.data_subset:
        X, y = get_preprocessed_numpy_dataset(subset)

        # Result containers are derived from the requested estimators, so a
        # non-default --estimators list neither leaves empty columns behind
        # nor hits a missing key. With the default list the keys and their
        # order match the previously hard-coded dictionaries.
        predictions = {}
        fit_times = {reg: [] for reg in args.estimators}
        mae_values = {reg + '_MAE': [] for reg in args.estimators}
        mse_values = {reg + '_MSE': [] for reg in args.estimators}
        r2_values = {reg + '_R2': [] for reg in args.estimators}

        for n in args.train_size:

            for seed in args.seed:
                set_default_seed(seed)
                print("\n\n######### Seed{}, Subset{}, N{} #########".format(seed, subset, n))

                X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=n, random_state=seed, shuffle=True)

                # NOTE(review): column 1 is assumed to hold final_test_accuracy.
                # That depends on the column order returned by
                # get_preprocessed_numpy_dataset and on --features_drop /
                # --target keeping their defaults - confirm before changing them.
                X_train = np.delete(X_train, 1, axis=1)  # del final_test_acc
                final_test_acc = X_test[:, 1]  # save final_test_acc
                X_test = np.delete(X_test, 1, axis=1)  # del final_test_acc

                # Fit the scaler on the training split only, then apply it to both.
                min_max_scaler = MinMaxScaler()
                min_max_scaler.fit(X_train)
                X_train = min_max_scaler.transform(X_train)
                X_test = min_max_scaler.transform(X_test)

                # Common prefix of every prediction column produced by this split.
                run_tag = 'Seed' + str(seed) + "_Subset" + str(subset) + "_N" + str(n)
                predictions[run_tag + "_True_Test_Acc"] = final_test_acc  # final_test_acc
                predictions[run_tag + "_True_Val_Acc"] = y_test  # final_val_acc

                for reg in args.estimators:
                    estimator = load_estimator("seed" + str(seed) + "_subset" + str(subset) + "_n" + str(n) + "_" + reg)
                    # Class name as shown in the estimator's repr, e.g. "SVR(...)" -> "SVR".
                    estimator_repr = str(estimator)
                    estimator_name = estimator_repr[:estimator_repr.index('(')]
                    print("\n", estimator_name)

                    y_pred, fit_time = preds_time_extraction(estimator, X_train, y_train, X_test, y_test)

                    # save results in dicts
                    predictions[run_tag + "_" + estimator_name + "_Pred_Val_Acc"] = y_pred
                    # Keyed by the requested name, like the metric dicts below,
                    # so a loaded object whose repr differs cannot cause a KeyError.
                    fit_times[reg].append(fit_time)

                    mae_values[reg + '_MAE'].append(mean_absolute_error(y_test, y_pred))
                    mse_values[reg + '_MSE'].append(mean_squared_error(y_test, y_pred))
                    r2_values[reg + '_R2'].append(r2_score(y_test, y_pred))

        performance_metrics = {**mae_values, **mse_values, **r2_values}

        # One file of each kind per subset; all seeds and train sizes are
        # accumulated in it.
        # NOTE(review): columns of `predictions` differ in length across train
        # sizes because the test split shrinks as n grows; the DataFrame built
        # in save_results has to cope with ragged columns.
        save_results(predictions, subset, "nasbench_predictions")
        save_results(fit_times, subset, "nasbench_fit_times")
        save_results(performance_metrics, subset, "nasbench_metrics")
        print("\n#########################################")
        print("####### Results for Subset{} GENERATED #######".format(subset))
        print("#########################################\n")
        

In [None]:
if __name__ == '__main__':
    # Echo the effective configuration before starting, one "label:  value"
    # line per option, then launch the experiment.
    config_report = (
        ("data_path: ", args.data_path),
        ("model_path: ", args.model_path),
        ("results_path: ", args.results_path),
        ("target: ", args.target),
        ("data_subset: ", args.data_subset),
        ("seed: ", args.seed),
        ("train_size: ", args.train_size),
        ("estimators: ", args.estimators),
        ("features drop: ", args.features_drop),
    )
    for label, value in config_report:
        print(label, value)

    run()

data_path:  /home/gean/Code/nns_performance_prediction/meta_datasets/
model_path:  /home/gean/Code/nns_performance_prediction/saved_models/fast/test1/
results_path:  /home/gean/Code/nns_performance_prediction/results/fast/test1/
target:  final_validation_accuracy
data_subset:  [4, 12]
seed:  [0, 42]
train_size:  [43, 86]
estimators:  ['LinearRegression', 'SGDRegressor', 'Lasso', 'BayesianRidge', 'KNeighborsRegressor', 'DecisionTreeRegressor', 'SVR', 'MLPRegressor', 'RandomForestRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor', 'DummyRegressor']
features drop:  []
module_adjacency              object
module_operations             object
trainable_parameters           uint8
final_validation_accuracy    float16
final_test_accuracy          float16
conv_num_layers                uint8
conv_kernel_min                uint8
conv_kernel_max                uint8
conv_kernel_mode               uint8
maxpool_num_layers             uint8
dtype: object


######### Seed0, Subset4, N43 ##