In [3]:
import argparse

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import os
import random
import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split

from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.dummy import DummyRegressor

from joblib import dump, load

In [4]:
# Run configuration. Every option has a default so the cell works unchanged
# inside a notebook; parse_known_args() (below) tolerates the extra arguments
# that the Jupyter kernel launcher puts on sys.argv.
parser = argparse.ArgumentParser()

parser.add_argument('--data_path', type=str, default='/home/gean/Code/nns_performance_prediction/meta_datasets/', help='location of the dataset')
parser.add_argument('--model_path', type=str, default='/home/gean/Code/nns_performance_prediction/saved_models/fast/test1/', help='path to save the trained models')
parser.add_argument('--results_path', type=str, default='/home/gean/Code/nns_performance_prediction/results/fast/test1/', help='location of the results directory')
parser.add_argument('--target', type=str, default='final_validation_accuracy', help='target of the training/test')

# nargs: '+' == 1 or more, '*' == 0 or more, '?' == 0 or 1.
# Every list-valued option below uses nargs='+' so that a value given on the
# command line is parsed into a list, the same shape as its default. Without
# it, `--data_subset 4` would yield the bare int 4 and the loops that iterate
# over args.data_subset / args.train_size would fail.
parser.add_argument('--data_subset', type=int, default=[4, 12, 36, 108], nargs='+',
                    help='one or more of the subsets from nasbench101 with 4, 12, 36, or 108 epochs')
parser.add_argument('--seed', type=int, default=[0, 42], nargs='+', help='seeds used for all the random procedures')
parser.add_argument('--train_size', type=int, default=[43, 86, 129, 172, 344, 860], nargs='+',
                    help='[Int, Int...] representing the total number of train samples')

parser.add_argument('--estimators', type=str, default=['LinearRegression', 'SGDRegressor', 'Lasso', 'BayesianRidge', 'KNeighborsRegressor', 'DecisionTreeRegressor',
                                                       'SVR', 'MLPRegressor', 'RandomForestRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor',
                                                       'DummyRegressor'], nargs='+', help='list of sklearn estimators to be used for training')

parser.add_argument('--features_drop', type=str, default=['module_adjacency', 'halfway_training_time', 'halfway_train_accuracy', 'halfway_validation_accuracy',
                                                          'halfway_test_accuracy', 'final_training_time', 'final_train_accuracy', 'final_test_accuracy'],
                    nargs='+', help='list of features to drop from nasbench101')

# `unknown` collects any arguments this parser does not define instead of
# aborting on them.
args, unknown = parser.parse_known_args()

In [5]:
def set_default_seed(seed):
    """Apply one seed to every random source this notebook touches.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable, then
    seeds Python's ``random`` module and NumPy's global generator with
    ``seed``.
    """
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [6]:
def get_preprocessed_numpy_dataset(data_subset):
    """Load one tabular nasbench101 CSV and return it as feature/target arrays.

    The file ``nasbench101_<data_subset>epochs_tabular.csv`` is read from
    ``args.data_path``. The columns listed in ``args.features_drop`` are
    removed, the remaining frame is passed through ``pd.get_dummies``, and the
    ``args.target`` column is separated from the rest.

    Parameters
    ----------
    data_subset : int or str
        Epoch count identifying the CSV file (used only to build the filename).

    Returns
    -------
    X : numpy.ndarray
        Every column except ``args.target``.
    y : numpy.ndarray
        The ``args.target`` column.
    """
    csv_name = 'nasbench101_' + str(data_subset) + 'epochs_tabular.csv'
    # os.path.join yields a valid path whether or not args.data_path ends with
    # a path separator; plain concatenation silently produced a wrong filename
    # when the trailing separator was missing.
    csv_path = os.path.join(args.data_path, csv_name)

    # Non-mutating drop: returns a new frame instead of editing in place.
    df_whole = pd.read_csv(csv_path).drop(columns=args.features_drop)
    df_whole = pd.get_dummies(df_whole)

    y = df_whole[args.target].to_numpy()
    X = df_whole.drop(columns=[args.target]).to_numpy()

    return X, y

In [7]:
def train_and_test_whole(estimator, X_train, y_train, X_test, y_test):
    """Fit ``estimator`` on the training data and predict on the test data.

    Parameters
    ----------
    estimator : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train :
        Data passed to ``estimator.fit``.
    X_test :
        Data passed to ``estimator.predict``.
    y_test :
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    y_pred :
        Whatever ``estimator.predict(X_test)`` returns.
    fit_time : float
        Seconds spent inside ``estimator.fit`` only (prediction is not timed).
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump (even
    # backwards) when the system time is adjusted.
    tic = time.perf_counter()
    estimator.fit(X_train, y_train)
    toc = time.perf_counter()
    print("Training DONE")
    y_pred = estimator.predict(X_test)
    print("Testing DONE\n\n")

    return y_pred, (toc - tic)

In [8]:
def load_estimator(file_name: str):
    """Load a persisted estimator from ``args.model_path``.

    Parameters
    ----------
    file_name : str
        File name without the ``.joblib`` extension.

    Returns
    -------
    object
        Whatever object was stored in ``<args.model_path>/<file_name>.joblib``.
    """
    # os.path.join works whether or not args.model_path ends with a separator.
    model_file = os.path.join(args.model_path, file_name + '.joblib')

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only point model_path at files you produced or trust.
    estimator = load(model_file)

    return estimator

In [9]:
def save_results(performance_dict, n, file_name):
    """Write ``performance_dict`` to ``<args.results_path>/N<n>_<file_name>.csv``.

    Parameters
    ----------
    performance_dict : dict
        Mapping of column name to column values, converted with
        ``pd.DataFrame.from_dict``.
    n :
        Training-set size; only used to build the file name.
    file_name : str
        Base name of the CSV file, without prefix or extension.

    The CSV is written without the index and with floats formatted as
    ``%.6f``.
    """
    # Create the output directory on demand so a long run does not fail at the
    # very end just because the folder is missing. An empty results_path means
    # "current directory", which needs no creation.
    if args.results_path:
        os.makedirs(args.results_path, exist_ok=True)

    # os.path.join works whether or not args.results_path ends with a separator.
    csv_path = os.path.join(args.results_path, "N" + str(n) + "_" + file_name + '.csv')

    df_results = pd.DataFrame.from_dict(performance_dict)
    df_results.to_csv(csv_path, index=False, float_format='%.6f')

In [10]:
def extract_predictions_and_fittime():
    """Re-fit every saved estimator and record its predictions and fit time.

    For each training-set size ``n`` in ``args.train_size`` and each
    combination of seed (``args.seed``) and data subset (``args.data_subset``):

    1. load the dataset and split it with ``train_size=n`` and
       ``random_state=seed``;
    2. min-max scale the features, fitting the scaler on the training split;
    3. for each name in ``args.estimators``, load the persisted estimator
       ``seed<seed>_subset<subset>_n<n>_<name>``, fit it on the training split
       and predict on the test split.

    Two CSV files are written per ``n`` through ``save_results``:
    ``nasbench_predictions`` (true targets plus one prediction column per
    run and estimator) and ``nasbench_fit_times`` (one column per estimator, one
    row per seed/subset run).
    """
    for n in args.train_size:
        predictions = {}
        # Filled lazily, one list per estimator that is actually evaluated.
        # A hard-coded set of 12 names left empty lists behind whenever only
        # some estimators were requested (unequal column lengths break the
        # DataFrame conversion) and raised KeyError for any other estimator.
        fit_times = {}

        for seed in args.seed:
            set_default_seed(seed)

            for subset in args.data_subset:
                X, y = get_preprocessed_numpy_dataset(subset)
                print("\n\n######### Seed", seed, ", Subset", subset, ", N", n)

                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, train_size=n, random_state=seed, shuffle=True)

                # The scaler is fitted on the training split only and then
                # applied to both splits.
                min_max_scaler = MinMaxScaler()
                min_max_scaler.fit(X_train)
                X_train = min_max_scaler.transform(X_train)
                X_test = min_max_scaler.transform(X_test)

                # Shared prefix of every column name produced by this run.
                run_tag = "Seed" + str(seed) + "_Subset" + str(subset) + "_N" + str(n)
                predictions[run_tag + "_Ytrue"] = y_test

                for reg in args.estimators:
                    estimator = load_estimator(
                        "seed" + str(seed) + "_subset" + str(subset) + "_n" + str(n) + "_" + reg)
                    # Class name of the loaded object; does not depend on the
                    # repr containing a '(' as the previous slicing did.
                    estimator_name = type(estimator).__name__
                    print("\n", estimator_name)

                    y_pred, fit_time = train_and_test_whole(estimator, X_train, y_train, X_test, y_test)

                    predictions[run_tag + "_" + estimator_name + "_Ypred"] = y_pred
                    fit_times.setdefault(estimator_name, []).append(fit_time)

        #one file per n
        save_results(predictions, n, "nasbench_predictions")
        save_results(fit_times, n, "nasbench_fit_times")

In [11]:
if __name__ == '__main__':
    # Echo the effective configuration before the run starts, so the values
    # that produced the result files are visible in the cell output.
    run_config = [
        ("data_path: ", args.data_path),
        ("model_path: ", args.model_path),
        ("results_path: ", args.results_path),
        ("target: ", args.target),
        ("data_subset: ", args.data_subset),
        ("seed: ", args.seed),
        ("train_size: ", args.train_size),
        ("estimators: ", args.estimators),
        ("features drop: ", args.features_drop),
    ]
    for label, value in run_config:
        print(label, value)

    extract_predictions_and_fittime()

data_path:  /home/gean/Code/nns_performance_prediction/meta_datasets/
model_path:  /home/gean/Code/nns_performance_prediction/saved_models/fast/test1/
results_path:  /home/gean/Code/nns_performance_prediction/saved_models/fast/test1/
target:  final_validation_accuracy
data_subset:  [4, 12, 36, 108]
seed:  [0, 42]
train_size:  [43, 86, 129, 172, 344, 860]
estimators:  ['LinearRegression', 'SGDRegressor', 'Lasso', 'BayesianRidge', 'KNeighborsRegressor', 'DecisionTreeRegressor', 'SVR', 'MLPRegressor', 'RandomForestRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor', 'DummyRegressor']
features drop:  ['module_adjacency', 'halfway_training_time', 'halfway_train_accuracy', 'halfway_validation_accuracy', 'halfway_test_accuracy', 'final_training_time', 'final_train_accuracy', 'final_test_accuracy']


######### Seed 0 , Subset 4 , N 43

 LinearRegression
Training DONE
Testing DONE



 SGDRegressor
Training DONE
Testing DONE



 Lasso
Training DONE
Testing DONE



 BayesianRidge
Trainin

KeyboardInterrupt: 