In [127]:
import os
import sys

sys.path.append("../")

import pickle

from sklearn.model_selection import train_test_split
from sklearn import datasets
import pandas as pd

from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, f1_weighted, precision_macro, precision_weighted, \
    recall_macro, recall_weighted
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

from util import read_dataset, get_dataset_paths, create_result_directory

from typing import Dict, Any
import hashlib
import json
import shutil 


def dict_hash(dictionary: Dict[str, Any]) -> str:
    """Return the hexadecimal MD5 digest of ``dictionary``.

    The dictionary is serialised to JSON with its keys sorted, so two
    dictionaries holding the same items hash identically regardless of
    insertion order ({'a': 1, 'b': 2} and {'b': 2, 'a': 1} give one digest).
    """
    canonical_json = json.dumps(dictionary, sort_keys=True)
    return hashlib.md5(canonical_json.encode()).hexdigest()


def metric_list():
    """Return a fresh list with the eight auto-sklearn scorers used in this notebook.

    Order: accuracy, balanced accuracy, then the macro/weighted variants of
    F1, precision and recall.
    """
    overall = [accuracy, balanced_accuracy]
    f1_scores = [f1_macro, f1_weighted]
    precision_scores = [precision_macro, precision_weighted]
    recall_scores = [recall_macro, recall_weighted]
    return overall + f1_scores + precision_scores + recall_scores


def apply_metrics(y_true, y_pred, tag):
    """Score ``y_pred`` against ``y_true`` with every scorer from ``metric_list``.

    Returns a dict keyed by ``"<scorer name>_<tag>"`` whose values are the
    results of calling each scorer with ``(y_true, y_pred)``.
    """
    scores = {}
    for scorer in metric_list():
        key = scorer.name + "_" + tag
        scores[key] = scorer(y_true, y_pred)
    return scores


def update_dicts(dict_list):
    """Merge the dictionaries in ``dict_list`` into one new dictionary.

    Dictionaries are applied from first to last, so when a key occurs more
    than once the value from the later dictionary wins.  An empty input
    yields an empty dict.

    The input is only iterated: it is left untouched (the previous
    implementation emptied the caller's list with ``pop()``), and the merge is
    iterative, so long inputs cannot hit the recursion limit.
    """
    merged = {}
    for mapping in dict_list:
        merged.update(mapping)
    return merged


def save_experiment(result_directory, dataset_name, obj):
    """Write ``obj.cv_results_`` and ``obj.performance_over_time_`` to CSV files.

    Each attribute is wrapped in a DataFrame and saved without the index column
    under ``<result_directory>/<dataset_name><suffix>``.

    NOTE(review): the second suffix has no "_" separating it from the dataset
    name, unlike the first one -- confirm whether that is intended before
    relying on the file name.
    """
    outputs = (
        ("_cv_results_.csv", "cv_results_"),
        ("performance_over_time_.csv", "performance_over_time_"),
    )
    for suffix, attribute in outputs:
        target_path = result_directory + "/" + dataset_name + suffix
        pd.DataFrame(getattr(obj, attribute)).to_csv(target_path, index=False)


def ger_dataset_name(dataset_path):
    """Derive the ``dataset_<id>`` name from a dataset path.

    Takes the text that follows the first ``"dataset_"`` in ``dataset_path``,
    cuts it at the first ``".pkl"`` and puts the ``"dataset_"`` prefix back.
    """
    after_prefix = dataset_path.split("dataset_")[1]
    identifier = after_prefix.split(".pkl")[0]
    return "dataset_" + identifier


def ger_directory_name(directory, dataset_name, seed):
    """Build the result directory path ``<directory>/<dataset_name>/<seed>``."""
    parts = (directory, dataset_name, str(seed))
    return "/".join(parts)


def ger_tmp_fold_name(dataset_name, time_left_for_this_task, seed):
    """Build the temp-folder name ``auto-sklearn-<dataset>_<time>_<seed>``."""
    prefix = "auto-sklearn-" + dataset_name
    return "_".join((prefix, str(time_left_for_this_task), str(seed)))


def create_result_directory(directory_name):
    """Create ``directory_name`` (and any missing parents) if it does not exist.

    Uses ``exist_ok=True`` instead of a separate existence check, so two jobs
    creating the same directory at the same time cannot fail with
    ``FileExistsError`` between the check and the creation.  If the path exists
    but is not a directory, ``os.makedirs`` raises ``FileExistsError``.

    Note: this definition shadows the ``create_result_directory`` imported
    from ``util`` at the top of the cell.
    """
    os.makedirs(directory_name, exist_ok=True)


def if_result_directory_exit(directory_name):
    """Stop the program with status 0 when ``directory_name`` already exists.

    An existing result directory is treated as "experiment already done": a
    message is printed and ``SystemExit(0)`` is raised.  Otherwise the function
    returns ``None``.

    ``sys.exit`` is used rather than the ``exit`` helper, which is only
    installed by the ``site`` module for interactive use and is rebound by
    IPython inside notebooks.
    """
    if not os.path.exists(directory_name):
        return

    print("Experiment finished")
    sys.exit(0)


def _evaluate_fold(estimator, config, dataset_name, base_record, fold_train, fold_val, holdout):
    """Fit ``config`` on one CV fold and return a one-row frame describing the run.

    ``fold_train``, ``fold_val`` and ``holdout`` are ``(X, y)`` pairs.  The
    pipeline is fitted on ``fold_train`` with ``fold_val`` handed to
    ``fit_pipeline`` as its test data.  The row starts from ``base_record`` and
    gains the run's timing/status, its seed/budget, the configuration's
    hyperparameters and -- when ``fit_pipeline`` returned a pipeline -- every
    ``apply_metrics`` score on the three pairs (tags ``train``/``val``/``test``).
    """
    pipeline, run_info, run_value = estimator.fit_pipeline(
        X=fold_train[0],
        y=fold_train[1],
        dataset_name=dataset_name,
        config=config,
        X_test=fold_val[0],
        y_test=fold_val[1],
    )

    record = dict(base_record)
    record.update({
        "duration": run_value.time,
        "start_time": run_value.starttime,
        "end_time": run_value.endtime,
        "status": str(run_value.status),
        "seed": run_info.seed,
        "budget": run_info.budget,
    })
    record.update(run_info.config.get_dictionary())

    # No pipeline was returned: keep the row, but without metric columns.
    if pipeline is not None:
        for tag, (X_part, y_part) in (("train", fold_train), ("val", fold_val), ("test", holdout)):
            record.update(apply_metrics(y_part, pipeline.predict(X_part), tag))

    # reset_index() adds an "index" column; it is kept so new rows line up with
    # the "index" column already present in previously written result files.
    return pd.Series(record).to_frame().T.reset_index()


def generate_pipelines(
        dataset_path,
        result_directory,
        time_left_for_this_task=120,
        per_run_time_limit=30,
        memory_limit=10240,
        resampling_strategy="holdout",
        seed=1,
        number_of_configs=2,
        n_splits=10
):
    """Sample pipeline configurations and cross-validate each one on a dataset.

    The dataset is split once into train/test (stratified, seeded).  For each
    of ``number_of_configs`` configurations sampled from the auto-sklearn
    configuration space (seeded with ``seed``), the configuration is fitted on
    every fold of a stratified ``n_splits``-fold split of the training part and
    scored on the fold's train and validation parts and on the held-out test
    part.  One row per fold is appended to
    ``<result_directory>/<dataset name>/<seed>/<dataset name>_cv_results_iter.csv``.

    Configurations whose hash is already present in that CSV are skipped, so an
    interrupted run can be resumed.  If any fold of a configuration raises, the
    error is printed and none of that configuration's rows are written.

    Parameters
    ----------
    dataset_path : path of the pickled dataset; must contain ``dataset_<id>``.
    result_directory : root directory for the result files.
    time_left_for_this_task, per_run_time_limit, memory_limit,
    resampling_strategy : forwarded unchanged to ``AutoSklearnClassifier``.
    seed : seed for the train/test split, the fold split, the configuration
        space and the estimator.
    number_of_configs : how many configurations to sample and evaluate.
    n_splits : number of cross-validation folds per configuration.
    """
    dataset_name = ger_dataset_name(dataset_path)
    directory_name = ger_directory_name(result_directory, dataset_name, seed)
    # The helper's second argument is the time budget; the previous call passed
    # number_of_configs there, so the folder name did not reflect the budget.
    tmp_folder_name = ger_tmp_fold_name(dataset_name, time_left_for_this_task, seed)
    df_path = directory_name + "/" + dataset_name + "_cv_results_iter.csv"

    X, y, _, _ = read_dataset(dataset_path)
    # Replace the categorical target with its integer codes.
    y = y.cat.codes

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y, random_state=seed)

    create_result_directory(directory_name)

    for config_id in range(number_of_configs):

        # A new estimator is built for every configuration; its temp folder is
        # removed again at the end of the iteration.
        estimator = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            per_run_time_limit=per_run_time_limit,
            memory_limit=memory_limit,
            initial_configurations_via_metalearning=0,
            resampling_strategy=resampling_strategy,
            scoring_functions=metric_list(),
            tmp_folder=tmp_folder_name,
            delete_tmp_folder_after_terminate=False,
            seed=seed,
            ensemble_class=None
        )

        # Re-seeding before sampling makes every iteration draw the same list,
        # so configs[config_id] is stable across iterations and across resumes.
        cs = estimator.get_configuration_space(X_train, y_train, dataset_name=dataset_name)
        cs.seed(seed)
        configs = cs.sample_configuration(number_of_configs)
        if not isinstance(configs, list):
            # Some ConfigSpace versions return a bare Configuration for size 1.
            configs = [configs]
        config = configs[config_id]

        # Results written so far (None on the very first configuration).
        result_df = pd.read_csv(df_path) if os.path.exists(df_path) else None

        # Skip configurations that were already evaluated in a previous run.
        config_hash = dict_hash(config.get_dictionary())
        if result_df is not None and config_hash in result_df["config_hash"].unique():
            shutil.rmtree(tmp_folder_name, ignore_errors=True)
            continue

        fold_frames = []
        try:
            config.is_valid_configuration()
            skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
            for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train), start=1):
                fold_train = (X_train.iloc[train_index, :], y_train.iloc[train_index])
                fold_val = (X_train.iloc[val_index, :], y_train.iloc[val_index])
                base_record = {
                    "seed_i": seed,
                    "config_id": config_id,
                    "fold": fold,
                    "config_hash": config_hash
                }
                fold_frames.append(
                    _evaluate_fold(estimator, config, dataset_name, base_record,
                                   fold_train, fold_val, (X_test, y_test))
                )
        except Exception as e:
            # Best effort: a failing configuration must not abort the whole run.
            # Its partial fold results are dropped because nothing is written.
            print(f"config {config_id} ({config_hash}) failed: {e}")
            continue
        finally:
            shutil.rmtree(tmp_folder_name, ignore_errors=True)

        # Persist after every successful configuration so a resume loses nothing.
        frames = fold_frames if result_df is None else [result_df] + fold_frames
        pd.concat(frames).to_csv(df_path, index=False)

    

In [128]:
# Uncomment to delete an auto-sklearn temp folder left behind by an interrupted run.
# !rm -rf auto-sklearn-dataset_1044_120_1

# Smoke test on one dataset: number_of_configs and n_splits keep their defaults
# (2 configurations, 10 folds). Results are written below
# ../../results/pipeline_generation_test.
generate_pipelines(
    dataset_path = "../../datasets/training/dataset_1044.pkl",
    result_directory = "../../results/pipeline_generation_test",
    time_left_for_this_task=120,
    per_run_time_limit=120,
    memory_limit=10240,
    resampling_strategy="holdout",
    seed=1,
)

In [129]:
# Show up to 500 columns: the result table has one column per hyperparameter and metric.
# pd.set_option('max_columns', None)
pd.set_option('display.max_columns', 500)
# Per-fold results written by generate_pipelines for dataset_1044, seed 1.
d = pd.read_csv("../../results/pipeline_generation_test/dataset_1044/1/dataset_1044_cv_results_iter.csv")
d

Unnamed: 0,index,seed_i,config_id,fold,config_hash,duration,start_time,end_time,status,seed,budget,balancing:strategy,classifier:__choice__,data_preprocessor:__choice__,feature_preprocessor:__choice__,classifier:passive_aggressive:C,classifier:passive_aggressive:average,classifier:passive_aggressive:fit_intercept,classifier:passive_aggressive:loss,classifier:passive_aggressive:tol,data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__,data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__,data_preprocessor:feature_type:numerical_transformer:imputation:strategy,data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__,feature_preprocessor:polynomial:degree,feature_preprocessor:polynomial:include_bias,feature_preprocessor:polynomial:interaction_only,data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction,accuracy_train,balanced_accuracy_train,f1_macro_train,f1_weighted_train,precision_macro_train,precision_weighted_train,recall_macro_train,recall_weighted_train,accuracy_val,balanced_accuracy_val,f1_macro_val,f1_weighted_val,precision_macro_val,precision_weighted_val,recall_macro_val,recall_weighted_val,accuracy_test,balanced_accuracy_test,f1_macro_test,f1_weighted_test,precision_macro_test,precision_weighted_test,recall_macro_test,recall_weighted_test,classifier:libsvm_svc:C,classifier:libsvm_svc:gamma,classifier:libsvm_svc:kernel,classifier:libsvm_svc:max_iter,classifier:libsvm_svc:shrinking,classifier:libsvm_svc:tol,feature_preprocessor:random_trees_embedding:bootstrap,feature_preprocessor:random_trees_embedding:max_depth,feature_preprocessor:random_trees_embedding:max_leaf_nodes,feature_preprocessor:random_trees_embedding:min_samples_leaf,feature_preprocessor:random_trees_embedding:min_samples_split,feature_preprocessor:random_trees_embedding:min_weight_fraction_leaf,feature_preprocessor:random_trees_embedding:n_estimators,classifi
er:libsvm_svc:coef0,data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max,data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min
0,0,1,0,1,dd4face2445123e133e58f0c70bd50bb,1.641744,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.40401,0.361561,0.277395,0.294746,0.425442,0.420798,0.361561,0.40401,0.401949,0.359728,0.27553,0.29311,0.433182,0.426719,0.359728,0.401949,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
1,0,1,0,2,dd4face2445123e133e58f0c70bd50bb,1.452913,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.403739,0.361118,0.276806,0.294268,0.426989,0.421847,0.361118,0.403739,0.404385,0.36369,0.280876,0.297468,0.420083,0.417888,0.36369,0.404385,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
2,0,1,0,3,dd4face2445123e133e58f0c70bd50bb,1.444647,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.403414,0.360892,0.276803,0.294349,0.427842,0.423745,0.360892,0.403414,0.407317,0.365762,0.281163,0.296928,0.41825,0.406661,0.365762,0.407317,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
3,0,1,0,4,dd4face2445123e133e58f0c70bd50bb,1.478803,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.402059,0.35968,0.275354,0.29275,0.423583,0.418627,0.35968,0.402059,0.419512,0.376656,0.293849,0.311071,0.44865,0.445183,0.376656,0.419512,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
4,0,1,0,5,dd4face2445123e133e58f0c70bd50bb,1.589758,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.403549,0.361348,0.276687,0.293835,0.424753,0.420259,0.361348,0.403549,0.406098,0.36164,0.281162,0.300606,0.443151,0.434608,0.36164,0.406098,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
5,0,1,0,6,dd4face2445123e133e58f0c70bd50bb,1.536392,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.4068,0.364366,0.28109,0.298371,0.436902,0.431943,0.364366,0.4068,0.376829,0.334468,0.242054,0.260272,0.308346,0.304209,0.334468,0.376829,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
6,0,1,0,7,dd4face2445123e133e58f0c70bd50bb,1.477314,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.405581,0.363661,0.28104,0.297875,0.430524,0.4252,0.363661,0.405581,0.387805,0.340794,0.241103,0.263685,0.373348,0.375049,0.340794,0.387805,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
7,0,1,0,8,dd4face2445123e133e58f0c70bd50bb,1.465475,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.401382,0.358847,0.273876,0.291524,0.416423,0.412009,0.358847,0.401382,0.42561,0.384155,0.307062,0.322018,0.514784,0.507062,0.384155,0.42561,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
8,0,1,0,9,dd4face2445123e133e58f0c70bd50bb,1.461009,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.402601,0.360115,0.275834,0.293371,0.422369,0.417624,0.360115,0.402601,0.414634,0.372742,0.289428,0.305384,0.465893,0.460998,0.372742,0.414634,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,
9,0,1,0,10,dd4face2445123e133e58f0c70bd50bb,1.61422,1680579000.0,1680579000.0,StatusType.SUCCESS,1,0.0,weighting,passive_aggressive,feature_type,polynomial,0.002433,False,True,hinge,0.010075,encoding,minority_coalescer,median,none,2.0,False,False,0.067128,0.404904,0.362183,0.277282,0.294811,0.426498,0.421185,0.362183,0.404904,0.393902,0.354145,0.276435,0.29242,0.429221,0.428174,0.354145,0.393902,0.401244,0.357368,0.269184,0.288379,0.412833,0.410519,0.357368,0.401244,,,,,,,,,,,,,,,,


In [69]:
# Gotcha: `in` on a pandas Series tests the index labels, not the values, so this
# evaluates to False although the hash occurs in the config_hash column.
a = "dd4face2445123e133e58f0c70bd50bb"
a in d["config_hash"]

False

In [73]:
# Testing against the column's distinct values compares values; this is the
# check generate_pipelines uses to skip configurations that are already stored.
a in d["config_hash"].unique()

True

In [20]:
import numpy as np
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

from ConfigSpace.configuration_space import Configuration
from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, f1_weighted, precision_macro, precision_weighted, recall_macro, recall_weighted 


import autosklearn.classification

def metric_list():
    """Return the eight auto-sklearn scorers passed as ``scoring_functions``.

    NOTE(review): this repeats the ``metric_list`` defined in the first cell of
    the notebook -- consider keeping a single definition.
    """
    return [
        accuracy, balanced_accuracy,
        f1_macro, f1_weighted,
        precision_macro, precision_weighted,
        recall_macro, recall_weighted,
    ]


# Load OpenML dataset 3 as DataFrames and split it 50/50 (seeded).
X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.5, random_state=3
)


# Estimator used only to sample and fit individual configurations:
# metalearning warm-start and ensembling are switched off, and the temp folder
# is kept after the run.
estimator = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=120,
    initial_configurations_via_metalearning=0,
    ensemble_class= None,
    memory_limit=4000,
#     resampling_strategy="cv",
    scoring_functions=metric_list(),
    seed=0,
    delete_tmp_folder_after_terminate=False,
    tmp_folder="tmp_folder/"

)

# Draw one configuration from the space; the space is not seeded in this cell.
cs = estimator.get_configuration_space(X_train, y_train, dataset_name="kr-vs-kp")
config = cs.sample_configuration()

# Check that the sampled configuration complies with the configuration space
config.is_valid_configuration()



In [21]:
# Fit the sampled configuration on the training half; the test half is passed as
# X_test/y_test. Returns the fitted pipeline plus the RunInfo/RunValue records.
pipeline, run_info, run_value = estimator.fit_pipeline(
    X=X_train,
    y=y_train,
    dataset_name="kr-vs-kp",
    config=config,
    X_test=X_test,
    y_test=y_test
)

In [22]:
# This object complies with the scikit-learn Pipeline API:
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# Its steps are data_preprocessor, balancing, feature_preprocessor and classifier.
print(pipeline.named_steps)

{'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f25fbb052d0>, 'balancing': Balancing(random_state=0, strategy='weighting'), 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f25ecda5ae0>, 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f25ecda6770>}


In [23]:
# fit_pipeline also returns a RunInfo named tuple: the configuration that was run
# together with fields such as seed, budget and cutoff.
print(run_info)

RunInfo(config=Configuration(values={
  'balancing:strategy': 'weighting',
  'classifier:__choice__': 'random_forest',
  'classifier:random_forest:bootstrap': 'True',
  'classifier:random_forest:criterion': 'gini',
  'classifier:random_forest:max_depth': 'None',
  'classifier:random_forest:max_features': 0.5788062783424068,
  'classifier:random_forest:max_leaf_nodes': 'None',
  'classifier:random_forest:min_impurity_decrease': 0.0,
  'classifier:random_forest:min_samples_leaf': 7,
  'classifier:random_forest:min_samples_split': 13,
  'classifier:random_forest:min_weight_fraction_leaf': 0.0,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'no_encoding',
  'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'no_coalescense',
  'feature_preprocessor:__choice__': 'pca',
  'feature_preprocessor:pca:keep_variance': 0.7315729299856697,
  'feature_preprocessor:pca

['__abstractmethods__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_keys',
 '_num_hyperparameters',
 '_populate_values',
 '_query_values',
 '_values',
 '_vector',
 'allow_inactive_with_values',
 'config_id',
 'configuration_space',
 'get',
 'get_array',
 'get_dictionary',
 'is_valid_configuration',
 'items',
 'keys',
 'origin',
 'values']

In [None]:
import pandas as pd
# Scratch cell (never executed): builds an empty DataFrame and discards it.
pd.DataFrame()

In [52]:
# List RunValue's attributes (cost, time, status, starttime, endtime, additional_info).
dir(run_value)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_field_defaults',
 '_fields',
 '_make',
 '_replace',
 'additional_info',
 'cost',
 'count',
 'endtime',
 'index',
 'starttime',
 'status',
 'time']

In [17]:
# fit_pipeline also returns a RunValue named tuple: cost, run time, status,
# start/end timestamps and additional_info with one entry per scoring function.
print(run_value)



RunValue(cost=0.07007575757575757, time=0.6700279712677002, status=<StatusType.SUCCESS: 1>, starttime=1678635367.5820646, endtime=1678635368.2657409, additional_info={'accuracy': 0.07007575757575757, 'balanced_accuracy': 0.07256984928963162, 'f1_macro': 0.07073570784588479, 'f1_weighted': 0.07034750180463356, 'precision_macro': 0.0658771929824562, 'precision_weighted': 0.06762659489633172, 'recall_macro': 0.07256984928963162, 'recall_weighted': 0.07007575757575757, 'duration': 0.6190192699432373, 'num_run': 3, 'train_loss': 0.04299065420560744, 'configuration_origin': None})


In [18]:
# We can make sure that our pipeline configuration was honored as follows
print("Passed Configuration:", pipeline.config)
# NOTE(review): the "Random Forest:" label is only accurate when the search space
# is restricted to random_forest; in the recorded run the classifier printed
# here was QuadraticDiscriminantAnalysis.
print("Random Forest:", pipeline.named_steps["classifier"].choice.estimator)

Passed Configuration: Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'qda',
  'classifier:qda:reg_param': 0.08945931211638725,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding',
  'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer',
  'data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.4928964349865436,
  'feature_preprocessor:__choice__': 'no_preprocessing',
})

Random Forest: QuadraticDiscriminantAnalysis(reg_param=0.08945931211638725)


In [None]:
# We can also search for new configurations using the fit() method
# Any configurations found by Auto-Sklearn -- even the ones created using
# fit_pipeline() are stored to disk and can be used for Ensemble Selection
# NOTE(review): this cell was never executed; `cls` is only defined in a later
# cell, and the value returned by fit() is bound to `cs`, the name used
# elsewhere for the configuration space.
cs = cls.fit(X, y, dataset_name="kr-vs-kp")

In [50]:

from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, f1_weighted, precision_macro, precision_weighted, recall_macro, recall_weighted 
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold



def metric_list():
    """Return a new list of the scorers reported for each fitted pipeline.

    NOTE(review): identical in content to the earlier ``metric_list``
    definitions in this notebook -- consider keeping a single copy.
    """
    scorers = (
        accuracy,
        balanced_accuracy,
        f1_macro,
        f1_weighted,
        precision_macro,
        precision_weighted,
        recall_macro,
        recall_weighted,
    )
    return list(scorers)

import numpy as np
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

from ConfigSpace.configuration_space import Configuration

import autosklearn.classification


# Load OpenML dataset 3 as DataFrames and split it 50/50 (seeded).
X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.5, random_state=3
)

cls = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=60,
    memory_limit=4096,
    # We will limit the configuration space only to
    # have RandomForest as a valid model. We recommend enabling all
    # possible models to get a better performance.
    include={"classifier": ["random_forest"]},
    scoring_functions=metric_list(),
    delete_tmp_folder_after_terminate=False,
)

# We will create a configuration that has a user defined
# min_samples_split in the Random Forest. We recommend you to look into
# how the ConfigSpace package works here:
# https://automl.github.io/ConfigSpace/master/
# NOTE(review): the space is built from the full X, y here, while the pipeline
# below is fitted on X_train/y_train only -- confirm this is intended.
cs = cls.get_configuration_space(X, y, dataset_name="kr-vs-kp")
config = cs.sample_configuration()
# Overrides the sampled value by writing to the private `_values` mapping.
config._values["classifier:random_forest:min_samples_split"] = 11

# Make sure that your changed configuration complies with the configuration space
config.is_valid_configuration()

pipeline, run_info, run_value = cls.fit_pipeline(
    X=X_train,
    y=y_train,
    dataset_name="kr-vs-kp",
    config=config,
    X_test=X_test,
    y_test=y_test,
)

# This object complies with Scikit-Learn Pipeline API.
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
print(pipeline.named_steps)

# The fit_pipeline command also returns a named tuple with the pipeline constraints
print(run_info)

# The fit_pipeline command also returns a named tuple with train/test performance
print(run_value)

# We can make sure that our pipeline configuration was honored as follows
print("Passed Configuration:", pipeline.config)
print("Random Forest:", pipeline.named_steps["classifier"].choice.estimator)

# Predictions of the fitted pipeline on the training half (shown as the cell output).
pipeline.predict(X_train)



{'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f139b407220>, 'balancing': Balancing(random_state=1), 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f13c0518220>, 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f139b336350>}
RunInfo(config=Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'random_forest',
  'classifier:random_forest:bootstrap': 'False',
  'classifier:random_forest:criterion': 'gini',
  'classifier:random_forest:max_depth': 'None',
  'classifier:random_forest:max_features': 0.6589866408660057,
  'classifier:random_forest:max_leaf_nodes': 'None',
  'classifier:random_forest:min_impurity_decrease': 0.0,
  'classifier:random_forest:min_samples_leaf': 7,
  'classifier:random_forest:min_samples_split': 11,
  'classifier:random_forest:min_weight_fraction_leaf': 0.0,
  'd

array([0, 0, 0, ..., 1, 1, 1], dtype=int32)

In [63]:
# Seed the configuration space before sampling, then draw 15 configurations
# and display the first one.
cs = cls.get_configuration_space(X_train, y_train, dataset_name="asd")
cs.seed(1)
configs = cs.sample_configuration(15)
configs[0]

Configuration(values={
  'balancing:strategy': 'weighting',
  'classifier:__choice__': 'random_forest',
  'classifier:random_forest:bootstrap': 'False',
  'classifier:random_forest:criterion': 'entropy',
  'classifier:random_forest:max_depth': 'None',
  'classifier:random_forest:max_features': 0.6716540974221343,
  'classifier:random_forest:max_leaf_nodes': 'None',
  'classifier:random_forest:min_impurity_decrease': 0.0,
  'classifier:random_forest:min_samples_leaf': 2,
  'classifier:random_forest:min_samples_split': 20,
  'classifier:random_forest:min_weight_fraction_leaf': 0.0,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'no_encoding',
  'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer',
  'data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.0014550223790754202,
  

In [64]:
# Plain-dict view of the configuration (hyperparameter name -> value); this is
# the form that dict_hash and the result rows consume.
config.get_dictionary()

{'classifier:random_forest:min_samples_split': 11,
 'balancing:strategy': 'none',
 'classifier:__choice__': 'random_forest',
 'data_preprocessor:__choice__': 'feature_type',
 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_classification',
 'classifier:random_forest:bootstrap': 'False',
 'classifier:random_forest:criterion': 'gini',
 'classifier:random_forest:max_depth': 'None',
 'classifier:random_forest:max_features': 0.6589866408660057,
 'classifier:random_forest:max_leaf_nodes': 'None',
 'classifier:random_forest:min_impurity_decrease': 0.0,
 'classifier:random_forest:min_samples_leaf': 7,
 'classifier:random_forest:min_weight_fraction_leaf': 0.0,
 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'no_encoding',
 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer',
 'feature_preprocessor:extra_trees_preproc_for_classification:bootstrap': 'False',
 'feature_preprocessor:ext

In [133]:
# Display the RunValue returned by the last fit_pipeline call.
run_value


RunValue(cost=0.011374407582938395, time=2.291515350341797, status=<StatusType.SUCCESS: 1>, starttime=1678642703.6772435, endtime=1678642705.9952924, additional_info={'accuracy': 0.011374407582938395, 'balanced_accuracy': 0.011650894477573281, 'f1_macro': 0.011403186406969468, 'f1_weighted': 0.011377665563017558, 'precision_macro': 0.011091042806774642, 'precision_weighted': 0.011316774069142443, 'recall_macro': 0.011650894477573281, 'recall_weighted': 0.011374407582938395, 'duration': 2.2287580966949463, 'num_run': 2, 'train_loss': 0.009341429238673538, 'configuration_origin': None})

In [135]:
# List RunValue's attributes to pick the fields stored in the result rows.
dir(run_value)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_field_defaults',
 '_fields',
 '_make',
 '_replace',
 'additional_info',
 'cost',
 'count',
 'endtime',
 'index',
 'starttime',
 'status',
 'time']

In [140]:
# Summary of the last run. `duration` is the run time reported by RunValue and
# `model_id` is the run number stored in additional_info (the previous code had
# these mixed up and put the whole RunValue under `model_id`).
{
 'duration': run_value.time,
 'model_id': run_value.additional_info["num_run"],
 'start_time': run_value.starttime,
 'end_time': run_value.endtime,
 'status': str(run_value.status)
}

{'duration': 2.291515350341797,
 'model_id': 2,
 'start_time': 1678642703.6772435,
 'end_time': 1678642705.9952924,
 'status': 'StatusType.SUCCESS'}

In [41]:
# Attribute listing of RunValue (same inspection as in the earlier cells).
dir(run_value)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_field_defaults',
 '_fields',
 '_make',
 '_replace',
 'additional_info',
 'cost',
 'count',
 'endtime',
 'index',
 'starttime',
 'status',
 'time']

In [171]:

# Scratch cell. NOTE(review): the branch below cannot run because model_type is
# assigned a string just above; it also refers to `self`, which is not defined
# at notebook level.
model_type = "classifier"
if model_type is None:
    raise RuntimeError(f"Unknown `automl_class` {self._get_automl_class()}")

# Raw hyperparameter mapping of the configuration that was run (private attribute).
run_info.config._values



{'classifier:random_forest:min_samples_split': 11,
 'balancing:strategy': 'weighting',
 'classifier:__choice__': 'random_forest',
 'data_preprocessor:__choice__': 'feature_type',
 'feature_preprocessor:__choice__': 'feature_agglomeration',
 'classifier:random_forest:bootstrap': 'False',
 'classifier:random_forest:criterion': 'entropy',
 'classifier:random_forest:max_depth': 'None',
 'classifier:random_forest:max_features': 0.7605427846656938,
 'classifier:random_forest:max_leaf_nodes': 'None',
 'classifier:random_forest:min_impurity_decrease': 0.0,
 'classifier:random_forest:min_samples_leaf': 8,
 'classifier:random_forest:min_weight_fraction_leaf': 0.0,
 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding',
 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer',
 'feature_preprocessor:feature_agglomeration:affinity': 'cosine',
 'feature_preprocessor:feature_agglomeration:li

In [164]:
# List RunInfo's attributes (budget, capped, config, cutoff, instance, seed, ...).
dir(run_info)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_field_defaults',
 '_fields',
 '_make',
 '_replace',
 'budget',
 'capped',
 'config',
 'count',
 'cutoff',
 'index',
 'instance',
 'instance_specific',
 'seed',
 'source_id']

In [43]:
# Merge the run's seed/budget with its hyperparameters into one record.
# dict.update() returns None, so the previous `{...}.update(...)` expression
# displayed nothing; unpacking builds the merged dict as the cell's value.
{
    "seed": run_info.seed,
    "budget": run_info.budget,
    **run_info.config.get_dictionary(),
}



In [189]:
!pip show scikit-learn

Name: scikit-learn
Version: 0.24.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /home/ealcobaca/Projects/dynamic_pipeline_search_space/venv/lib/python3.10/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: auto-sklearn, smac


In [None]:
!pip3 install --upgrade scikit-learn auto-sklearn

Collecting scikit-learn
  Using cached scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)


In [45]:
from jobs.jobs import euler
from util import get_dataset_paths


# Directory where run_euler writes the generated job and requirements files.
TMP = "tmp/euler_job/"
# PBS header and environment setup that run_euler writes to req.txt.
# NOTE(review): the name is spelled "SCIRPT"; run_euler refers to it under this
# spelling, so renaming needs both places changed together.
EULER_SCIRPT = """
#PBS -N p_
#PBS -l select=1:ncpus=10:nodetype=n40:mem=10GB
#PBS -l walltime=12:00:00

cd /lustre/alcobaca/dynamic_pipeline_search_space
source env3.6/bin/activate
module load python/3.6.8-pandas
cd source/generate_pipelines 

# the command

"""

# Time value formatted into every command: it names the "<TIME>sec" result
# directory and is also passed to run.py as an argument.
TIME = "3600"
# Seeds to launch per dataset; range(1) means only seed 0.
SEEDS = range(1)


def run_euler():
    """Generate the cluster job files and submit them through ``euler``.

    Builds one ``run.py`` command line per (dataset, seed) pair for every
    dataset found under the training and testing directories, writes the
    commands to ``TMP + "job.txt"`` and the PBS header ``EULER_SCIRPT`` to
    ``TMP + "req.txt"``, prints both, and hands the two file paths to
    ``euler``.

    Files are written inside ``with`` blocks so the handles are closed even if
    a write fails, and the command list is assembled with a single ``join``.
    """
    cmd = "python3.6 run.py ../../results/pipeline_generation/{0}sec {1} {0} {2}\n"
    cmd_paths = get_dataset_paths("../../datasets/training/") + get_dataset_paths("../../datasets/testing/")

    os.makedirs(TMP, exist_ok=True)

    # One command per dataset path and seed, in path-major order.
    cmd_str = "".join(cmd.format(TIME, path, s) for path in cmd_paths for s in SEEDS)

    req_path = TMP + "req.txt"
    job_path = TMP + "job.txt"

    with open(job_path, "w") as job_file:
        job_file.write(cmd_str)

    with open(req_path, "w") as req_file:
        req_file.write(EULER_SCIRPT)

    print("Command list:")
    print(cmd_str)
    print()
    print("Requirements list:")
    print(EULER_SCIRPT)
    print()
    euler(command_line=job_path, requirements=req_path, sleep_time=1800,
          job_name="ftm")

In [46]:
# Submit the jobs. The recorded run failed with FileNotFoundError for 'qsub',
# i.e. the `qsub` executable was not available where this cell was executed.
run_euler()

Command list:
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_40499.pkl 3600 0
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_871.pkl 3600 0
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_40704.pkl 3600 0
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_23381.pkl 3600 0
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_735.pkl 3600 0
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_41144.pkl 3600 0
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_1538.pkl 3600 0
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_41986.pkl 3600 0
python3.6 run.py ../../results/pipeline_generation/3600sec ../../datasets/training/dataset_40994.pkl 3600 0
python3.6 run.py ..

FileNotFoundError: [Errno 2] No such file or directory: 'qsub'