In [2]:
from joblib import Parallel, delayed
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor
from utils import *
import os

In [3]:
def append_pairwise_error(df, algorithm_names=None):
    """Add a ``pairwise_error`` column holding the share of mis-ordered algorithm pairs.

    For every ordered pair of distinct algorithms ``(a, b)`` a row counts one
    error when the predicted ordering ``a_pred < b_pred`` disagrees with the
    true ordering ``a_true < b_true``. The per-row error count is divided by
    the number of ordered pairs, so the result lies in ``[0, 1]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``{name}_pred`` and a ``{name}_true`` column for every
        algorithm name. The frame is modified in place.
    algorithm_names : sequence of str, optional
        Algorithms to compare. When omitted, the module-level ``algorithms``
        list is used, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``pairwise_error`` column added.
        The column is NaN when fewer than two distinct algorithms are given,
        because there is no pair to evaluate.
    """
    if algorithm_names is None:
        # Backward-compatible fallback to the notebook-level configuration.
        algorithm_names = algorithms

    df["pairwise_error"] = 0
    n_pairs = 0
    for first in algorithm_names:
        for second in algorithm_names:
            if first == second:
                continue
            predicted_order = df[f'{first}_pred'] < df[f'{second}_pred']
            true_order = df[f'{first}_true'] < df[f'{second}_true']
            df["pairwise_error"] += (predicted_order != true_order).astype(int)
            n_pairs += 1

    if n_pairs == 0:
        # Make the "no pair available" outcome explicit instead of relying on 0/0.
        df["pairwise_error"] = float("nan")
    else:
        df["pairwise_error"] = df["pairwise_error"] / n_pairs
    return df

def transform_to_X_Y(data, features, algorithms):
    """Split ``data`` into a feature matrix and a target matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the feature columns and the target columns.
    features : list of str
        Names of the columns that form the feature matrix.
    algorithms : list of str
        Names of the columns that form the target matrix.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(X, Y)`` where ``X`` is ``data[features]`` with every ``inf``,
        ``-inf`` and ``NaN`` replaced by ``0`` and ``Y`` is
        ``data[algorithms]`` unchanged.
    """
    non_finite_values = [np.inf, -np.inf, np.nan]
    feature_matrix = data[features].replace(non_finite_values, 0)
    target_matrix = data[algorithms]
    return feature_matrix, target_matrix

In [4]:
# Algorithm columns; they are used as the regression targets further down.
algorithms = ['GA', 'PSO', 'DE', 'CMAES', 'ES']

# Key columns shared by the tables in this notebook; they are used for
# grouping and merging. `meta_columns` additionally carries the run column.
meta_columns_no_run = ['problem1', 'problem2', 'instance1', 'instance2', 'alpha', 'dim']
meta_columns = meta_columns_no_run + ['algorithm_run']

In [None]:
# Load the algorithm run results. `load_runs` is not defined in this notebook;
# presumably it comes from `from utils import *` -- TODO confirm.
runs = load_runs()
# Preview the first rows only; never dump the whole frame.
runs.head(3)

In [5]:
# Convert the run results into per-row values for each algorithm column.
# `get_rank` is not defined in this notebook (presumably from `utils`); the
# preview below shows values 1-5 per algorithm, i.e. what looks like a rank
# across the five algorithms -- TODO confirm tie handling in the helper.
rruns = get_rank(runs, algorithms)
rruns.head(3)

Unnamed: 0,GA,PSO,DE,CMAES,ES,algorithm_run,alpha,dim,instance1,instance2,optimum,problem1,problem2
0,2.0,3.0,5.0,1.0,4.0,30,0.0,5,9,9,1e-20,5,14
1,5.0,2.0,4.0,1.0,3.0,30,0.1,5,9,9,1e-20,5,14
2,5.0,3.0,2.0,1.0,4.0,30,0.2,5,9,9,1e-20,5,14


In [6]:
# Average every remaining column per (problem, instance, alpha, dim) key.
# Note that `.mean()` is applied to all non-key columns, so `algorithm_run`
# and `optimum` are averaged as well, not only the algorithm columns.
mean_rruns = (
    rruns
    .groupby(meta_columns_no_run)
    .mean()
    .reset_index()
)
mean_rruns.head(3)

Unnamed: 0,problem1,problem2,instance1,instance2,alpha,dim,GA,PSO,DE,CMAES,ES,algorithm_run,optimum
0,1,1,1,1,0.0,5,5.0,1.833333,1.833333,2.333333,4.0,15.5,1e-20
1,1,1,1,1,0.1,5,4.933333,1.783333,1.783333,2.433333,4.066667,15.5,1e-20
2,1,1,1,1,0.2,5,5.0,1.9,1.9,2.2,4.0,15.5,1e-20


In [10]:
# (rows, columns) of the ELA feature table.
# NOTE(review): in file order this cell comes before the cell that assigns
# `ela` (`ela = load_ela()`), so a fresh top-to-bottom run would fail here
# unless `ela` is provided elsewhere -- consider moving it below the load cell.
ela.shape

(633600, 191)

In [15]:
# Number of missing values per column, counting +/-inf as missing, sorted
# from the most affected column to the least affected one.
# NOTE(review): in file order this cell comes before `ela = load_ela()`;
# consider moving it below the load cell so Restart & Run All works.
missing = (
    ela
    .replace([np.inf, -np.inf], np.nan)
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [16]:
# Every column of the ELA table whose name contains the substring 'limo'.
# NOTE(review): in file order this cell comes before `ela = load_ela()`;
# consider moving it below the load cell so Restart & Run All works.
limo_features = [column for column in ela.columns if 'limo' in column]

In [6]:
# Load the ELA feature table. `load_ela` is not defined in this notebook;
# presumably it comes from `from utils import *` -- TODO confirm.
ela = load_ela()
# Feature columns are the ones whose name contains a dot
# (e.g. 'cm_angle.dist_ctr2best_mean'); key columns such as 'problem1' do not.
features = [x for x in ela.columns if '.' in x]
ela.head(3)

Unnamed: 0,problem1,problem2,instance1,instance2,alpha,dim,algorithm_run,cm_angle.dist_ctr2best_mean,cm_angle.dist_ctr2best_sd,cm_angle.dist_ctr2worst_mean,...,norm_nbc.costs_runtime,norm_pca.expl_var.cov_x,norm_pca.expl_var.cor_x,norm_pca.expl_var.cov_init,norm_pca.expl_var.cor_init,norm_pca.expl_var_PC1.cov_x,norm_pca.expl_var_PC1.cor_x,norm_pca.expl_var_PC1.cov_init,norm_pca.expl_var_PC1.cor_init,norm_pca.costs_runtime
0,10,4,5,5,0.0,5,5,1.593654,0.34426,1.61268,...,2.797899,1.0,1.0,0.833333,0.833333,0.213621,0.213625,0.270874,0.302406,0.104997
1,10,4,5,5,0.0,5,5,1.593654,0.34426,1.61268,...,2.797899,1.0,1.0,0.833333,0.833333,0.213621,0.213625,0.270874,0.302406,0.104997
2,10,4,5,5,0.1,5,5,1.590464,0.333052,1.615398,...,2.63432,1.0,1.0,0.833333,0.833333,0.209181,0.20916,0.254892,0.300868,0.031412


In [8]:
# Attach the ELA features to the ranked runs using the shared key columns.
# NOTE(review): `algorithm_run` exists in both frames but is not a join key
# (it comes back as `algorithm_run_run` / `algorithm_run_ela`), so every run
# row is paired with every ELA row sharing the same key. The outer join also
# keeps keys present in only one frame. Confirm both are intended.
data = rruns.merge(
    ela,
    on=meta_columns_no_run,
    how='outer',
    suffixes=['_run', '_ela'],
)

In [9]:
# Preview the merged table; the run column appears as `algorithm_run_run`
# because the merge suffixed the column that both inputs share.
data.head(3)

Unnamed: 0,GA,PSO,DE,CMAES,ES,algorithm_run_run,alpha,dim,instance1,instance2,...,norm_nbc.costs_runtime,norm_pca.expl_var.cov_x,norm_pca.expl_var.cor_x,norm_pca.expl_var.cov_init,norm_pca.expl_var.cor_init,norm_pca.expl_var_PC1.cov_x,norm_pca.expl_var_PC1.cor_x,norm_pca.expl_var_PC1.cov_init,norm_pca.expl_var_PC1.cor_init,norm_pca.costs_runtime
0,2.0,3.0,5.0,1.0,4.0,30,0.0,5,9,9,...,3.350662,1.0,1.0,0.833333,0.833333,0.215756,0.215764,0.211619,0.277143,0.267816
1,2.0,3.0,5.0,1.0,4.0,30,0.0,5,9,9,...,3.350662,1.0,1.0,0.833333,0.833333,0.215756,0.215764,0.211619,0.277143,0.267816
2,2.0,3.0,5.0,1.0,4.0,30,0.0,5,9,9,...,2.909333,1.0,1.0,0.833333,0.833333,0.213798,0.213823,0.209749,0.281315,0.070789


In [10]:
# A feature group is the part of a feature name before its first dot.
feature_groups = {feature_name.split('.')[0] for feature_name in features}
feature_groups

{'cm_angle',
 'cm_conv',
 'cm_grad',
 'disp',
 'ela_distr',
 'ela_level',
 'ela_meta',
 'ic',
 'limo',
 'nbc',
 'norm_cm_angle',
 'norm_cm_conv',
 'norm_cm_grad',
 'norm_disp',
 'norm_ela_distr',
 'norm_ela_level',
 'norm_ela_meta',
 'norm_ic',
 'norm_limo',
 'norm_nbc',
 'norm_pca',
 'pca'}

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
import itertools
from tqdm import tqdm

# Leave-one-problem-out evaluation.
# For every (feature group, held-out problem, model) combination a model is
# trained to predict the algorithm columns from the selected features, and
# the predictions joined with the mean true values are cached as one parquet
# file. Combinations whose file already exists are skipped, so the cell can
# be interrupted and re-run.

RANDOM_SEED = 42  # fixed so that the random forest results are reproducible

models = [RandomForestRegressor(n_jobs=-1, random_state=RANDOM_SEED), DummyRegressor()]
problem_out_range = range(1, 25)
all_feature_groups = list(feature_groups) + ['all']

# Loop-invariant: the output directory only has to be created once.
directory = 'gecco'
create_directory_if_not_exist(directory)

for feature_group, problem_out, model in tqdm(list(itertools.product(all_feature_groups, problem_out_range, models))):
    if feature_group == "all":
        feature_subset = features
    else:
        # Compare the exact group name (text before the first dot). A plain
        # prefix test would also select features of any other group whose
        # name merely starts with the same characters.
        feature_subset = [x for x in features if x.split('.')[0] == feature_group]

    file = f'{directory}/m_{model.__class__.__name__}__rp_{problem_out}__fg_{feature_group}.parquet'
    if os.path.isfile(file):
        print(f"Skiping file {file}")
        continue

    print(f'Training  {file}')
    # Train on rows where neither problem is the held-out one.
    train = data.query(f"problem1!={problem_out} and problem2 != {problem_out}")
    # NOTE(review): the test set only contains rows with problem2 == held-out
    # and problem1 != held-out; rows with problem1 == held-out are used
    # neither for training nor for testing -- confirm this is intended.
    test = data.query(f"problem1!={problem_out} and problem2 == {problem_out}")

    X_train, Y_train = transform_to_X_Y(train, feature_subset, algorithms)
    model.fit(X_train, Y_train)

    X_test, Y_test = transform_to_X_Y(test, feature_subset, algorithms)
    prediction = model.predict(X_test)

    # Align predictions and keys positionally. `drop=True` discards the old
    # index instead of materialising duplicate 'index' columns that would
    # have to be removed again before writing the parquet file.
    dfpred = pd.concat(
        [
            pd.DataFrame(prediction, columns=algorithms),
            test[meta_columns_no_run].reset_index(drop=True),
        ],
        axis=1,
    )

    # Algorithm columns from `mean_rruns` get the '_true' suffix, the
    # predicted ones the '_pred' suffix expected by `append_pairwise_error`.
    joined_table = mean_rruns.merge(dfpred, on=meta_columns_no_run, suffixes=('_true', '_pred'))
    joined_table_with_error = append_pairwise_error(joined_table)
    joined_table_with_error['metamodel'] = model.__class__.__name__
    joined_table_with_error['removed_function'] = problem_out
    joined_table_with_error['feature_group'] = feature_group
    joined_table_with_error['all_features'] = ','.join(feature_subset)

    # Write to a temporary name and rename afterwards: the skip check above
    # trusts file existence, so an interrupted write must never leave a
    # partial file under the final name.
    partial_file = f'{file}.tmp'
    joined_table_with_error.to_parquet(partial_file)
    os.replace(partial_file, file)

  0%|          | 0/1104 [00:00<?, ?it/s]

Training  gecco/m_RandomForestRegressor__rp_1__fg_norm_cm_conv.parquet


  0%|          | 1/1104 [09:34<175:59:57, 574.43s/it]

Training  gecco/m_DummyRegressor__rp_1__fg_norm_cm_conv.parquet


  0%|          | 2/1104 [09:51<75:24:46, 246.36s/it] 

Training  gecco/m_RandomForestRegressor__rp_2__fg_norm_cm_conv.parquet


  0%|          | 3/1104 [19:43<123:43:54, 404.57s/it]

Training  gecco/m_DummyRegressor__rp_2__fg_norm_cm_conv.parquet


  0%|          | 4/1104 [20:00<76:49:34, 251.43s/it] 

Training  gecco/m_RandomForestRegressor__rp_3__fg_norm_cm_conv.parquet


  0%|          | 5/1104 [29:24<111:11:08, 364.21s/it]

Training  gecco/m_DummyRegressor__rp_3__fg_norm_cm_conv.parquet


  1%|          | 6/1104 [29:41<75:04:09, 246.13s/it] 

Training  gecco/m_RandomForestRegressor__rp_4__fg_norm_cm_conv.parquet
