In [1]:
from glob import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

# Root folder that is globbed for per-run CSV logs (pattern: <root>/*/*.csv).
results_path = "../results_blackbox"

# Sampling parameters for the convergence curves.
tot_gens  = 1000  # Generations requested from every run log
skip_gens = 10    # How many initial generations to skip
step_size = 50    # How many generations to ignore between two points in the plots

# Which generations are extracted from every log. Exactly one of the
# alternatives below should be active; the other two are kept for reference.
# Note that `skip_gens` and `step_size` only matter for the sub-sampled options.

# Option 1: every generation (slowest, most detailed)
gens = range(0, tot_gens)

# Option 2: linear sub-sampling
# gens = range(skip_gens, tot_gens, step_size)

# Option 3: logarithmic sub-sampling (denser at early generations)
# gens = [int(np.floor(g)) for g in np.logspace(
#                                        np.log10(0.99+skip_gens),
#                                        np.log10(tot_gens),
#                                        num=tot_gens//step_size)]

# Loading overall results
#
# Walks every per-run CSV log under `results_path`, extracts the metrics of the
# requested generations (`gens`) and accumulates one tuple per (run, generation)
# in `all_data`. The tuple layout is:
#   (dataset, model, contextual, random_state, gen, best_size,
#    min/med train error, min/med/std val error, min/med/std val size)

# Variant tags looked for in the file path; a path matching none of them is
# treated as the plain "brush" baseline.
known_variants = ("brush_C_D_TS", "brush_D_TS", "brush_C_D_UCB1", "brush_D_UCB1")

all_data = []
for f in tqdm(glob(results_path + '/*/*.csv')):
    model = next((variant for variant in known_variants if variant in f), "brush")

    # The parsing below implies file names shaped like
    # "<dataset>_<model>_0_<random_state>_...". The meaning of the literal "_0_"
    # field is not visible here -- TODO confirm against the experiment script.
    dataset = f.split('/')[-1].split(f"_{model}")[0]
    random_state = f.split(f"_{model}_0_")[1].split("_")[0]

    data = pd.read_csv(f)

    if 'gen' not in data.columns:
        # Without a generation column nothing can be extracted from this log.
        print(f"{f}: no 'gen' column, skipping file")
        continue

    # Index the log by generation once per file instead of scanning the whole
    # frame for every requested generation. `keep='first'` mirrors taking the
    # first matching row when a generation is (unexpectedly) logged twice.
    rows_by_gen = data.drop_duplicates(subset='gen', keep='first').set_index('gen')

    missing_gens = []
    for gen in gens:
        if gen not in rows_by_gen.index:
            # Skip generations the log does not contain. Falling through here
            # would record the metrics of a previous generation (or of a
            # previous file) under this generation.
            missing_gens.append(gen)
            continue

        row = rows_by_gen.loc[gen]

        all_data.append(( dataset, model, "_C_" in model, random_state, gen, row['best_size'],
                        row['min train error'], row['med train error'],
                        row['min val error'],   row['med val error'], row['std val error'],
                        row['min val size'],    row['med val size'],  row['std val size'] ))

    if missing_gens:
        # One summary line per file keeps the progress bar readable.
        print(f"{f}: {len(missing_gens)} requested generation(s) not found "
              f"(first missing: {missing_gens[0]}), skipped")
            
# Assemble the long-format table: one row per (run, generation).
result_columns = ('dataset', 'model', 'contextual', 'random_state',
                  'gen', 'best ind. size',
                  'min train error', 'med train error',
                  'min val. error', 'med val. error', 'std val. error',
                  'min size', 'med size', 'std size')
df_results = pd.DataFrame(all_data, columns=result_columns)

# Infinite values are turned into NaN so that a single dropna() removes every
# row that carries a non-finite or missing entry.
df_results = df_results.replace([np.inf, -np.inf], np.nan)
df_results = df_results.dropna()

# Raw model tags -> labels used for display. These are substring replacements
# applied in order, so the longer tags must come before the bare 'brush' tag.
display_labels = (('brush_C_D_UCB1', 'C-D-UCB1'),
                  ('brush_C_D_TS',   'C-D-TS'),
                  ('brush_D_UCB1',   'D-UCB1'),
                  ('brush_D_TS',     'D-TS'),
                  ('brush',          'Baseline'))
for raw_tag, label in display_labels:
    df_results['model'] = df_results['model'].apply(lambda x: x.replace(raw_tag, label))

# The table is written to '../results/black-box_convergences.feather' in the next cell.

100%|██████████| 5865/5865 [28:33<00:00,  3.42it/s]


In [2]:
# Sanity check of the assembled table: size and the distinct models / datasets.
print(df_results.shape)
for column in ('model', 'dataset'):
    print(df_results[column].unique())

# reset_index() moves the old row labels (which have gaps after dropna) into an
# 'index' column; the same re-indexed frame is previewed and then saved.
df_flat = df_results.reset_index()
display(df_flat.sample(5))

df_flat.to_feather('../results/black-box_convergences.feather')

(5374021, 14)
['Baseline' 'C-D-TS' 'D-UCB1' 'D-TS' 'C-D-UCB1']
['195_auto_price' 'nikuradse_1' '522_pm10' '607_fri_c4_1000_50'
 '230_machine_cpu' '1029_LEV' '603_fri_c0_250_50' '537_houses'
 '637_fri_c1_500_50' '609_fri_c0_1000_5' '584_fri_c4_500_25'
 '654_fri_c0_500_10' '1193_BNG_lowbwt' '666_rmftsa_ladata'
 '620_fri_c1_1000_25' '601_fri_c1_250_5' '678_visualizing_environmental'
 '665_sleuth_case2002' '595_fri_c0_1000_10' '611_fri_c3_100_5'
 '687_sleuth_ex1605' '561_cpu' '644_fri_c4_250_25' '656_fri_c1_100_5'
 '210_cloud' '1027_ESL' '621_fri_c0_100_10' '690_visualizing_galaxy'
 '1089_USCrime' '582_fri_c1_500_25' '1096_FacultySalaries'
 '579_fri_c0_250_5' '651_fri_c0_100_25' '542_pollution'
 '645_fri_c3_500_50' '659_sleuth_ex1714' '622_fri_c2_1000_50' '503_wind'
 '649_fri_c0_500_5' '617_fri_c3_500_5' '616_fri_c4_500_50'
 '623_fri_c4_1000_10' '589_fri_c2_1000_25' '197_cpu_act'
 '613_fri_c3_250_5' '505_tecator' '583_fri_c1_1000_50' '599_fri_c2_1000_5'
 '628_fri_c3_1000_5' '215_2dplanes' 

Unnamed: 0,index,dataset,model,contextual,random_state,gen,best ind. size,min train error,med train error,min val. error,med val. error,std val. error,min size,med size,std size
5291392,5779046,4544_GeographicalOriginalofMusic,Baseline,False,23654,46,15.0,0.316117,0.486686,0.29216,0.513658,0.091316,1.0,5.0,2.959459
4449256,4874818,547_no2,C-D-UCB1,True,14423,818,43.0,0.370193,0.554523,0.509862,0.624344,0.217564,1.0,8.0,12.859565
3433689,3790891,657_fri_c2_250_10,D-UCB1,False,5390,891,58.0,0.085908,0.142516,0.103888,0.173801,0.423984,1.0,24.0,18.577887
3794340,4182474,201_pol,Baseline,False,16850,474,29.0,0.18336,0.5444,0.182342,0.550618,0.293073,1.0,9.0,8.291055
3562562,3925482,591_fri_c1_100_10,C-D-TS,True,16850,482,29.0,0.07557,0.900668,0.301267,1.211176,0.389,1.0,1.0,8.279879
