In [1]:
import pandas as pd
import json
from os.path import join as pjoin
from os import listdir
from statistics import median

In [2]:
# Names of the feature-preprocessor nodes whose savings are summarised below.
feature_preprocs = [
    "SKLFastICA", "SKLFeatureAgglomeration", "SKLKernelPCA",
    "SKLNystroem", "SKLPCA", "SKLPolynomialFeatures",
    "SKLRandomTreesEmbedding", "SKLRBFSampler", "SKLSelectFromExtraTrees",
    "SKLSelectFromLinearSVC", "SKLSelectPercentile", "SKLSelectRates",
]


# Names of the scaler nodes whose savings are summarised below.
scalers = [
    "SKLMinMaxScaler", "SKLNormalizer", "SKLPowerTransformer",
    "SKLQuantileTransformer", "SKLRobustScaler", "SKLStandardScaler",
]

# Names of the imputer nodes whose savings are summarised below.
imputers = ["SKLSimpleImpute"]



In [3]:
def load_josn(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [4]:

def get_node_average_run_time_and_count(node_name, ds, mean=True):
    """Aggregate the recorded run times of one node on one dataset.

    Scans the directory ``results/<ds>`` for files whose name ends with
    ``run_time.json`` and contains ``node_name``, and reads the
    ``"total_seconds"`` entry of each.

    Args:
        node_name: Node identifier that must appear in the file name.
        ds: Dataset name, used as the sub-directory of ``results``.
        mean: When truthy the arithmetic mean is returned, otherwise the median.

    Returns:
        A ``(typical_run_time, number_of_files)`` tuple; ``(0, 0)`` when no
        file matches.
    """
    results_dir = pjoin("results", ds)
    timings = [
        load_josn(pjoin(results_dir, entry))["total_seconds"]
        for entry in listdir(results_dir)
        if entry.endswith("run_time.json") and node_name in entry
    ]

    # Nothing recorded for this node: report zero time and zero runs.
    if not timings:
        return 0, 0

    n_runs = len(timings)
    typical = sum(timings) / n_runs if mean else median(timings)
    return typical, n_runs

In [5]:
def get_saved_run_time_and_count_and_acutal_run_count(node_name, node_type, ds, ds_run_history_df, mean=True):
    """Compare how often a node appears in successful pipelines with how often it ran.

    Args:
        node_name: Node identifier to look up.
        node_type: Column of ``ds_run_history_df`` holding the node names.
        ds: Dataset name, forwarded to ``get_node_average_run_time_and_count``.
        ds_run_history_df: Run-history frame with a ``"status"`` column and a
            ``node_type`` column.
        mean: Forwarded; selects mean (truthy) or median aggregation.

    Returns:
        ``(saved_run_time, saved_runs, run_count)`` where ``run_count`` is the
        number of recorded run-time files and the two "saved" values are the
        differences between pipeline occurrences and recorded runs (in time
        and in count respectively).
    """
    typical_time, executed = get_node_average_run_time_and_count(node_name, ds, mean=mean)

    # Rows of the history that use this node and finished successfully.
    uses_node = ds_run_history_df[node_type] == node_name
    succeeded = ds_run_history_df["status"] == "success"
    occurrences = len(ds_run_history_df[uses_node & succeeded])

    # Kept as a difference of two products so the float result is unchanged.
    time_saved = (typical_time * occurrences) - (typical_time * executed)
    runs_saved = occurrences - executed

    return time_saved, runs_saved, executed


    

In [6]:
def get_savings_and_actual_run_info(ds_name, mean=True):
    """Collect per-category savings and actual run counts for one dataset.

    Reads ``run_histories/<ds_name>_train_run_history.csv`` and, for every
    node in ``imputers``, ``scalers`` and ``feature_preprocs``, accumulates the
    values returned by ``get_saved_run_time_and_count_and_acutal_run_count``.

    Args:
        ds_name: Dataset name used to locate the run-history CSV and results.
        mean: Forwarded; selects mean (truthy) or median aggregation.

    Returns:
        A ``(saving, actual)`` pair of dicts. ``saving`` holds the saved time
        and saved run totals per category plus ``"total_saved_sec"`` and
        ``"total_saved_runs"``; ``actual`` holds the recorded run counts per
        category.
    """
    run_history = pd.read_csv(f"run_histories/{ds_name}_train_run_history.csv")

    # One entry per node category:
    # (node names, run-history column, saved-time key, saved-runs key, run-count key)
    categories = (
        (imputers, "imputer",
         "imputer_total_saved_time", "imputer_total_saved_runs", "imputer_actual_run_times"),
        (scalers, "scaler",
         "scaler_total_saved_sec", "scaler_total_saved_runs", "scaler_actual_run_times"),
        (feature_preprocs, "feature_preprocessor",
         "feature_preproc_total_saved_sec", "feature_preproc_total_saved_runs",
         "feature_preproc_actual_run_times"),
    )

    saving, actual = {}, {}
    for node_names, column, time_key, runs_key, count_key in categories:
        saving[time_key] = 0
        saving[runs_key] = 0
        actual[count_key] = 0
        for node in node_names:
            saved_time, saved_runs, run_count = get_saved_run_time_and_count_and_acutal_run_count(
                node, column, ds_name, run_history, mean=mean)
            saving[time_key] += saved_time
            saving[runs_key] += saved_runs
            actual[count_key] += run_count

    # Grand totals, summed in the order scaler, imputer, feature preprocessor.
    saving["total_saved_sec"] = (
        saving["scaler_total_saved_sec"]
        + saving["imputer_total_saved_time"]
        + saving["feature_preproc_total_saved_sec"]
    )
    saving["total_saved_runs"] = (
        saving["scaler_total_saved_runs"]
        + saving["imputer_total_saved_runs"]
        + saving["feature_preproc_total_saved_runs"]
    )

    return saving, actual


def get_total_run_sec(ds_name):
    """Return the ``"total_seconds"`` entry stored in ``logs/<ds_name>_time.json``."""
    timing_log = load_josn(f"logs/{ds_name}_time.json")
    return timing_log["total_seconds"]




In [7]:
def make_summary_saving_dataframe(datasets, mean=True):
    """Build one summary row per dataset with run counts, savings and saving percentage.

    Args:
        datasets: Dataset names; each is passed to
            ``get_savings_and_actual_run_info`` and ``get_total_run_sec``.
        mean: Forwarded; selects mean (truthy) or median aggregation.

    Returns:
        A DataFrame with a ``"dataset"`` column, the per-category run counts,
        ``"total_run_sec"``, the per-category and total savings, and
        ``"time_saving_percent"`` (``total_saved_sec / total_run_sec * 100``).
    """
    # Keys copied unchanged from the "saving" dict into the summary.
    saving_columns = (
        "imputer_total_saved_time",
        "imputer_total_saved_runs",
        "scaler_total_saved_sec",
        "scaler_total_saved_runs",
        "feature_preproc_total_saved_sec",
        "feature_preproc_total_saved_runs",
        "total_saved_sec",
        "total_saved_runs",
    )
    # Summary column name -> key in the "actual" dict.
    run_count_columns = {
        "imputer_total_runs": "imputer_actual_run_times",
        "scaler_total_runs": "scaler_actual_run_times",
        "feature_preproc_total_runs": "feature_preproc_actual_run_times",
    }

    rows = []
    for ds in datasets:
        ds_savings, ds_actual = get_savings_and_actual_run_info(ds, mean=mean)
        row = {column: ds_savings[column] for column in saving_columns}
        row["total_run_sec"] = get_total_run_sec(ds)
        for column, actual_key in run_count_columns.items():
            row[column] = ds_actual[actual_key]
        rows.append(row)

    # Columns are assigned one by one, in display order, onto an empty frame.
    column_order = (*run_count_columns, "total_run_sec", *saving_columns)

    df = pd.DataFrame()
    df["dataset"] = datasets
    for column in column_order:
        df[column] = [row[column] for row in rows]

    df["time_saving_percent"] = df["total_saved_sec"] / df["total_run_sec"] * 100

    return df




In [8]:



# Datasets included in the summary tables, in display order.
datasets = [
    'qsar-biodeg',
    'wilt',
    'riccardo',
    'bank-marketing',
    'steel-plates-fault',
    'APSFailure',
    'madelon',
    'hill-valley',
    'kc1',
    'higgs',
    'numerai28.6',
    'ozone-level-8hr',
    'sylvine',
    'mozilla4',
    'eeg-eye-state',
    'pc4',
    'phoneme',
]


In [9]:
# Summary table where each node's typical run time is the mean of its recorded runs.
make_summary_saving_dataframe(datasets, mean=True)

Unnamed: 0,dataset,imputer_total_runs,scaler_total_runs,feature_preproc_total_runs,total_run_sec,imputer_total_saved_time,imputer_total_saved_runs,scaler_total_saved_sec,scaler_total_saved_runs,feature_preproc_total_saved_sec,feature_preproc_total_saved_runs,total_saved_sec,total_saved_runs,time_saving_percent
0,qsar-biodeg,1,6,84,140.492949,7.8037,1091,14.017171,930,63.359947,903,85.180818,2924,60.629959
1,wilt,1,6,84,238.858582,6.990265,1058,12.976321,902,65.609767,870,85.576353,2830,35.827205
2,riccardo,1,6,64,44120.571381,778.042702,752,2425.408477,642,14331.375272,601,17534.826451,1995,39.742972
3,bank-marketing,1,6,84,409.998304,7.96237,1091,10.45254,930,333.831069,903,352.245979,2924,85.914009
4,steel-plates-fault,1,6,84,245.878383,8.959239,1086,19.451534,930,56.622371,898,85.033144,2914,34.583416
5,APSFailure,1,6,77,19720.511519,215.294534,938,445.562842,807,5348.354702,761,6009.212078,2506,30.471887
6,madelon,1,6,84,4723.484987,23.083401,1040,274.537391,889,1749.606634,852,2047.227426,2781,43.341461
7,hill-valley,1,6,84,710.084856,8.019339,1039,30.553882,883,203.441298,851,242.014519,2773,34.082479
8,kc1,1,6,84,136.910586,7.917891,1091,14.522085,930,48.334797,903,70.774773,2924,51.694157
9,higgs,1,6,77,16494.55546,39.46774,963,127.48912,821,1304.963966,789,1471.920825,2573,8.923677


In [10]:

# Same summary table, but each node's typical run time is the median of its recorded runs rather than the mean.
make_summary_saving_dataframe(datasets, mean=False)

Unnamed: 0,dataset,imputer_total_runs,scaler_total_runs,feature_preproc_total_runs,total_run_sec,imputer_total_saved_time,imputer_total_saved_runs,scaler_total_saved_sec,scaler_total_saved_runs,feature_preproc_total_saved_sec,feature_preproc_total_saved_runs,total_saved_sec,total_saved_runs,time_saving_percent
0,qsar-biodeg,1,6,84,140.492949,7.8037,1091,14.017171,930,63.664855,903,85.485726,2924,60.846987
1,wilt,1,6,84,238.858582,6.990265,1058,12.976321,902,64.165255,870,84.131841,2830,35.222449
2,riccardo,1,6,64,44120.571381,778.042702,752,2425.408477,642,14357.317871,601,17560.76905,1995,39.801772
3,bank-marketing,1,6,84,409.998304,7.96237,1091,10.45254,930,331.645267,903,350.060178,2924,85.380884
4,steel-plates-fault,1,6,84,245.878383,8.959239,1086,19.451534,930,58.679336,898,87.090109,2914,35.419994
5,APSFailure,1,6,77,19720.511519,215.294534,938,445.562842,807,6106.572258,761,6767.429634,2506,34.316704
6,madelon,1,6,84,4723.484987,23.083401,1040,274.537391,889,1761.695142,852,2059.315934,2781,43.597385
7,hill-valley,1,6,84,710.084856,8.019339,1039,30.553882,883,218.204736,851,256.777957,2773,36.161588
8,kc1,1,6,84,136.910586,7.917891,1091,14.522085,930,51.690212,903,74.130188,2924,54.144964
9,higgs,1,6,77,16494.55546,39.46774,963,127.48912,821,1320.491296,789,1487.448156,2573,9.017813
