In [2]:
import pandas as pd
import json
from os.path import join as pjoin
from os import listdir
from statistics import median

In [3]:
# Node names, grouped by pipeline step. Each name is used both as a substring
# to match against per-node "run_time.json" result file names and as the value
# compared with the matching run-history column.

# Compared against the "feature_preprocessor" run-history column.
feature_preprocs = [
    "SKLFastICA",
    "SKLFeatureAgglomeration",
    "SKLKernelPCA",
    "SKLNystroem",
    "SKLPCA",
    "SKLPolynomialFeatures",
    "SKLRandomTreesEmbedding",
    "SKLRBFSampler",
    "SKLSelectFromExtraTrees",
    "SKLSelectFromLinearSVC",
    "SKLSelectPercentile",
    "SKLSelectRates",
]

# Compared against the "scaler" run-history column.
scalers = [
    "SKLMinMaxScaler",
    "SKLNormalizer",
    "SKLPowerTransformer",
    "SKLQuantileTransformer",
    "SKLRobustScaler",
    "SKLStandardScaler",
]

# Compared against the "imputer" run-history column.
imputers = [
    "SKLSimpleImpute",
]



In [4]:
def load_josn(path):
    """Read the file at ``path`` and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Note: the function name (sic, "josn") is kept unchanged because other
    cells in this notebook call it under this name.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

In [5]:

def get_node_average_run_time_and_count(node_name, ds, mean=True):
    """Return the typical recorded run time of a node and how many runs were recorded.

    Looks in the directory ``results/<ds>`` for every file whose name ends
    with ``"run_time.json"`` and contains ``node_name``, and reads the
    ``"total_seconds"`` entry from each of them.

    Parameters
    ----------
    node_name : str
        Node name searched for in the file names. This is a plain substring
        test, so a name that is contained in another node's name would also
        pick up that node's files.
    ds : str
        Dataset name; used as the sub-directory of ``results``.
    mean : bool, default True
        If True the arithmetic mean of the run times is returned, otherwise
        their median.

    Returns
    -------
    tuple
        ``(typical_run_time, number_of_files)``; ``(0, 0)`` when no file
        matches.
    """
    results_dir = pjoin("results", ds)

    matching_files = [
        entry
        for entry in listdir(results_dir)
        if entry.endswith("run_time.json") and node_name in entry
    ]
    durations = [
        load_josn(pjoin(results_dir, entry))["total_seconds"]
        for entry in matching_files
    ]

    n_runs = len(durations)
    if n_runs == 0:
        # Nothing recorded for this node on this dataset.
        return 0, 0

    typical = sum(durations) / n_runs if mean else median(durations)
    return typical, n_runs

In [6]:
def get_saved_run_time_and_count_and_acutal_run_count(node_name, node_type , ds, ds_run_history_df, mean=True):
    """Estimate the time and number of runs saved for one node on one dataset.

    The node's typical run time and its number of recorded runs come from
    ``get_node_average_run_time_and_count``. The number of pipelines that used
    the node is the number of rows of ``ds_run_history_df`` whose
    ``node_type`` column equals ``node_name`` and whose ``"status"`` column
    equals ``"success"``.

    Parameters
    ----------
    node_name : str
        Name of the node.
    node_type : str
        Column of ``ds_run_history_df`` holding the node name for that step.
    ds : str
        Dataset name, forwarded to ``get_node_average_run_time_and_count``.
    ds_run_history_df : pandas.DataFrame
        Run history; must contain the ``node_type`` column and ``"status"``.
    mean : bool, default True
        Forwarded to ``get_node_average_run_time_and_count`` (mean vs median).

    Returns
    -------
    tuple
        ``(saved_run_time, saved_runs, run_count)`` where ``run_count`` is the
        number of recorded runs and the other two are the differences between
        the pipeline usage and the recorded runs.
    """
    typical_run_time, run_count = get_node_average_run_time_and_count(
        node_name, ds, mean=mean
    )

    uses_node = ds_run_history_df[node_type] == node_name
    succeeded = ds_run_history_df["status"] == "success"
    in_pipeline_count = len(ds_run_history_df[uses_node & succeeded])

    # Deliberately written as the difference of two products (not factored),
    # so the floating-point result is computed in exactly this form.
    time_if_always_run = typical_run_time * in_pipeline_count
    time_actually_run = typical_run_time * run_count
    saved_run_time = time_if_always_run - time_actually_run

    saved_runs = in_pipeline_count - run_count

    return saved_run_time, saved_runs, run_count


    

In [7]:
def get_savings_and_actual_run_info(ds_name, mean=True):
    """Aggregate saved time/runs and recorded run counts for one dataset.

    Reads ``run_histories/<ds_name>_train_run_history.csv`` and, for every
    node listed in ``imputers``, ``scalers`` and ``feature_preprocs``, adds up
    the values returned by
    ``get_saved_run_time_and_count_and_acutal_run_count``.

    Parameters
    ----------
    ds_name : str
        Dataset name; selects the run-history CSV and is forwarded to the
        per-node helper.
    mean : bool, default True
        Forwarded to the per-node helper (mean vs median run time).

    Returns
    -------
    tuple of dict
        ``(saving, actual)``. ``saving`` holds, per node group, the summed
        saved time and saved runs plus the overall ``"total_saved_sec"`` and
        ``"total_saved_runs"``. ``actual`` holds, per node group, the summed
        number of recorded runs.
    """
    run_history = pd.read_csv(f"run_histories/{ds_name}_train_run_history.csv")

    # One row per node group:
    # (node names, run-history column, saved-time key, saved-runs key, run-count key)
    # Note that the imputer saved-time key ends in "_time" while the other two
    # end in "_sec"; the keys are part of the returned structure.
    node_groups = (
        (
            imputers,
            "imputer",
            "imputer_total_saved_time",
            "imputer_total_saved_runs",
            "imputer_actual_run_times",
        ),
        (
            scalers,
            "scaler",
            "scaler_total_saved_sec",
            "scaler_total_saved_runs",
            "scaler_actual_run_times",
        ),
        (
            feature_preprocs,
            "feature_preprocessor",
            "feature_preproc_total_saved_sec",
            "feature_preproc_total_saved_runs",
            "feature_preproc_actual_run_times",
        ),
    )

    saving = {}
    actual = {}

    for node_names, history_column, time_key, runs_key, count_key in node_groups:
        saving[time_key] = 0
        saving[runs_key] = 0
        actual[count_key] = 0

        for node_name in node_names:
            saved_time, saved_runs, run_count = get_saved_run_time_and_count_and_acutal_run_count(
                node_name, history_column, ds_name, run_history, mean=mean
            )
            saving[time_key] += saved_time
            saving[runs_key] += saved_runs
            actual[count_key] += run_count

    # Grand totals; operands are added in scaler, imputer, feature-preproc order.
    saving["total_saved_sec"] = (
        saving["scaler_total_saved_sec"]
        + saving["imputer_total_saved_time"]
        + saving["feature_preproc_total_saved_sec"]
    )
    saving["total_saved_runs"] = (
        saving["scaler_total_saved_runs"]
        + saving["imputer_total_saved_runs"]
        + saving["feature_preproc_total_saved_runs"]
    )

    return saving, actual


def get_total_run_sec(ds_name):
    """Return the ``"total_seconds"`` value stored in ``logs/<ds_name>_train_time.json``."""
    train_time_path = f"logs/{ds_name}_train_time.json"
    train_time = load_josn(train_time_path)
    return train_time["total_seconds"]




In [8]:
def make_summary_saving_dataframe(datasets, mean=True):
    """Build a one-row-per-dataset summary of recorded runs and estimated savings.

    For every dataset name the savings and recorded run counts are obtained
    from ``get_savings_and_actual_run_info`` and the total training time from
    ``get_total_run_sec``. One record is collected per dataset and the frame
    is built once from those records.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names. Any iterable is accepted; it is traversed exactly once.
    mean : bool, default True
        Forwarded to ``get_savings_and_actual_run_info`` (mean vs median of
        the per-node run times).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``dataset``, the three ``*_total_runs`` columns,
        ``total_run_sec``, the per-group ``*_total_saved_sec`` /
        ``*_total_saved_runs`` columns, ``total_saved_sec``,
        ``total_saved_runs`` and ``time_saving_percent``, which is
        ``total_saved_sec / total_run_sec * 100``.
    """
    column_order = [
        "dataset",
        "imputer_total_runs",
        "scaler_total_runs",
        "feature_preproc_total_runs",
        "total_run_sec",
        "imputer_total_saved_sec",
        "imputer_total_saved_runs",
        "scaler_total_saved_sec",
        "scaler_total_saved_runs",
        "feature_preproc_total_saved_sec",
        "feature_preproc_total_saved_runs",
        "total_saved_sec",
        "total_saved_runs",
    ]

    records = []
    for ds in datasets:
        ds_savings, ds_actual = get_savings_and_actual_run_info(ds, mean=mean)
        total_run_sec = get_total_run_sec(ds)

        records.append({
            "dataset": ds,
            "imputer_total_runs": ds_actual["imputer_actual_run_times"],
            "scaler_total_runs": ds_actual["scaler_actual_run_times"],
            "feature_preproc_total_runs": ds_actual["feature_preproc_actual_run_times"],
            "total_run_sec": total_run_sec,
            # The imputer saved-time value is stored under a "_time" key upstream.
            "imputer_total_saved_sec": ds_savings["imputer_total_saved_time"],
            "imputer_total_saved_runs": ds_savings["imputer_total_saved_runs"],
            "scaler_total_saved_sec": ds_savings["scaler_total_saved_sec"],
            "scaler_total_saved_runs": ds_savings["scaler_total_saved_runs"],
            "feature_preproc_total_saved_sec": ds_savings["feature_preproc_total_saved_sec"],
            "feature_preproc_total_saved_runs": ds_savings["feature_preproc_total_saved_runs"],
            "total_saved_sec": ds_savings["total_saved_sec"],
            "total_saved_runs": ds_savings["total_saved_runs"],
        })

    # Passing `columns` fixes the column order and keeps the columns present
    # even when no dataset was given.
    df = pd.DataFrame(records, columns=column_order)

    df["time_saving_percent"] = df["total_saved_sec"] / df["total_run_sec"] * 100

    return df




In [9]:
# Dataset names to summarise; each one is passed to
# make_summary_saving_dataframe in the cells that follow.
datasets = [
    "steel-plates-fault",
    "qsar-biodeg",
    "phoneme",
]



In [10]:
# Per-dataset savings summary where each node's typical run time is the mean
# of its recorded run times (mean=True).
saving  = make_summary_saving_dataframe(datasets, mean=True)


# Bare expression so the notebook renders the DataFrame as the cell output.
saving

Unnamed: 0,dataset,imputer_total_runs,scaler_total_runs,feature_preproc_total_runs,total_run_sec,imputer_total_saved_sec,imputer_total_saved_runs,scaler_total_saved_sec,scaler_total_saved_runs,feature_preproc_total_saved_sec,feature_preproc_total_saved_runs,total_saved_sec,total_saved_runs,time_saving_percent
0,steel-plates-fault,1,6,84,145.524329,9.097568,1078,19.275208,923,62.434505,890,90.807282,2891,62.400069
1,qsar-biodeg,1,6,84,139.546442,8.193398,1080,14.328483,920,62.38416,892,84.906041,2892,60.844289
2,phoneme,1,6,77,211.737949,7.01796,1042,15.3496,888,35.89896,861,58.26652,2791,27.518222


In [11]:

# Same summary for the same datasets, displayed directly as the cell output;
# mean=False makes each node's typical run time the median of its recorded runs.
make_summary_saving_dataframe(datasets, mean=False) # median is used instead of mean

Unnamed: 0,dataset,imputer_total_runs,scaler_total_runs,feature_preproc_total_runs,total_run_sec,imputer_total_saved_sec,imputer_total_saved_runs,scaler_total_saved_sec,scaler_total_saved_runs,feature_preproc_total_saved_sec,feature_preproc_total_saved_runs,total_saved_sec,total_saved_runs,time_saving_percent
0,steel-plates-fault,1,6,84,145.524329,9.097568,1078,19.275208,923,63.239068,890,91.611845,2891,62.952941
1,qsar-biodeg,1,6,84,139.546442,8.193398,1080,14.328483,920,64.488561,892,87.010442,2892,62.352319
2,phoneme,1,6,77,211.737949,7.01796,1042,15.3496,888,33.351025,861,55.718585,2791,26.314879
