In [55]:
import pandas as pd
import plotly
import plotly.graph_objects as go
from plotly import subplots
import numpy as np
from config import ASSETS_DIR
import os
from scipy.stats import spearmanr


# Value passed as `scale=` to `fig.write_image` whenever a figure is exported.
IMG_SCALE = 2.5

# Qualitative colour cycle; the plots index it by dataset or model position.
COLORS = plotly.colors.qualitative.Plotly

# Reference ordering of the datasets, used to sort table rows and figure rows.
DATASET_ORDER = [
    "volkert",
    "jasmine",
    "nomao",
    "anneal",
    "kr-vs-kp",
    "sylvine",
    "australian",
    "adult",
    "ldpa",
]

In [56]:
# Cross-validation scores; the 'anneal' and 'ldpa' datasets are left out.
# (An additional "and fs_percent>=0.5 " clause can be appended to the query
# to drop the smallest feature subsets.)
scores_df = (
    pd.read_csv("fs_efficency/scores.csv")
    .query("dataset not in ['anneal', 'ldpa'] ")
)
scores_df.head()

Unnamed: 0,dataset,model,architecture_name,fs_method,fs_percent,fs_n_features,balanced_accuracy_mean,balanced_accuracy_std,balanced_accuracy_sum,accuracy_mean,...,roc_auc_sum,f1_mean,f1_std,f1_sum,precision_mean,precision_std,precision_sum,recall_mean,recall_std,recall_sum
0,adult,mlp/simple,A0,decision_tree,0.4,5,0.55923,0.011023,2.796152,0.774704,...,2.796152,0.867831,0.003999,4.339155,0.783015,0.004897,3.915077,0.973501,0.017234,4.867504
1,adult,mlp/simple,A0,decision_tree,0.5,7,0.721856,0.02476,3.609278,0.826402,...,3.609278,0.889818,0.003944,4.449092,0.85973,0.015438,4.298652,0.922858,0.025048,4.614289
2,adult,mlp/simple,A0,decision_tree,0.6,8,0.721225,0.035442,3.606124,0.824278,...,3.606124,0.88815,0.00635,4.440748,0.86053,0.023391,4.302648,0.919356,0.037475,4.596781
3,adult,mlp/simple,A0,decision_tree,0.7,9,0.742855,0.023515,3.714276,0.827477,...,3.714276,0.88851,0.006807,4.442549,0.872982,0.016502,4.364908,0.90555,0.029202,4.527752
4,adult,mlp/simple,A0,decision_tree,0.8,11,0.753598,0.0164,3.767988,0.831879,...,3.767988,0.890887,0.007124,4.454437,0.878726,0.012858,4.393628,0.904102,0.026838,4.520509


# Adding the best-architecture cases

In [57]:
# For every (dataset, model) pair, locate the row with the highest mean
# balanced accuracy when all features are kept (fs_percent == 1).
best_archs_indices = (
    scores_df.query(
        "fs_percent==1 "
        "and fs_method=='decision_tree' "  # The method is not relevant
        "and model in ['mlp/full', 'mlp/full_nd', 'mlp/full_nd_nn', 'xgboost']"
    )
    .groupby(["dataset", "model"])["balanced_accuracy_mean"]
    .idxmax()
)

best_archs_df = scores_df.loc[best_archs_indices, ["dataset", "model", "architecture_name"]]

# Duplicate every row of the winning architecture under a "<model>/best"
# label. The copies are collected in a list and concatenated once, instead of
# growing `scores_df` inside the loop.
best_arch_frames = []
for ds, m, a in best_archs_df.itertuples(index=False, name=None):
    only_best_arch = scores_df.query("dataset==@ds and model==@m and architecture_name==@a").copy()
    only_best_arch["model"] = only_best_arch["model"] + "/best"
    best_arch_frames.append(only_best_arch)

scores_df = pd.concat([scores_df, *best_arch_frames], axis=0).reset_index(drop=True)
scores_df

Unnamed: 0,dataset,model,architecture_name,fs_method,fs_percent,fs_n_features,balanced_accuracy_mean,balanced_accuracy_std,balanced_accuracy_sum,accuracy_mean,...,roc_auc_sum,f1_mean,f1_std,f1_sum,precision_mean,precision_std,precision_sum,recall_mean,recall_std,recall_sum
0,adult,mlp/simple,A0,decision_tree,0.4,5,0.559230,0.011023,2.796152,0.774704,...,2.796152,0.867831,0.003999,4.339155,0.783015,0.004897,3.915077,0.973501,0.017234,4.867504
1,adult,mlp/simple,A0,decision_tree,0.5,7,0.721856,0.024760,3.609278,0.826402,...,3.609278,0.889818,0.003944,4.449092,0.859730,0.015438,4.298652,0.922858,0.025048,4.614289
2,adult,mlp/simple,A0,decision_tree,0.6,8,0.721225,0.035442,3.606124,0.824278,...,3.606124,0.888150,0.006350,4.440748,0.860530,0.023391,4.302648,0.919356,0.037475,4.596781
3,adult,mlp/simple,A0,decision_tree,0.7,9,0.742855,0.023515,3.714276,0.827477,...,3.714276,0.888510,0.006807,4.442549,0.872982,0.016502,4.364908,0.905550,0.029202,4.527752
4,adult,mlp/simple,A0,decision_tree,0.8,11,0.753598,0.016400,3.767988,0.831879,...,3.767988,0.890887,0.007124,4.454437,0.878726,0.012858,4.393628,0.904102,0.026838,4.520509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5931,volkert,xgboost/best,A3,random_1,1.0,180,0.609670,0.008966,3.048349,0.701337,...,0.000000,,,0.000000,,,0.000000,,,0.000000
5932,volkert,xgboost/best,A3,random_2,1.0,180,0.609670,0.008966,3.048349,0.701337,...,0.000000,,,0.000000,,,0.000000,,,0.000000
5933,volkert,xgboost/best,A3,random_3,1.0,180,0.609670,0.008966,3.048349,0.701337,...,0.000000,,,0.000000,,,0.000000,,,0.000000
5934,volkert,xgboost/best,A3,random_4,1.0,180,0.609670,0.008966,3.048349,0.701337,...,0.000000,,,0.000000,,,0.000000,,,0.000000


In [58]:
# The "mlp/full*" variants (and their "/best" counterparts) are dropped from
# the rest of the analysis.
ignored_models = [
    "mlp/full", "mlp/full/best",
    "mlp/full_nd", "mlp/full_nd/best",
    "mlp/full_nd_nn", "mlp/full_nd_nn/best",
]
keep_mask = ~scores_df["model"].isin(ignored_models)
scores_df = scores_df[keep_mask]
scores_df["model"].unique()

array(['mlp/simple', 'mlp/simple_nd', 'mlp/simple_nd_nn',
       'transformer/cls', 'xgboost', 'xgboost/best'], dtype=object)

In [59]:
# Display names used in tables and figure legends. The nested mapping is in
# the {column: {raw value: label}} form accepted by `DataFrame.replace`.
df_replacements = dict(
    model={
        "transformer/cls": "FT-Transformer",
        "mlp/simple": "MLP + Pruning",
        "mlp/simple_nd": "MLP + Pruning - Dropout",
        "mlp/simple_nd_nn": "MLP + Pruning - Dropout - Non-linearity",
        "xgboost": "XGBoost + HS",
        "xgboost/best": "XGBoost",
    },
    fs_method={
        "decision_tree": "Decision tree",
        "f_classif": "F-score",
        "linear_model": "Linear model",
    },
)

# NN as a proxy

In [60]:
# For each metric, build a wide table with one row per
# (dataset, fs_method, fs_percent) and one column per model, then report the
# Spearman rank correlation of every non-XGBoost model against "xgboost/best".
#
# `full_cos_df` keeps the table of the LAST metric ("balanced_accuracy") once
# the loop ends; the plotting cell that follows reads it.
merge_keys = ["dataset", "fs_method", "fs_percent"]

for metric in ["accuracy", "log_loss", "balanced_accuracy"]:
    per_dataset_frames = []

    for ds in scores_df["dataset"].unique():

        cos_df = None

        for g_d, g in scores_df.query(
                "fs_percent<1 "
                #"and model in ['transformer/cls', 'mlp/simple', 'xgboost/best'] "
                "and dataset==@ds "
            ).groupby(["model", "architecture_name"], as_index=False):

            # Plain "xgboost" is split per architecture, so its columns are
            # disambiguated with the architecture name.
            model_name = "{}_{}".format(*g_d) if g_d[0] == "xgboost" else g_d[0]

            mod_g = g[merge_keys + [f"{metric}_mean"]].rename(
                columns={f"{metric}_mean": model_name}
            )

            # Inner merge: only feature-selection settings scored by every
            # model are kept.
            cos_df = mod_g if cos_df is None else cos_df.merge(mod_g, on=merge_keys)

        # A dataset without any fs_percent<1 row contributes nothing.
        if cos_df is not None:
            per_dataset_frames.append(cos_df)

    # Single concatenation instead of growing the frame dataset by dataset.
    full_cos_df = pd.concat(per_dataset_frames, axis=0)

    print("=======", metric)
    for m in scores_df["model"].unique():
        if "xgboost" in m:
            continue

        print(df_replacements["model"][m], ":", spearmanr(full_cos_df[m], full_cos_df["xgboost/best"]))

MLP + Pruning : SignificanceResult(statistic=0.9505208899715116, pvalue=1.2168784313479212e-171)
MLP + Pruning - Dropout : SignificanceResult(statistic=0.9218466137703211, pvalue=1.5071769145782778e-139)
MLP + Pruning - Dropout - Non-linearity : SignificanceResult(statistic=0.8868903392655567, pvalue=4.738752804788799e-114)
FT-Transformer : SignificanceResult(statistic=0.9454054537129859, pvalue=1.0779245155026683e-164)
MLP + Pruning : SignificanceResult(statistic=0.8322977230334847, pvalue=1.3716293154074175e-87)
MLP + Pruning - Dropout : SignificanceResult(statistic=0.788185920302832, pvalue=2.1364877774301832e-72)
MLP + Pruning - Dropout - Non-linearity : SignificanceResult(statistic=0.8182236410506678, pvalue=2.6920239020927106e-82)
FT-Transformer : SignificanceResult(statistic=0.7191770795149137, pvalue=9.300065720170505e-55)
MLP + Pruning : SignificanceResult(statistic=0.9523156487479825, pvalue=2.9611189837059474e-174)
MLP + Pruning - Dropout : SignificanceResult(statistic=0.928

In [61]:
# One scatter plot per neural model: its score (x) against the "xgboost/best"
# score (y) for every row of `full_cos_df`, coloured by dataset.
for method in ["transformer/cls", "mlp/simple"]:
    fig = go.Figure(
        layout=go.Layout(
            margin=dict(l=20, r=0, b=20, t=0),
            template="plotly_white",
            font=dict(size=18),
        )
    )

    # Dotted y = x reference line.
    diagonal = go.Scatter(
        x=[0.3, 1],
        y=[0.3, 1],
        mode="lines",
        line=dict(color="#777", dash="dot"),
        showlegend=False,
    )
    fig.add_trace(diagonal)

    ordered_datasets = sorted(full_cos_df["dataset"].unique(), key=DATASET_ORDER.index)
    fs_methods = list(reversed(full_cos_df["fs_method"].unique()))

    for ds_i, ds in enumerate(ordered_datasets):
        for fs_i, fs in enumerate(fs_methods):
            points = full_cos_df.query("dataset==@ds and fs_method==@fs")
            marker_symbol = "circle-open" if "random" in fs else "diamond"

            fig.add_trace(
                go.Scatter(
                    x=points[method],
                    y=points["xgboost/best"],
                    name=ds,
                    mode="markers",
                    marker=dict(color=COLORS[ds_i], symbol=marker_symbol, size=7),
                    legendgroup=ds,
                    # Only the first trace of each dataset gets a legend entry.
                    showlegend=(fs_i == 0),
                )
            )

    # Empty traces whose only purpose is to explain the marker symbols.
    for legend_name, legend_symbol in (
        ("Algorithm selection", "diamond"),
        ("Random selection", "circle-open"),
    ):
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                name=legend_name,
                marker=dict(symbol=legend_symbol, color="#777"),
            )
        )

    x_model_label = "FT-Transformer" if "transformer" in method else "MLP + Pruning"
    fig.update_layout(
        yaxis_title="XGBoost balanced accuracy",
        xaxis_title="{} balanced accuracy".format(x_model_label),
        font=dict(size=11),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="left",
            x=-0.02,
        ),
    )

    fig.show()

    fig.write_image(os.path.join(ASSETS_DIR, f"fs_general_relation_{method.split('/')[0]}.png"), scale=IMG_SCALE)

# General results

In [10]:
def num_as_str(x):
    """Return ``x`` formatted as fixed-point text with three decimals."""
    return f"{x:.3f}"

In [14]:
# Best row (highest mean balanced accuracy) for every
# (dataset, model, fs_method, fs_percent, fs_n_features) combination.
indices_best = scores_df.groupby([
    "dataset", "model", "fs_method",
    "fs_percent", "fs_n_features"
    ])["balanced_accuracy_mean"].idxmax()

# Keep only the runs with all features. `.copy()` makes the filtered frame
# independent, so the column assignment below does not raise
# SettingWithCopyWarning.
best_scores_df = scores_df.loc[indices_best].query(
    "fs_percent==1 "
    "and fs_method=='decision_tree' "
    "and model in ['transformer/cls','mlp/simple', 'mlp/simple_nd', 'mlp/simple_nd_nn','xgboost']"
).copy()

# LaTeX cell "$mean \pm std$", in percent. The backslash is escaped
# explicitly: "\p" is an invalid escape sequence in a plain string literal.
best_scores_df["balanced_accuracy"] = (
    "$"
    + (best_scores_df["balanced_accuracy_mean"] * 100).apply(num_as_str)
    + " \\pm "
    + (best_scores_df["balanced_accuracy_std"] * 100).apply(num_as_str)
    + "$"
)

# One row per dataset, one column per (renamed) model.
best_scores_df = (
    best_scores_df.replace(df_replacements)[["dataset", "model", "balanced_accuracy"]]
    .pivot(index="dataset", columns="model")
    .reset_index()
)

# Flatten the two-level column index left by `pivot`.
best_scores_df.columns = ["Dataset"] + [col[1] for col in best_scores_df.columns][1:]

best_scores_df = best_scores_df.sort_values("Dataset", key=lambda x: [DATASET_ORDER.index(i) for i in x])

with open(os.path.join(ASSETS_DIR, "fs_general_results.tex"), "w") as f:
    f.write(best_scores_df.to_latex(index=False))

best_scores_df

Unnamed: 0,Dataset,FT-Transformer,MLP + Pruning,XGBoost + HS,mlp/simple_nd,mlp/simple_nd_nn
6,volkert,$62.373 \pm 0.915$,$59.915 \pm 0.781$,$60.967 \pm 0.897$,$55.964 \pm 1.621$,$43.327 \pm 0.596$
2,jasmine,$81.890 \pm 2.149$,$78.897 \pm 1.585$,$81.707 \pm 1.105$,$79.612 \pm 1.936$,$78.283 \pm 1.461$
4,nomao,$95.159 \pm 0.393$,$93.912 \pm 0.775$,$96.166 \pm 0.290$,$93.272 \pm 0.785$,$92.842 \pm 0.333$
3,kr-vs-kp,$99.839 \pm 0.166$,$99.682 \pm 0.173$,$99.368 \pm 0.205$,$99.719 \pm 0.296$,$97.076 \pm 0.721$
5,sylvine,$94.890 \pm 0.404$,$92.811 \pm 0.559$,$94.427 \pm 0.623$,$92.346 \pm 0.556$,$91.377 \pm 0.993$
1,australian,$87.195 \pm 3.422$,$85.785 \pm 2.228$,$85.660 \pm 2.866$,$84.751 \pm 2.603$,$85.816 \pm 2.426$
0,adult,$79.215 \pm 0.970$,$78.472 \pm 1.413$,$79.784 \pm 0.779$,$78.584 \pm 1.106$,$77.167 \pm 1.087$


# Full models CV scores

In [19]:
# Balanced accuracy (left column) and elapsed time (right column) against the
# percentage of kept features, one subplot row per dataset.
# NOTE: volkert takes much longer than the other datasets.

fs_method = "decision_tree"

datasets = ["volkert", "adult"]
#datasets = ["jasmine", "kr-vs-kp", "sylvine", "australian", "adult"]

datasets = sorted(datasets, key=lambda x: DATASET_ORDER.index(x))

time_columns = ["training_time_sum", "inference_time_sum"]


fig = subplots.make_subplots(
            len(datasets), 2,
            shared_xaxes=True,
            x_title="Features percent",
            row_titles=datasets,
            horizontal_spacing=0.12,
            vertical_spacing=0.02
)

for d_i, dataset in enumerate(datasets):
    # Select dataset and feature selection info
    cos_df = scores_df.query(
        "dataset==@dataset "
        "and fs_method==@fs_method"
    )

    # For each model (iterating the global model list keeps the colour of a
    # model identical across datasets)
    for m_i, m in enumerate(scores_df["model"].unique()):

        plot_df = cos_df.query("model==@m")

        # Total time over all architectures for each feature percentage.
        # Only the two time columns are summed, instead of every column of
        # the frame (which includes string columns).
        times_df = plot_df.groupby(["fs_percent"], as_index=False)[time_columns].sum()

        # Selects the best architecture for each feature selection percentage
        idx_max = plot_df.groupby(["fs_percent"])["balanced_accuracy_mean"].idxmax()
        plot_df = plot_df.loc[idx_max]

        # Adds the best architectures information to the plot
        fig.add_trace(go.Scatter(
            x=(plot_df["fs_percent"] * 100).apply(lambda x: str(int(x)) + "%"),
            y=plot_df["balanced_accuracy_mean"],
            name=df_replacements["model"].get(m, m),
            legendgroup=m,
            line=dict(color=COLORS[m_i]),
            # One legend entry per model, taken from the first row only
            showlegend=d_i == 0,
            error_y=dict(
                type="data",
                array=plot_df["balanced_accuracy_std"],
                visible=True
            )
        ), row=d_i + 1, col=1)

        # For pruned models set the training time to 0
        if m in ["transformer/cls", "mlp/simple"]:
            times_df["training_time_sum"] = 0

        # Rows with all the features (fs_percent == 1) are reset to 0
        times_df.loc[times_df["fs_percent"] == 1, time_columns] = 0

        # Plot the elapsed time (training + inference) for each feature selection
        fig.add_trace(go.Scatter(
            x=(times_df["fs_percent"] * 100).apply(lambda x: str(int(x)) + "%"),
            y=times_df["training_time_sum"] + times_df["inference_time_sum"],
            line=dict(color=COLORS[m_i]),
            legendgroup=m,
            name=df_replacements["model"].get(m, m),
            showlegend=False
        ), row=d_i + 1, col=2)

    # Largest feature percentage on the left, for both columns of the row
    fig.update_xaxes(row=d_i + 1, col=1, autorange="reversed")
    fig.update_xaxes(row=d_i + 1, col=2, autorange="reversed")


# Shared y-axis titles, drawn as rotated annotations in paper coordinates
fig.add_annotation(
    x=-0.08, yanchor="middle",  textangle=-90, text="Balanced accuracy",
    showarrow=False, xref="paper", yref="paper",
    font=dict(size=14)
)

fig.add_annotation(
    x=0.5 - 0.02, yanchor="middle",  textangle=-90, text="Time (sec)",
    showarrow=False, xref="paper", yref="paper",
    font=dict(size=14)
)

fig.update_layout(
    height=len(datasets) * 200,
    margin={"l": 50, "r": 20, "b": 60, "t": 40},
    template="plotly_white",
    font={"size": 11},
    boxmode="group",
    legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="left",
                x=-0.02
            )
)


fig.show()

fig.write_image(os.path.join(ASSETS_DIR, "fs_datasets_comparison.png"), scale=IMG_SCALE)

In [21]:
# Same layout as the "fs_datasets_comparison" figure, for the remaining
# datasets: balanced accuracy (left column) and elapsed time (right column)
# against the percentage of kept features, one subplot row per dataset.

fs_method = "decision_tree"

datasets = ["jasmine", "nomao", "kr-vs-kp", "sylvine", "australian"]

datasets = sorted(datasets, key=lambda x: DATASET_ORDER.index(x))

time_columns = ["training_time_sum", "inference_time_sum"]


fig = subplots.make_subplots(
            len(datasets), 2,
            shared_xaxes=True,
            x_title="Features percent",
            row_titles=datasets,
            horizontal_spacing=0.12,
            vertical_spacing=0.02
)

for d_i, dataset in enumerate(datasets):
    # Select dataset and feature selection info
    cos_df = scores_df.query(
        "dataset==@dataset "
        "and fs_method==@fs_method"
    )

    # For each model (iterating the global model list keeps the colour of a
    # model identical across datasets)
    for m_i, m in enumerate(scores_df["model"].unique()):

        plot_df = cos_df.query("model==@m")

        # Total time over all architectures for each feature percentage.
        # Only the two time columns are summed, instead of every column of
        # the frame (which includes string columns).
        times_df = plot_df.groupby(["fs_percent"], as_index=False)[time_columns].sum()

        # Selects the best architecture for each feature selection percentage
        idx_max = plot_df.groupby(["fs_percent"])["balanced_accuracy_mean"].idxmax()
        plot_df = plot_df.loc[idx_max]

        # Adds the best architectures information to the plot
        fig.add_trace(go.Scatter(
            x=(plot_df["fs_percent"] * 100).apply(lambda x: str(int(x)) + "%"),
            y=plot_df["balanced_accuracy_mean"],
            name=df_replacements["model"].get(m, m),
            legendgroup=m,
            line=dict(color=COLORS[m_i]),
            # One legend entry per model, taken from the first row only
            showlegend=d_i == 0,
            error_y=dict(
                type="data",
                array=plot_df["balanced_accuracy_std"],
                visible=True
            )
        ), row=d_i + 1, col=1)

        # For pruned models set the training time to 0
        if m in ["transformer/cls", "mlp/simple"]:
            times_df["training_time_sum"] = 0

        # Rows with all the features (fs_percent == 1) are reset to 0
        times_df.loc[times_df["fs_percent"] == 1, time_columns] = 0

        # Plot the elapsed time (training + inference) for each feature selection
        fig.add_trace(go.Scatter(
            x=(times_df["fs_percent"] * 100).apply(lambda x: str(int(x)) + "%"),
            y=times_df["training_time_sum"] + times_df["inference_time_sum"],
            line=dict(color=COLORS[m_i]),
            legendgroup=m,
            name=df_replacements["model"].get(m, m),
            showlegend=False
        ), row=d_i + 1, col=2)

    # Largest feature percentage on the left, for both columns of the row
    fig.update_xaxes(row=d_i + 1, col=1, autorange="reversed")
    fig.update_xaxes(row=d_i + 1, col=2, autorange="reversed")


# Shared y-axis titles, drawn as rotated annotations in paper coordinates
fig.add_annotation(
    x=-0.1, yanchor="middle",  textangle=-90, text="Balanced accuracy",
    showarrow=False, xref="paper", yref="paper",
    font=dict(size=14)
)

fig.add_annotation(
    x=0.5 - 0.02, yanchor="middle",  textangle=-90, text="Time (sec)",
    showarrow=False, xref="paper", yref="paper",
    font=dict(size=14)
)

fig.update_layout(
    height=len(datasets) * 200,
    margin={"l": 60, "r": 20, "b": 60, "t": 40},
    template="plotly_white",
    font={"size": 11},
    boxmode="group",
    legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="left",
                x=-0.02
            )
)


fig.show()

fig.write_image(os.path.join(ASSETS_DIR, "fs_datasets_comparison_app.png"), scale=IMG_SCALE)

# Speed information

In [22]:
# Builds a LaTeX table with, per dataset and model, the range (min - max over
# the feature percentages) of the elapsed time relative to "mlp/simple".
fs_method = "decision_tree"

time_columns = ["training_time_sum", "inference_time_sum"]

cos_df = scores_df.query("fs_method==@fs_method")

# Only the two time columns are summed, instead of every column of the frame
# (which includes string columns).
times_df = cos_df.groupby(["dataset", "model", "fs_percent"], as_index=False)[time_columns].sum()

# Pruned models are charged no training time
times_df.loc[times_df["model"].isin(["transformer/cls", "mlp/simple"]), "training_time_sum"] = 0
# Rows with all the features (fs_percent == 1) are reset to 0
times_df.loc[times_df["fs_percent"] == 1, time_columns] = 0

times_df["total_time"] = times_df["training_time_sum"] + times_df["inference_time_sum"]

# Attach the "mlp/simple" total time of the same (dataset, fs_percent) as baseline
times_df = times_df.merge(
    times_df.query("model == 'mlp/simple'")[["dataset", "fs_percent", "total_time"]],
    on=["dataset", "fs_percent"],
    suffixes=("", "_base")
)
# fs_percent == 1 rows give 0 / 0 = NaN, which min/max skip below
times_df["time_ratio"] = times_df["total_time"] / times_df["total_time_base"]

# Named aggregation yields flat "time_ratio_min" / "time_ratio_max" columns
# directly, with no multi-level column index to flatten afterwards.
times_df = times_df.groupby(["dataset", "model"], as_index=False).agg(
    time_ratio_min=("time_ratio", "min"),
    time_ratio_max=("time_ratio", "max"),
)

times_df["elapsed_time"] = (
    "$\\times "
    + times_df["time_ratio_min"].map("{0:.1f}".format)
    + " - \\times "
    + times_df["time_ratio_max"].map("{0:.1f}".format)
    + "$"
)
times_df = times_df[["dataset", "model", "elapsed_time"]]

# One row per dataset, one column per (renamed) model
times_df = times_df.replace(df_replacements).pivot(index="dataset", columns="model").reset_index()
times_df.columns = ["Dataset"] + [col[1] for col in times_df.columns][1:]
times_df = times_df.sort_values("Dataset", key=lambda x: [DATASET_ORDER.index(i) for i in x])

times_df = times_df[["Dataset", "FT-Transformer", "XGBoost + HS", "XGBoost"]]

with open(os.path.join(ASSETS_DIR, "fs_times_results.tex"), "w") as f:
    f.write(times_df.to_latex(index=False))

times_df

Unnamed: 0,Dataset,FT-Transformer,XGBoost + HS,XGBoost
6,volkert,$\times 7.6 - \times 18.2$,$\times 905.6 - \times 1533.5$,$\times 366.8 - \times 621.4$
2,jasmine,$\times 3.4 - \times 7.3$,$\times 25.5 - \times 36.8$,$\times 7.7 - \times 10.7$
4,nomao,$\times 3.5 - \times 7.2$,$\times 23.0 - \times 41.3$,$\times 7.7 - \times 13.3$
3,kr-vs-kp,$\times 4.4 - \times 5.0$,$\times 23.8 - \times 29.5$,$\times 5.2 - \times 6.1$
5,sylvine,$\times 5.0 - \times 6.1$,$\times 21.6 - \times 30.8$,$\times 5.8 - \times 8.3$
1,australian,$\times 4.1 - \times 4.5$,$\times 26.0 - \times 30.6$,$\times 5.3 - \times 7.0$
0,adult,$\times 5.1 - \times 5.4$,$\times 4.9 - \times 8.2$,$\times 1.2 - \times 1.5$


# Rough degradation information

In [39]:
# Builds a LaTeX table with the mean and std (over the feature percentages) of
# the relative balanced-accuracy loss with respect to the all-features run.
fs_method = "decision_tree"


# Best row (highest mean balanced accuracy) for every
# (dataset, model, fs_method, fs_percent, fs_n_features) combination.
indices_best = scores_df.groupby([
    "dataset", "model", "fs_method",
    "fs_percent", "fs_n_features"
    ])["balanced_accuracy_mean"].idxmax()

# Reference score per (dataset, model): the best score with all the features
baseline_df = (
    scores_df.loc[indices_best]
    .query("fs_percent==1")
    .groupby(["dataset", "model"], as_index=False)["balanced_accuracy_mean"]
    .max()
    .rename(columns={"balanced_accuracy_mean": "balanced_accuracy_mean_max"})
)

best_scores_df = scores_df.loc[indices_best].merge(
    baseline_df,
    how="left",
    on=["dataset", "model"]
    )

# Relative loss; negative when feature selection improved the score
best_scores_df["degradation"] = 1 - best_scores_df["balanced_accuracy_mean"] / best_scores_df["balanced_accuracy_mean_max"]

# Named aggregation yields flat "degradation_mean" / "degradation_std" columns
# directly, with no multi-level column index to flatten afterwards.
best_scores_df = (
    best_scores_df.query("fs_method==@fs_method")
    .groupby(["dataset", "model"], as_index=False)
    .agg(
        degradation_mean=("degradation", "mean"),
        degradation_std=("degradation", "std"),
    )
)

# LaTeX cell "$mean \pm std$", in percent. The backslash is escaped
# explicitly: "\p" is an invalid escape sequence in a plain string literal.
best_scores_df["degradation"] = (
    "$"
    + (best_scores_df["degradation_mean"] * 100).apply(num_as_str)
    + " \\pm "
    + (best_scores_df["degradation_std"] * 100).apply(num_as_str)
    + "$"
)
best_scores_df = best_scores_df[["dataset", "model", "degradation"]]

# One row per dataset, one column per (renamed) model
best_scores_df = best_scores_df.replace(df_replacements).pivot(index="dataset", columns="model").reset_index()
best_scores_df.columns = ["Dataset"] + [col[1] for col in best_scores_df.columns][1:]
best_scores_df = best_scores_df.sort_values("Dataset", key=lambda x: [DATASET_ORDER.index(i) for i in x])

with open(os.path.join(ASSETS_DIR, "fs_degradation_results.tex"), "w") as f:
    f.write(best_scores_df.to_latex(index=False))

best_scores_df

Unnamed: 0,Dataset,FT-Transformer,MLP + Pruning,XGBoost,XGBoost + HS
6,volkert,$6.330 \pm 7.541$,$9.604 \pm 11.250$,$0.553 \pm 0.504$,$0.525 \pm 0.499$
2,jasmine,$2.418 \pm 1.660$,$0.872 \pm 0.681$,$1.065 \pm 0.695$,$0.608 \pm 0.416$
4,nomao,$5.438 \pm 4.538$,$3.865 \pm 3.422$,$0.206 \pm 0.139$,$0.125 \pm 0.131$
3,kr-vs-kp,$2.439 \pm 2.432$,$1.065 \pm 0.804$,$0.436 \pm 0.544$,$0.376 \pm 0.547$
5,sylvine,$0.540 \pm 0.287$,$0.166 \pm 0.197$,$-0.292 \pm 0.266$,$-0.392 \pm 0.318$
1,australian,$1.790 \pm 2.168$,$1.259 \pm 1.887$,$1.576 \pm 2.504$,$1.570 \pm 2.493$
0,adult,$5.426 \pm 5.646$,$8.303 \pm 9.424$,$5.834 \pm 4.383$,$5.820 \pm 4.348$


# Degradation per feature selector

In [40]:
# Box plot of the relative balanced-accuracy loss per feature percentage,
# one box group per (non-random) feature selection method.

# Best architecture for every feature selection setting
group_keys = ["dataset", "model", "fs_method", "fs_percent", "fs_n_features"]
indices_best = scores_df.groupby(group_keys)["balanced_accuracy_mean"].idxmax()
best_rows = scores_df.loc[indices_best]

# Reference score per (dataset, model): the best score with all the features
full_feature_baseline = (
    best_rows.query("fs_percent==1")
    .groupby(["dataset", "model"], as_index=False)["balanced_accuracy_mean"]
    .max()
)
full_feature_baseline.columns = ["dataset", "model", "balanced_accuracy_mean_max"]

best_scores_df = best_rows.merge(full_feature_baseline, how="left", on=["dataset", "model"])
best_scores_df["degradation"] = (
    1 - best_scores_df["balanced_accuracy_mean"] / best_scores_df["balanced_accuracy_mean_max"]
)
best_scores_df = best_scores_df.replace(df_replacements)

fig = go.Figure(
    layout=go.Layout(
        margin=dict(l=70, r=0, b=70, t=0),
        template="plotly_white",
        font=dict(size=18),
    )
)

for fs_m in best_scores_df["fs_method"].unique():
    # Random selections are left out of this figure
    if not fs_m.startswith("random"):
        plot_df = best_scores_df.query("fs_method==@fs_m")[["fs_percent", "degradation"]]
        percent_labels = (plot_df["fs_percent"] * 100).apply(lambda x: str(int(x)) + "%")
        fig.add_trace(go.Box(x=percent_labels, y=(plot_df["degradation"] * 100), name=fs_m))

# Fixed ticks at 0%, 10%, ..., 40%
tick_positions = np.arange(0, 50, 10)

fig.update_layout(
    yaxis_title="Degradation percentage",
    xaxis_title="Features percentage",
    xaxis=dict(autorange="reversed"),
    yaxis=dict(
        tickmode="array",
        tickvals=tick_positions,
        ticktext=[f"{i}%" for i in tick_positions],
    ),
    boxmode="group",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.show()
fig.write_image(os.path.join(ASSETS_DIR, "fs_degradation_ratio.png"), scale=IMG_SCALE)

# Test results

In [83]:
# Test-set scores; the 'anneal' and 'ldpa' datasets are left out.
# (An additional "and fs_percent>=0.5 " clause can be appended to the query
# to drop the smallest feature subsets.)
test_scores_df = (
    pd.read_csv("fs_efficency/test_scores.csv")
    .query("dataset not in ['anneal', 'ldpa'] ")
)
test_scores_df.head()

Unnamed: 0,dataset,model,architecture_name,fs_method,fs_percent,balanced_accuracy,accuracy,log_loss,roc_auc,f1,precision,recall,training_time,inference_time,fs_n_features
0,adult,mlp/full,A0,decision_tree,1.0,0.787742,0.850445,0.338507,0.787742,0.902503,0.898247,0.906799,133.980073,0.194351,14
1,adult,mlp/full,A0,f_classif,1.0,0.787742,0.850445,0.338507,0.787742,0.902503,0.898247,0.906799,133.980073,0.194351,14
2,adult,mlp/full,A0,linear_model,1.0,0.787742,0.850445,0.338507,0.787742,0.902503,0.898247,0.906799,133.980073,0.194351,14
3,adult,mlp/full,A0,random_1,1.0,0.787742,0.850445,0.338507,0.787742,0.902503,0.898247,0.906799,133.980073,0.194351,14
4,adult,mlp/full,A0,random_2,1.0,0.787742,0.850445,0.338507,0.787742,0.902503,0.898247,0.906799,133.980073,0.194351,14


In [84]:
# Models dropped from the test-set analysis.
#
# "xgboost/best" must NOT be listed here: the test tables below select
# model 'xgboost/best' and the correlation cell reads the "xgboost/best"
# column, so filtering it out breaks them on a fresh run. Plain "xgboost" is
# excluded instead, which matches the model list recorded in this cell's output.
ignored_models = [
    "mlp/full",
    "mlp/full/best",
    "mlp/full_nd",
    "mlp/full_nd/best",
    "mlp/full_nd_nn",
    "mlp/full_nd_nn/best",
    "xgboost"
]
test_scores_df = test_scores_df.query("model not in @ignored_models")
test_scores_df["model"].unique()

array(['mlp/simple', 'mlp/simple_nd', 'mlp/simple_nd_nn',
       'transformer/cls', 'xgboost/best'], dtype=object)

## General results

In [91]:
# Test-set balanced accuracy (in percent) with all the features, one row per
# dataset and one column per model, exported as a LaTeX table.
indices_best = test_scores_df.groupby([
    "dataset", "model", "fs_method",
    "fs_percent", "fs_n_features"
    ])["balanced_accuracy"].idxmax()

# `.copy()` makes the filtered frame independent, so the column overwrite
# below does not raise SettingWithCopyWarning.
best_scores_df = test_scores_df.loc[indices_best].query(
    "fs_percent==1 "
    "and fs_method=='decision_tree' "
    "and model in ['transformer/cls','mlp/simple','xgboost/best']"
).copy()
best_scores_df["balanced_accuracy"] = (best_scores_df["balanced_accuracy"] * 100).apply(num_as_str)

# One row per dataset, one column per (renamed) model
best_scores_df = (
    best_scores_df.replace(df_replacements)[["dataset", "model", "balanced_accuracy"]]
    .pivot(index="dataset", columns="model")
    .reset_index()
)

# Flatten the two-level column index left by `pivot`
best_scores_df.columns = ["Dataset"] + [col[1] for col in best_scores_df.columns][1:]

best_scores_df = best_scores_df.sort_values("Dataset", key=lambda x: [DATASET_ORDER.index(i) for i in x])

with open(os.path.join(ASSETS_DIR, "fs_test_general_results.tex"), "w") as f:
    f.write(best_scores_df.to_latex(index=False))

best_scores_df



Unnamed: 0,Dataset,FT-Transformer,MLP + Pruning,XGBoost
6,volkert,63.578,60.139,59.494
2,jasmine,80.477,77.255,80.56
4,nomao,94.813,94.01,96.217
3,kr-vs-kp,99.685,99.211,97.348
5,sylvine,94.123,92.955,94.689
1,australian,85.448,86.129,89.067
0,adult,79.187,78.774,76.816


## Correlations

In [88]:
# Test-set counterpart of the cross-validation correlation analysis: for each
# metric, build a wide table with one row per (dataset, fs_method, fs_percent)
# and one column per model, then report the Spearman rank correlation of
# every non-XGBoost model against "xgboost/best".
merge_keys = ["dataset", "fs_method", "fs_percent"]

for metric in ["accuracy", "log_loss", "balanced_accuracy"]:
    per_dataset_frames = []

    for ds in test_scores_df["dataset"].unique():

        cos_df = None

        for g_d, g in test_scores_df.query(
                "fs_percent<1 "
                #"and model in ['transformer/cls', 'mlp/simple', 'xgboost/best'] "
                "and dataset==@ds "
            ).groupby(["model", "architecture_name"], as_index=False):

            # Plain "xgboost" is split per architecture, so its columns are
            # disambiguated with the architecture name.
            model_name = "{}_{}".format(*g_d) if g_d[0] == "xgboost" else g_d[0]

            mod_g = g[merge_keys + [f"{metric}"]].rename(columns={f"{metric}": model_name})

            # Inner merge: only feature-selection settings scored by every
            # model are kept.
            cos_df = mod_g if cos_df is None else cos_df.merge(mod_g, on=merge_keys)

        # A dataset without any fs_percent<1 row contributes nothing.
        if cos_df is not None:
            per_dataset_frames.append(cos_df)

    # Single concatenation instead of growing the frame dataset by dataset.
    full_cos_df = pd.concat(per_dataset_frames, axis=0)

    print("=======", metric)
    # Iterate the models of the TEST frame: the columns of `full_cos_df` come
    # from `test_scores_df`, not from the cross-validation `scores_df`.
    for m in test_scores_df["model"].unique():
        if "xgboost" in m:
            continue

        print(df_replacements["model"][m], ":", spearmanr(full_cos_df[m], full_cos_df["xgboost/best"]))

MLP + Pruning : SignificanceResult(statistic=0.951747870176691, pvalue=2.0375076925918082e-173)
MLP + Pruning - Dropout : SignificanceResult(statistic=0.8205412431435823, pvalue=3.8958383930520075e-83)
MLP + Pruning - Dropout - Non-linearity : SignificanceResult(statistic=0.8053972100220989, pvalue=7.388900857705384e-78)
FT-Transformer : SignificanceResult(statistic=0.9509329411674544, pvalue=3.117888819501114e-172)
MLP + Pruning : SignificanceResult(statistic=0.8225677641106275, pvalue=7.023628516072935e-84)
MLP + Pruning - Dropout : SignificanceResult(statistic=0.7018033202808204, pvalue=3.9449088921865913e-51)
MLP + Pruning - Dropout - Non-linearity : SignificanceResult(statistic=0.5129073873368291, pvalue=6.1086215116870984e-24)
FT-Transformer : SignificanceResult(statistic=0.8029657796481484, pvalue=4.706547219187237e-77)
MLP + Pruning : SignificanceResult(statistic=0.9318170585464391, pvalue=4.470788330511702e-149)
MLP + Pruning - Dropout : SignificanceResult(statistic=0.82726359

## Selecting 80% of features

In [94]:
# Test-set balanced accuracy (in percent) when 80% of the features are kept,
# one row per dataset and one column per model, exported as a LaTeX table.
indices_best = test_scores_df.groupby([
    "dataset", "model", "fs_method",
    "fs_percent", "fs_n_features"
    ])["balanced_accuracy"].idxmax()

# `.copy()` makes the filtered frame independent, so the column overwrite
# below does not raise SettingWithCopyWarning.
best_scores_df = test_scores_df.loc[indices_best].query(
    "fs_percent==0.8 "
    "and fs_method=='decision_tree' "
    "and model in ['transformer/cls','mlp/simple','xgboost/best']"
).copy()
best_scores_df["balanced_accuracy"] = (best_scores_df["balanced_accuracy"] * 100).apply(num_as_str)

# One row per dataset, one column per (renamed) model
best_scores_df = (
    best_scores_df.replace(df_replacements)[["dataset", "model", "balanced_accuracy"]]
    .pivot(index="dataset", columns="model")
    .reset_index()
)

# Flatten the two-level column index left by `pivot`
best_scores_df.columns = ["Dataset"] + [col[1] for col in best_scores_df.columns][1:]

best_scores_df = best_scores_df.sort_values("Dataset", key=lambda x: [DATASET_ORDER.index(i) for i in x])

with open(os.path.join(ASSETS_DIR, "fs_test_general_results_0.8.tex"), "w") as f:
    f.write(best_scores_df.to_latex(index=False))

best_scores_df



Unnamed: 0,Dataset,FT-Transformer,MLP + Pruning,XGBoost
6,volkert,63.611,60.178,60.126
2,jasmine,80.606,77.602,80.848
4,nomao,94.189,94.639,96.32
3,kr-vs-kp,99.369,98.429,97.19
5,sylvine,93.931,93.051,95.37
1,australian,87.428,82.063,89.067
0,adult,75.731,74.39,73.676
