In [1]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from config import ASSETS_DIR
import os


# Value passed as `scale` to `fig.write_image` when figures are exported to ASSETS_DIR.
IMG_SCALE = 2.5

In [57]:
# Read the scores table and keep only the rows used in this notebook:
# fs_percent of at least 0.5, with the 'anneal' and 'ldpa' datasets excluded.
scores_df = pd.read_csv("fs_efficency/scores.csv").query(
    "fs_percent>=0.5 and dataset not in ['anneal', 'ldpa']"
)
scores_df.head()

Unnamed: 0,dataset,model,architecture_name,fs_method,fs_percent,fs_n_features,balanced_accuracy_mean,balanced_accuracy_std,balanced_accuracy_sum,accuracy_mean,...,precision_sum,recall_mean,recall_std,recall_sum,inference_time_mean,inference_time_std,inference_time_sum,training_time_mean,training_time_std,training_time_sum
1,adult,mlp/full,A0,decision_tree,0.5,7,0.752513,0.011217,3.762564,0.835334,...,4.383474,0.911746,0.012794,4.558728,0.161841,0.005832,0.809205,85.73974,3.457281,428.698698
2,adult,mlp/full,A0,decision_tree,0.6,8,0.755865,0.010407,3.779323,0.836102,...,4.393496,0.910129,0.012232,4.550646,0.160911,0.003812,0.804556,81.501757,3.161224,407.508784
3,adult,mlp/full,A0,decision_tree,0.7,9,0.755999,0.008595,3.779996,0.836307,...,4.393418,0.910399,0.008993,4.551993,0.163342,0.007492,0.816711,85.261526,9.673928,426.30763
4,adult,mlp/full,A0,decision_tree,0.8,11,0.763006,0.007734,3.815028,0.839634,...,4.412223,0.910331,0.00816,4.551656,0.156638,0.001617,0.783192,78.800536,4.650035,394.002682
5,adult,mlp/full,A0,decision_tree,0.9,12,0.76451,0.006098,3.822549,0.841476,...,4.414594,0.912486,0.008924,4.56243,0.160309,0.004055,0.801546,77.983038,2.77402,389.915189


# Adding the best-architecture cases

In [58]:
# For every (dataset, model) pair of 'mlp/full' and 'xgboost', pick the
# architecture with the highest balanced accuracy at fs_percent == 1 and append
# a copy of all of that architecture's rows under the model name "<model>/best".

# Drop any "/best" rows left by a previous execution of this cell so that
# re-running it does not append the same rows a second time.
base_scores_df = scores_df[~scores_df["model"].str.endswith("/best", na=False)]

best_archs_indices = (
    base_scores_df
    .query(
        "fs_percent==1 "
        "and fs_method=='decision_tree' "  # The method is not relevant
        "and model in ['mlp/full', 'xgboost']"
    )
    .groupby(["dataset", "model"])["balanced_accuracy_mean"]
    .idxmax()
)

best_archs_df = base_scores_df.loc[best_archs_indices, ["dataset", "model", "architecture_name"]]

# Collect the per-pair slices and concatenate once, instead of growing the
# frame with a concat on every iteration.
best_arch_frames = []
for ds, m, a in best_archs_df.itertuples(index=False, name=None):
    only_best_arch = base_scores_df.query(
        "dataset==@ds and model==@m and architecture_name==@a"
    ).copy()
    only_best_arch["model"] = only_best_arch["model"] + "/best"
    best_arch_frames.append(only_best_arch)

scores_df = pd.concat([base_scores_df, *best_arch_frames], axis=0).reset_index(drop=True)
scores_df

Unnamed: 0,dataset,model,architecture_name,fs_method,fs_percent,fs_n_features,balanced_accuracy_mean,balanced_accuracy_std,balanced_accuracy_sum,accuracy_mean,...,precision_sum,recall_mean,recall_std,recall_sum,inference_time_mean,inference_time_std,inference_time_sum,training_time_mean,training_time_std,training_time_sum
0,adult,mlp/full,A0,decision_tree,0.5,7,0.752513,0.011217,3.762564,0.835334,...,4.383474,0.911746,0.012794,4.558728,0.161841,0.005832,0.809205,85.739740,3.457281,428.698698
1,adult,mlp/full,A0,decision_tree,0.6,8,0.755865,0.010407,3.779323,0.836102,...,4.393496,0.910129,0.012232,4.550646,0.160911,0.003812,0.804556,81.501757,3.161224,407.508784
2,adult,mlp/full,A0,decision_tree,0.7,9,0.755999,0.008595,3.779996,0.836307,...,4.393418,0.910399,0.008993,4.551993,0.163342,0.007492,0.816711,85.261526,9.673928,426.307630
3,adult,mlp/full,A0,decision_tree,0.8,11,0.763006,0.007734,3.815028,0.839634,...,4.412223,0.910331,0.008160,4.551656,0.156638,0.001617,0.783192,78.800536,4.650035,394.002682
4,adult,mlp/full,A0,decision_tree,0.9,12,0.764510,0.006098,3.822549,0.841476,...,4.414594,0.912486,0.008924,4.562430,0.160309,0.004055,0.801546,77.983038,2.774020,389.915189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1927,volkert,xgboost/best,A3,linear_model,0.8,144,0.611326,0.006252,3.056632,0.703181,...,0.000000,,,0.000000,0.225573,0.010134,1.127863,114.847031,3.649015,574.235153
1928,volkert,xgboost/best,A3,linear_model,0.9,162,0.609670,0.008966,3.048349,0.701337,...,0.000000,,,0.000000,0.232875,0.014124,1.164373,123.314467,3.841903,616.572334
1929,volkert,xgboost/best,A3,decision_tree,1.0,180,0.609670,0.008966,3.048349,0.701337,...,0.000000,,,0.000000,0.239392,0.012999,1.196959,123.524770,3.850030,617.623848
1930,volkert,xgboost/best,A3,f_classif,1.0,180,0.609670,0.008966,3.048349,0.701337,...,0.000000,,,0.000000,0.239392,0.012999,1.196959,123.524770,3.850030,617.623848


In [59]:
# Display names used in tables and figure legends, in the nested
# {column: {raw_value: label}} form accepted by DataFrame.replace.
df_replacements = {
    "model": {
        "transformer/cls": "FT-Transformer",  # was misspelled "FT-Trasnformer"
        "mlp/full": "MLP + HS",
        "mlp/full/best": "MLP⁺",
        "mlp/simple": "MLP⁺ + Pruning",
        "xgboost": "XGBoost",
        "xgboost/best": "XGBoost⁺"
    },
    "fs_method": {
        "decision_tree": "Decision tree",
        "f_classif": "F-score",
        "linear_model": "Linear model"
    }
}

# General results

In [60]:
def num_as_str(x):
    """Return ``x`` rendered in fixed-point notation with three decimals."""
    return format(x, ".3f")

In [61]:
# Build the "mean ± std" balanced-accuracy table (values multiplied by 100) for
# the three base models at fs_percent == 1 and write it to
# ASSETS_DIR/fs_general_results.tex.

# Row label of the highest balanced_accuracy_mean within each
# (dataset, model, fs_method, fs_percent, fs_n_features) group.
indices_best = scores_df.groupby([
    "dataset", "model", "fs_method",
    "fs_percent", "fs_n_features"
    ])["balanced_accuracy_mean"].idxmax()

# .copy() gives an independent frame, so the column added below does not
# trigger SettingWithCopyWarning.
best_scores_df = scores_df.loc[indices_best].query(
    "fs_percent==1 "
    "and fs_method=='decision_tree' "
    "and model in ['transformer/cls','mlp/full','xgboost']"
).copy()

# Raw string for the LaTeX macro: "\p" is not a valid escape sequence in an
# ordinary string literal.
best_scores_df["balanced_accuracy"] = (
    "$" + (best_scores_df["balanced_accuracy_mean"] * 100).apply(num_as_str)
    + r" \pm " + (best_scores_df["balanced_accuracy_std"] * 100).apply(num_as_str) + "$"
)

best_scores_df = best_scores_df.replace(df_replacements)[["dataset", "model", "balanced_accuracy"]] \
                        .pivot(index="dataset", columns="model") \
                        .reset_index()

# The pivot yields two-level column labels; keep the second level (the model
# label). The "dataset" column therefore ends up with an empty name.
best_scores_df.columns = [col[1] for col in best_scores_df.columns]

with open(os.path.join(ASSETS_DIR, "fs_general_results.tex"), "w") as f:
    f.write(best_scores_df.to_latex(index=False))

best_scores_df

Unnamed: 0,Unnamed: 1,FT-Trasnformer,MLP + HS,XGBoost
0,adult,$79.215 \pm 0.970$,$78.472 \pm 1.413$,$79.784 \pm 0.779$
1,australian,$87.195 \pm 3.422$,$85.785 \pm 2.228$,$85.660 \pm 2.866$
2,jasmine,$81.890 \pm 2.149$,$78.897 \pm 1.585$,$81.707 \pm 1.105$
3,kr-vs-kp,$99.839 \pm 0.166$,$99.682 \pm 0.173$,$99.368 \pm 0.205$
4,nomao,$95.159 \pm 0.393$,$93.912 \pm 0.775$,$96.166 \pm 0.290$
5,sylvine,$94.890 \pm 0.404$,$92.811 \pm 0.559$,$94.427 \pm 0.623$
6,volkert,$62.373 \pm 0.915$,$59.915 \pm 0.781$,$60.967 \pm 0.897$


In [62]:
# Gap between each base model and the best of the three on every dataset,
# computed as (1 - score / best) * 100 at fs_percent == 1.
indices_best = (
    scores_df
    .groupby(["dataset", "model", "fs_method", "fs_percent", "fs_n_features"])
    ["balanced_accuracy_mean"]
    .idxmax()
)

best_scores_df = (
    scores_df.loc[indices_best]
    .query(
        "fs_percent==1 "
        "and fs_method=='decision_tree' "
        "and model in ['transformer/cls','mlp/full','xgboost']"
    )
    .loc[:, ["dataset", "model", "balanced_accuracy_mean"]]
    .pivot(index="dataset", columns="model")
    .reset_index()
)

# Flatten the two-level column labels produced by the pivot to the second level.
best_scores_df.columns = [second for _, second in best_scores_df.columns]

# Row-wise maximum over every column after the first one.
best_scores_df["best"] = best_scores_df.iloc[:, 1:].max(axis=1)

for model_col in ("mlp/full", "transformer/cls", "xgboost"):
    best_scores_df[model_col] = (1 - best_scores_df[model_col] / best_scores_df["best"]) * 100

best_scores_df

Unnamed: 0,Unnamed: 1,mlp/full,transformer/cls,xgboost,best
0,adult,1.645049,0.713716,0.0,0.797843
1,australian,1.61767,0.0,1.761058,0.871954
2,jasmine,3.653942,0.0,0.223072,0.818897
3,kr-vs-kp,0.157247,0.0,0.471742,0.998389
4,nomao,2.343484,1.047115,0.0,0.96166
5,sylvine,2.191173,0.0,0.487993,0.948902
6,volkert,3.941328,0.0,2.254851,0.623734


# Degradation per feature selector

In [63]:
# Select best architecture on each feature selection percentage
# Box plot of the balanced-accuracy degradation, grouped by feature-selection
# method, for every feature percentage. The figure is shown and also written to
# ASSETS_DIR/fs_degradation_ratio.png.

# Select best architecture on each feature selection percentage
indices_best = scores_df.groupby([
    "dataset", "model", "fs_method",
    "fs_percent", "fs_n_features"
    ])["balanced_accuracy_mean"].idxmax()


# Reference score per (dataset, model): the highest balanced accuracy among the
# fs_percent == 1 rows.
best_scores_df = scores_df.loc[indices_best].query("fs_percent==1").groupby(["dataset", "model"], as_index=False)["balanced_accuracy_mean"].max()
best_scores_df.columns = ["dataset", "model", "balanced_accuracy_mean_max"]

# Attach the reference score to every best-architecture row.
best_scores_df = scores_df.loc[indices_best].merge(
    best_scores_df,
    how="left",
    on=["dataset", "model"]
    )

# 0 means no loss with respect to the reference; larger values mean a larger drop.
best_scores_df["degradation"] = 1 - best_scores_df["balanced_accuracy_mean"] / best_scores_df["balanced_accuracy_mean_max"]
best_scores_df = best_scores_df.replace(df_replacements)


fig = go.Figure(layout=go.Layout(
    margin={"l": 70, "r": 0, "b": 70, "t": 0},
    template="plotly_white",
    font={"size": 18}
)
)

for fs_m in best_scores_df["fs_method"].unique():
    plot_df = best_scores_df.query("fs_method==@fs_m")[["fs_percent", "degradation"]]
    # Round before converting to int: a float product such as 0.29 * 100 is
    # 28.999999999999996, which plain int() would truncate to 28.
    percent_labels = (plot_df["fs_percent"] * 100).round().astype(int).astype(str) + "%"
    fig.add_trace(go.Box(
        x=percent_labels,
        y=plot_df["degradation"],
        name=fs_m,
    ))


fig.update_layout(
    yaxis_title="Degradation ratio",
    xaxis_title="Features percent",
    xaxis=dict(autorange="reversed"),
    boxmode="group",
    legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
            )
)


fig.show()
fig.write_image(os.path.join(ASSETS_DIR, "fs_degradation_ratio.png"), scale=IMG_SCALE)

# Full models CV scores

In [78]:
# Per-model balanced accuracy and search time against the feature fraction, for
# a single dataset and feature-selection method.
#
# Other dataset names from the original notes: kr-vs-kp, sylvine.
# volkert takes much time (consider)

dataset = "nomao"
fs_method = "decision_tree"


cos_df = scores_df.query(
    "dataset==@dataset "
    "and fs_method==@fs_method"
)

# Figure showing the balanced accuracy
fig = go.Figure()

for m in cos_df["model"].unique():

    plot_df = cos_df.query(
            "model==@m "
        )

    # Keep, for each fs_percent, the row with the highest balanced accuracy.
    idx_best = plot_df.groupby(["fs_percent"])["balanced_accuracy_mean"].idxmax()
    plot_df = plot_df.loc[idx_best]

    fig.add_trace(go.Scatter(
        x=plot_df["fs_percent"],
        y=plot_df["balanced_accuracy_mean"],
        name=df_replacements["model"][m],
        error_y=dict(
            type="data",
            array=plot_df["balanced_accuracy_std"],
            visible=True
        )
    ))

fig.update_xaxes(autorange="reversed")
fig.update_layout(
    xaxis_title="Features fraction",
    yaxis_title="Balanced accuracy (mean)"
)
fig.show()

# Figure showing the time spent searching for the balanced accuracy shown in
# the previous plot
fig = go.Figure()

for m in cos_df["model"].unique():

    plot_df = cos_df.query(
            "model==@m "
        )

    # Sum only the two time columns; summing the whole frame would also try to
    # aggregate the string columns.
    times_df = (
        plot_df
        .groupby(["fs_percent"], as_index=False)[["training_time_sum", "inference_time_sum"]]
        .sum()
        .sort_values("fs_percent", ascending=False)
    )

    # NOTE(review): training time is not counted for these two models,
    # presumably because they are not retrained per feature subset; confirm.
    if m in ["transformer/cls", "mlp/simple"]:
        times_df["training_time_sum"] = 0

    # The fs_percent == 1 point is given zero cost. A zero total cannot be
    # placed on the logarithmic y axis set below.
    times_df.loc[times_df["fs_percent"] == 1, "training_time_sum"] = 0
    times_df.loc[times_df["fs_percent"] == 1, "inference_time_sum"] = 0

    # No error bars here: the previous version passed the balanced-accuracy std
    # of the unaggregated rows, which is neither a time nor aligned with
    # times_df.
    fig.add_trace(go.Scatter(
        x=times_df["fs_percent"],
        y=times_df["training_time_sum"] + times_df["inference_time_sum"],
        name=df_replacements["model"][m]
    ))

fig.update_xaxes(autorange="reversed")
fig.update_yaxes(type="log")
fig.update_layout(
    xaxis_title="Features fraction",
    yaxis_title="Training + inference time (sum)"
)
fig.show()