In [1]:
import polars as pl

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json

In [2]:
# Read the retraining metrics; per the preview below the columns are
# dataset, selected_features, model, seed, auc and bce_loss.
result_df = pl.read_csv(
    './exp_save/retrain_result.csv'
)
# Show the first five rows together with the overall (rows, columns) shape.
(result_df.head(n=5), result_df.shape)

(shape: (5, 6)
 ┌───────────────┬──────────────────────┬──────────┬──────┬──────────┬──────────┐
 │ dataset       ┆ selected_features    ┆ model    ┆ seed ┆ auc      ┆ bce_loss │
 │ ---           ┆ ---                  ┆ ---      ┆ ---  ┆ ---      ┆ ---      │
 │ str           ┆ str                  ┆ str      ┆ i64  ┆ f64      ┆ f64      │
 ╞═══════════════╪══════════════════════╪══════════╪══════╪══════════╪══════════╡
 │ 'placeholder' ┆ 'x                   ┆ ''x'     ┆ 0    ┆ 0.1      ┆ 0.1      │
 │ movielens-1m  ┆ all                  ┆ widedeep ┆ 0    ┆ 0.793549 ┆ 0.537674 │
 │ movielens-1m  ┆ all                  ┆ widedeep ┆ 1    ┆ 0.794956 ┆ 0.535825 │
 │ movielens-1m  ┆ all                  ┆ widedeep ┆ 2    ┆ 0.794059 ┆ 0.537212 │
 │ movielens-1m  ┆ ['title', 'user_id'] ┆ widedeep ┆ 0    ┆ 0.807336 ┆ 0.52316  │
 └───────────────┴──────────────────────┴──────────┴──────┴──────────┴──────────┘,
 (966, 6))

In [3]:
def compute_polar_metric(w):
    """Return the mean absolute deviation of ``w`` around its own mean.

    Args:
        w: Array of numeric values (the notebook passes feature-importance
            scores as a numpy array).

    Returns:
        The average of ``|w - mean(w)|`` as computed by numpy.
    """
    center = np.mean(w)
    abs_deviation = np.abs(w - center)
    return np.mean(abs_deviation)


In [34]:
# --- Experiment grid ---------------------------------------------------------
fs_methods = ['no_selection', 'shuffle_gate',
              'autofield', 'sfs', 'shark', 'lpfs',
              'gbdt', 'lasso', 'rf', 'xgb']

# Methods whose importance files are stored under ml_feature_importance/ and
# whose file names do not include the downstream model name.
ml_fs_methods = ('gbdt', 'lasso', 'rf', 'xgb')

model_list = ['widedeep']

data_list = ['movielens-1m', 'aliccp', 'avazu', 'criteo']

# Fraction of the ranked features that is kept for retraining.
percent = 0.5
# Seeds of the feature-selection runs whose importance files are read.
fs_seeds = [0, 1]
save_path = '.'


def _importance_csv_path(fs, dataset, model, fs_seed):
    """Build the path of the feature-importance CSV for one selection run."""
    if fs in ml_fs_methods:
        return f'{save_path}/ml_feature_importance/{fs}-{dataset}-{fs_seed}.csv'
    return f'{save_path}/fea_importance/{fs}-{dataset}-{model}-{fs_seed}.csv'


def _rounded_mean(values, ndigits=4):
    """Return the rounded mean of ``values``, or NaN when ``values`` is empty.

    Calling ``np.mean`` on an empty list also yields NaN but emits
    "Mean of empty slice" RuntimeWarnings; handling the empty case explicitly
    keeps the resulting table identical without the warning noise.
    """
    if len(values) == 0:
        return np.float64('nan')
    return round(np.mean(values), ndigits)


def _retrain_metrics(dataset, model, selected_features_str):
    """Return an array of the (auc, bce_loss) rows in ``result_df`` matching the given run."""
    return result_df.filter(
        pl.col('dataset') == dataset,
        pl.col('selected_features') == selected_features_str,
        pl.col('model') == model
    )[['auc', 'bce_loss']].to_numpy()


res = {"model": [], "dataset": [], "fs": [], 'auc': [], 'bce_loss': [], 'polar': [], 'keep_fea_cnt': []}

for model in model_list:
    for dataset in data_list:
        # Only the feature count is needed from the dataset description, so the
        # file is closed again before the aggregation loop starts.
        with open(f'../quick_data/{dataset}.json', 'r') as f:
            ori_k = len(json.load(f)['features'])

        for fs in fs_methods:
            auc_values = []
            bce_values = []
            polar_values = []

            for fs_seed in fs_seeds:
                if fs == 'no_selection':
                    # Baseline: every feature is kept and there is no
                    # importance vector, so no polar value is recorded.
                    # The same baseline rows are collected once per fs_seed;
                    # the duplicates do not change the mean.
                    selected_features_str = 'all'
                    top_k = ori_k
                else:
                    fea_df = pl.read_csv(_importance_csv_path(fs, dataset, model, fs_seed))
                    top_k = int(fea_df.shape[0] * percent)
                    selected_features = list(
                        fea_df.sort(pl.col('importance'), descending=True)['fea'][:top_k]
                    )
                    # The retrain results are looked up by the string form of
                    # the sorted, de-duplicated feature list.
                    selected_features_str = str(sorted(set(selected_features)))
                    polar_values.append(
                        compute_polar_metric(fea_df['importance'].to_numpy())
                    )

                auc_loss_np = _retrain_metrics(dataset, model, selected_features_str)
                auc_values.extend(auc_loss_np[:, 0].tolist())
                bce_values.extend(auc_loss_np[:, 1].tolist())

            res['model'].append(model)
            res['dataset'].append(dataset)
            res['fs'].append(fs)
            res['auc'].append(_rounded_mean(auc_values))
            res['bce_loss'].append(_rounded_mean(bce_values))
            res['polar'].append(_rounded_mean(polar_values))
            # top_k is the value computed for the last fs_seed processed.
            res['keep_fea_cnt'].append(top_k)

# Rows without any matching retrain result have a NaN auc and are hidden.
pl.DataFrame(res).sort('auc', descending=True).filter(
    ~pl.col('auc').is_nan()
).to_pandas()

no_selection 6 [0.7935486613453142, 0.7949560273267453, 0.7940591558175025, 0.7935486613453142, 0.7949560273267453, 0.7940591558175025]
shuffle_gate 6 [0.8073361535704644, 0.8072959973255345, 0.8073110965496744, 0.8066379727478676, 0.8077731065072211, 0.8073694787198806]
autofield 6 [0.8066379727478676, 0.8077731065072211, 0.8073694787198806, 0.8073361535704644, 0.8072959973255345, 0.8073110965496744]
sfs 6 [0.7336885669588364, 0.7336603451193209, 0.7334063506813251, 0.8066379727478676, 0.8077731065072211, 0.8073694787198806]
shark 6 [0.8073361535704644, 0.8072959973255345, 0.8073110965496744, 0.8066379727478676, 0.8077731065072211, 0.8073694787198806]
lpfs 6 [0.8073361535704644, 0.8072959973255345, 0.8073110965496744, 0.8066379727478676, 0.8077731065072211, 0.8073694787198806]
gbdt 6 [0.7334966404501582, 0.7338185487850687, 0.7338849763077688, 0.7334966404501582, 0.7338185487850687, 0.7338849763077688]
lasso 6 [0.530623041164885, 0.5286263058898119, 0.5303806920739189, 0.5306230411648

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,model,dataset,fs,auc,bce_loss,polar,keep_fea_cnt
0,widedeep,movielens-1m,shuffle_gate,0.8073,0.5242,0.3249,2
1,widedeep,movielens-1m,autofield,0.8073,0.5242,0.0533,2
2,widedeep,movielens-1m,shark,0.8073,0.5242,0.0397,2
3,widedeep,movielens-1m,lpfs,0.8073,0.5242,0.1104,2
4,widedeep,criteo,no_selection,0.8012,0.4522,,39
5,widedeep,movielens-1m,no_selection,0.7942,0.5369,,9
6,widedeep,avazu,no_selection,0.788,0.3752,,22
7,widedeep,criteo,shuffle_gate,0.7853,0.4653,0.4368,9
8,widedeep,criteo,autofield,0.7781,0.471,0.0987,9
9,widedeep,avazu,shark,0.7721,0.3845,0.0156,5


In [40]:
# Index into data_list selecting which dataset's rows to display.
data_index = 0 # 0,1,2,3
selected_dataset = data_list[data_index]
ranked_results = pl.DataFrame(res).sort('auc', descending=True)
# Keep rows with a non-NaN auc that belong to the selected dataset.
ranked_results.filter(
    ~pl.col('auc').is_nan(),
    pl.col('dataset') == selected_dataset,
)

model,dataset,fs,auc,bce_loss,polar,keep_fea_cnt
str,str,str,f64,f64,f64,i64
"""widedeep""","""aliccp""","""no_selection""",0.6592,0.1582,,23
"""widedeep""","""aliccp""","""shuffle_gate""",0.6483,0.159,0.3872,5
"""widedeep""","""aliccp""","""autofield""",0.6482,0.1589,0.1113,5
"""widedeep""","""aliccp""","""shark""",0.6468,0.159,0.0041,5
"""widedeep""","""aliccp""","""sfs""",0.6455,0.1591,0.0388,5
"""widedeep""","""aliccp""","""lpfs""",0.6423,0.1593,0.1235,5
"""widedeep""","""aliccp""","""rf""",0.6036,0.1618,0.0293,5
"""widedeep""","""aliccp""","""xgb""",0.5946,0.162,0.0505,5
"""widedeep""","""aliccp""","""gbdt""",0.5867,0.1623,0.0529,5
"""widedeep""","""aliccp""","""lasso""",0.5803,0.1627,0.0,5
