In [18]:
import polars as pl

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json

In [19]:
# Load the retraining metrics table (columns shown in the preview below:
# dataset, selected_features, model, seed, auc, bce_loss).
result_df = pl.read_csv('./exp_save/retrain_result.csv')
# Show the first five rows together with the shape of the full table.
(result_df.head(n=5), result_df.shape)

(shape: (5, 6)
 ┌───────────────┬─────────────────────────────────┬──────────┬──────┬──────────┬──────────┐
 │ dataset       ┆ selected_features               ┆ model    ┆ seed ┆ auc      ┆ bce_loss │
 │ ---           ┆ ---                             ┆ ---      ┆ ---  ┆ ---      ┆ ---      │
 │ str           ┆ str                             ┆ str      ┆ i64  ┆ f64      ┆ f64      │
 ╞═══════════════╪═════════════════════════════════╪══════════╪══════╪══════════╪══════════╡
 │ 'placeholder' ┆ 'x                              ┆ ''x'     ┆ 0    ┆ 0.1      ┆ 0.1      │
 │ movielens-1m  ┆ all                             ┆ widedeep ┆ 0    ┆ 0.793549 ┆ 0.537674 │
 │ movielens-1m  ┆ all                             ┆ widedeep ┆ 1    ┆ 0.794956 ┆ 0.535825 │
 │ movielens-1m  ┆ all                             ┆ widedeep ┆ 2    ┆ 0.794059 ┆ 0.537212 │
 │ movielens-1m  ┆ ['genres', 'movie_id', 'title'… ┆ widedeep ┆ 0    ┆ 0.81005  ┆ 0.520381 │
 └───────────────┴─────────────────────────────────┴───

In [20]:
def compute_polar_metric(w):
    """Return the mean absolute deviation of ``w`` around its own mean.

    Args:
        w: Array-like of numeric values (the notebook passes a NumPy array of
            feature-importance scores).

    Returns:
        ``mean(|w - mean(w)|)`` as a NumPy float. A value of 0 means every
        entry of ``w`` is identical; larger values mean the entries are more
        spread out around their average.
    """
    centre = np.mean(w)
    abs_deviation = np.abs(w - centre)
    return np.mean(abs_deviation)


In [22]:
# Feature-selection methods to compare. 'no_selection' is the baseline that
# keeps every feature; the rest rank features by an importance score.
fs_methods = [
    'no_selection',
    'shuffle_gate',
    'autofield',
    'sfs',
    'shark',
    'lpfs',
    'gbdt',
    'lasso',
    'rf',
    'xgb',
]

# Downstream models and datasets covered by the summary.
model_list = ['widedeep']
data_list = ['movielens-1m', 'aliccp', 'avazu', 'criteo']

# Fraction of the ranked features that each selection method keeps.
percent = 0.5

# Root directory holding the feature-importance CSV files.
save_path = './exp_save'

# Column-oriented accumulator: one (initially empty) list per output column,
# later turned into a DataFrame.
res = {
    column: []
    for column in ('model', 'dataset', 'fs', 'auc', 'bce_loss', 'polar', 'keep_fea_cnt')
}
def _rounded_mean(values, ndigits=4):
    """Return the mean of ``values`` rounded to ``ndigits`` decimals.

    An empty ``values`` yields NaN directly. ``np.mean`` on an empty list also
    returns NaN, but only after emitting a "Mean of empty slice"
    RuntimeWarning; short-circuiting keeps the same NaN marker without the
    warning noise.
    """
    if len(values) == 0:
        return np.float64(np.nan)
    return round(np.mean(values), ndigits)


# Build one summary row per (model, dataset, feature-selection method):
# mean AUC / BCE loss of the matching retraining runs, mean polarisation of the
# importance scores, and the number of features kept.
for model in model_list:
    for dataset in data_list:
        # Only the feature count is needed from the dataset description, so the
        # file is closed before the per-method aggregation starts.
        with open(f'./quick_data/{dataset}.json', 'r') as f:
            ori_k = len(json.load(f)['features'])

        for fs in fs_methods:
            BCE_loss = []
            AUC = []
            Polar = []
            for fs_seed in [0, 1]:
                if fs != 'no_selection':
                    # Classical-ML selectors store importances per dataset only;
                    # the other selectors also carry the model name in the file name.
                    if fs in ['gbdt', 'lasso', 'rf', 'xgb']:
                        fea_importance_csv_path = f'{save_path}/ml_feature_importance/{fs}-{dataset}-{fs_seed}.csv'
                    else:
                        fea_importance_csv_path = f'{save_path}/fea_importance/{fs}-{dataset}-{model}-{fs_seed}.csv'
                    fea_df = pl.read_csv(fea_importance_csv_path)

                    # Keep the top `percent` fraction of features by importance.
                    top_k = int(fea_df.shape[0] * percent)
                    selected_features = list(
                        fea_df.sort(pl.col('importance'), descending=True)['fea'][:top_k]
                    )

                    # The results table is matched on the string form of the
                    # sorted, de-duplicated feature list.
                    selected_features_str = str(sorted(set(selected_features)))
                    Polar.append(compute_polar_metric(fea_df['importance'].to_numpy()))
                else:
                    # Baseline: every feature is kept and there are no
                    # importance scores, so no polarisation value is recorded.
                    selected_features_str = 'all'
                    top_k = ori_k

                auc_loss_np = result_df.filter(
                    pl.col('dataset') == dataset,
                    pl.col('selected_features') == selected_features_str,
                    pl.col('model') == model
                )[['auc', 'bce_loss']].to_numpy()

                AUC.extend(auc_loss_np[:, 0].tolist())
                BCE_loss.extend(auc_loss_np[:, 1].tolist())

            res['model'].append(model)
            res['dataset'].append(dataset)
            res['fs'].append(fs)

            # Empty collections (no matching runs, or the baseline's missing
            # polarisation) are recorded as NaN.
            res['auc'].append(_rounded_mean(AUC))
            res['bce_loss'].append(_rounded_mean(BCE_loss))
            res['polar'].append(_rounded_mean(Polar))
            # top_k reflects the last seed processed for this method.
            res['keep_fea_cnt'].append(top_k)

# Rank every summary row by AUC (best first), drop rows whose AUC is NaN,
# and display the result as a pandas DataFrame.
(
    pl.DataFrame(res)
    .sort('auc', descending=True)
    .filter(pl.col('auc').is_not_nan())
    .to_pandas()
)

Unnamed: 0,model,dataset,fs,auc,bce_loss,polar,keep_fea_cnt
0,widedeep,movielens-1m,shuffle_gate,0.8105,0.5204,0.3249,4
1,widedeep,movielens-1m,shark,0.8105,0.5204,0.0397,4
2,widedeep,movielens-1m,gbdt,0.8105,0.5204,0.1248,4
3,widedeep,movielens-1m,xgb,0.8105,0.5204,0.063,4
4,widedeep,movielens-1m,autofield,0.8073,0.5244,0.0533,4
5,widedeep,movielens-1m,sfs,0.8024,0.5286,0.0787,4
6,widedeep,movielens-1m,lpfs,0.8017,0.5294,0.1104,4
7,widedeep,movielens-1m,no_selection,0.7942,0.5369,,9
8,widedeep,movielens-1m,rf,0.7924,0.5386,0.0494,4
9,widedeep,movielens-1m,lasso,0.6486,0.6463,0.0042,4


In [23]:
# Position in data_list of the dataset whose ranking is shown (0, 1, 2 or 3).
data_index = 0
# Same AUC ranking as above, restricted to the chosen dataset.
(
    pl.DataFrame(res)
    .sort('auc', descending=True)
    .filter(
        pl.col('auc').is_not_nan(),
        pl.col('dataset') == data_list[data_index],
    )
)

model,dataset,fs,auc,bce_loss,polar,keep_fea_cnt
str,str,str,f64,f64,f64,i64
"""widedeep""","""movielens-1m""","""shuffle_gate""",0.8105,0.5204,0.3249,4
"""widedeep""","""movielens-1m""","""shark""",0.8105,0.5204,0.0397,4
"""widedeep""","""movielens-1m""","""gbdt""",0.8105,0.5204,0.1248,4
"""widedeep""","""movielens-1m""","""xgb""",0.8105,0.5204,0.063,4
"""widedeep""","""movielens-1m""","""autofield""",0.8073,0.5244,0.0533,4
"""widedeep""","""movielens-1m""","""sfs""",0.8024,0.5286,0.0787,4
"""widedeep""","""movielens-1m""","""lpfs""",0.8017,0.5294,0.1104,4
"""widedeep""","""movielens-1m""","""no_selection""",0.7942,0.5369,,9
"""widedeep""","""movielens-1m""","""rf""",0.7924,0.5386,0.0494,4
"""widedeep""","""movielens-1m""","""lasso""",0.6486,0.6463,0.0042,4
