## 对抗验证
对抗验证（Adversarial Validation），并不是一种评估模型效果的方法，而是一种用来确认训练集和测试集的分布是否变化的方法。<br>
它的本质是构造一个分类模型，来预测样本是训练集或测试集的概率。<br>
AUC 接近 0.5 说明分类器无法区分训练集和测试集，即两者分布基本一致；通常来说，如果 AUC 在 0.7 以上，则说明训练集和测试集的分布存在较大的差异。

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


def get_adv_feats(df_train, df_test, feats, auc_threshold=0.7):
    """Keep the features that do not separate the training set from the test set.

    For every feature in ``feats`` a single-feature LightGBM classifier is
    trained to tell training rows (label 1) from test rows (label 0). The
    ROC AUC on one held-out stratified fold measures how well that feature
    alone distinguishes the two sets.

    Args:
        df_train: Training DataFrame containing every column in ``feats``.
        df_test: Test DataFrame containing every column in ``feats``.
        feats: Iterable of column names, checked one at a time.
        auc_threshold: A feature whose AUC is strictly above this value is
            printed and left out of the result. Defaults to 0.7.

    Returns:
        list: The features from ``feats``, in their original order, whose
        AUC did not exceed ``auc_threshold``.

    Note:
        The input frames are not modified; the ``adv`` label only exists on
        the internal concatenated frame.
    """
    # ``assign`` returns labelled copies, so the caller's frames never gain
    # an ``adv`` column.
    df = pd.concat(
        [df_train.assign(adv=1), df_test.assign(adv=0)]
    ).reset_index(drop=True)
    params = {
        'learning_rate': 0.1,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'seed': 2222,
        'n_jobs': 4,
        'verbose': -1,
    }

    fold_num = 5
    split_seed = 2023
    new_feats = []

    for f in feats:
        kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=split_seed)
        # Only the first of the ``fold_num`` folds is evaluated per feature.
        train_index, val_index = next(kf.split(df[[f]], df['adv']))

        train_set = lgb.Dataset(df.loc[train_index, [f]], df.loc[train_index, 'adv'])
        val_set = lgb.Dataset(df.loc[val_index, [f]], df.loc[val_index, 'adv'])
        model = lgb.train(params, train_set, valid_sets=[val_set], num_boost_round=10000,
                          callbacks=[lgb.early_stopping(100), lgb.log_evaluation(-1)])

        val_pred = model.predict(df.loc[val_index, [f]])
        # roc_auc_score takes (labels, scores). sklearn.metrics.auc instead
        # integrates an (x, y) curve and would not yield a ROC AUC here.
        score = roc_auc_score(df.loc[val_index, 'adv'], val_pred)
        if score > auc_threshold:
            print('--------------------------------------', f, score)
        else:
            new_feats.append(f)
    return new_feats

## null importance
<li> 首先使用真实标签进行训练，得到特征的重要性分数保存。
<li> 随机打乱标签，对打乱后的标签进行训练，并记录此时的特征重要性分数。这里可能需要根据实际情况多做几次，取重要性的均值或众数。
<li> 对于前两步骤的重要性分布情况，如果1中的重要性远高于2中的重要性，说明该特征是有效特征，反之说明该特征为无效特征。

https://www.kaggle.com/ogrellier/feature-selection-with-null-importances

### 1. 获取特征重要性

In [None]:
from sklearn.metrics import roc_auc_score


def get_feature_importances(data, shuffle, seed=None, cat_feats=None):
    """Fit a LightGBM random forest on ``data`` and return per-feature importances.

    Args:
        data: DataFrame holding the ``TARGET`` label column. Every column
            other than ``TARGET`` and ``SK_ID_CURR`` is used as a feature.
        shuffle: When true, the target is randomly permuted before fitting,
            which yields "null" importances.
        seed: Optional LightGBM ``seed``. It does not seed the target
            shuffle, which uses ``Series.sample`` with its default
            ``random_state``.
        cat_feats: Optional categorical feature specification forwarded to
            ``lgb.train``. When omitted, a module-level ``categorical_feats``
            is used if one exists, otherwise LightGBM's ``'auto'``.

    Returns:
        pandas.DataFrame: One row per feature with the columns ``feature``,
        ``importance_gain``, ``importance_split`` and ``trn_score`` (the ROC
        AUC of the fitted model on ``data`` against the target it was fitted
        on, repeated on every row).
    """
    # Gather real features
    train_features = [f for f in data if f not in ['TARGET', 'SK_ID_CURR']]

    if cat_feats is None:
        # Earlier versions read a global ``categorical_feats`` and raised
        # NameError when it was not defined yet; fall back gracefully.
        cat_feats = globals().get('categorical_feats', 'auto')

    # Shuffle target if required
    y = data['TARGET'].copy()
    if shuffle:
        # Here you could as well use a binomial distribution.
        # ``sample`` keeps the original index labels, so convert to a plain
        # array: the permutation must be applied by position, not by index.
        y = data['TARGET'].sample(frac=1.0).to_numpy()

    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
    # ``verbose`` goes through ``params``; the ``silent`` argument was removed
    # from ``lgb.Dataset`` in LightGBM 4.0.
    dtrain = lgb.Dataset(data[train_features], y, free_raw_data=False,
                         params={'verbose': -1})
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 8,
        'bagging_freq': 1,
        'n_jobs': 4,
    }
    if seed is not None:
        lgb_params['seed'] = seed

    # Fit the model
    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200,
                    categorical_feature=cat_feats)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    imp_df['trn_score'] = roc_auc_score(y, clf.predict(data[train_features]))

    return imp_df

### 2. 获取原版本重要性分数

In [None]:
data = pd.DataFrame() # placeholder: replace with the real training DataFrame (must contain a 'TARGET' column)
# Seed NumPy's global random generator before computing the importances
np.random.seed(123)
# Get the actual importance, i.e. fitted on the real (unshuffled) target
actual_imp_df = get_feature_importances(data=data, shuffle=False)

### 3. 获取多个target shuffle版本的特征重要性

In [None]:
import time

# Build the null-importance distribution: refit on a freshly shuffled target
# ``nb_runs`` times and stack the per-run importance tables.
nb_runs = 80
start = time.time()
dsp = ''
null_imp_frames = []
for i in range(nb_runs):
    # Get current run importances
    imp_df = get_feature_importances(data=data, shuffle=True)
    imp_df['run'] = i + 1
    # Collect the frames and concatenate once after the loop; concatenating
    # inside the loop re-copies every previous run on each iteration.
    null_imp_frames.append(imp_df)
    # Erase previous message
    print('\b' * len(dsp), end='', flush=True)
    # Display current run and time used
    spent = (time.time() - start) / 60
    dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
    print(dsp, end='', flush=True)

# ``pd.concat`` rejects an empty list, so keep an empty frame when no run was made.
null_imp_df = pd.concat(null_imp_frames, axis=0) if null_imp_frames else pd.DataFrame()

### 4. 计算Score

#### 以真实 target（未 shuffle）下的特征重要性，除以 target shuffle 之后特征重要性的 0.75 分位数（分母加 1 以避免除零），再取对数，作为我们的 score；

In [None]:
# Per-feature score: log of the actual importance divided by (1 + the 75th
# percentile of the null importances), for gain and split separately.
feature_scores = []
for _f in actual_imp_df['feature'].unique():
    null_rows = null_imp_df[null_imp_df['feature'] == _f]
    actual_rows = actual_imp_df[actual_imp_df['feature'] == _f]

    log_ratio = {}
    for column in ('importance_gain', 'importance_split'):
        null_q75 = np.percentile(null_rows[column].values, 75)
        actual_mean = actual_rows[column].mean()
        # The +1 keeps the division defined when the null percentile is 0;
        # the 1e-10 keeps the log finite when the actual importance is 0.
        log_ratio[column] = np.log(1e-10 + actual_mean / (1 + null_q75))

    gain_score = log_ratio['importance_gain']
    split_score = log_ratio['importance_split']
    feature_scores.append((_f, split_score, gain_score))

scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])

#### target shuffle 之后的特征重要性低于真实 target 下该特征重要性（0.25 分位数）的次数所占百分比。

In [None]:
# Per-feature score: percentage of null (shuffled-target) runs whose
# importance falls below the 25th percentile of the actual importance.
correlation_scores = []
for _f in actual_imp_df['feature'].unique():
    null_rows = null_imp_df[null_imp_df['feature'] == _f]
    actual_rows = actual_imp_df[actual_imp_df['feature'] == _f]

    pct_below = {}
    for column in ('importance_gain', 'importance_split'):
        f_null_imps = null_rows[column].values
        f_act_imps = actual_rows[column].values
        cutoff = np.percentile(f_act_imps, 25)
        below = f_null_imps < cutoff
        pct_below[column] = 100 * below.sum() / f_null_imps.size

    gain_score = pct_below['importance_gain']
    split_score = pct_below['importance_split']
    correlation_scores.append((_f, split_score, gain_score))

corr_scores_df = pd.DataFrame(correlation_scores, columns=['feature', 'split_score', 'gain_score'])

## 计算特征筛选之后的最佳分数并记录相应特征

In [None]:
def score_feature_selection(df=None, train_features=None, cat_feats=None, target=None):
    """Cross-validate a LightGBM binary classifier on a subset of features.

    Args:
        df: DataFrame containing at least the columns in ``train_features``.
        train_features: Non-empty list of column names to train on.
        cat_feats: Categorical feature specification forwarded unchanged to
            ``lgb.cv`` as ``categorical_feature``.
        target: Binary labels aligned with the rows of ``df``.

    Returns:
        tuple: ``(mean, stdv)`` of the validation AUC over the 5 folds, taken
        from the last entry of the history returned by ``lgb.cv``.

    Raises:
        ValueError: If ``train_features`` is ``None`` or empty.
    """
    if train_features is None or len(train_features) == 0:
        raise ValueError('train_features must contain at least one feature')

    # Fit LightGBM
    # ``verbose`` goes through ``params``; the ``silent`` argument was removed
    # from ``lgb.Dataset`` in LightGBM 4.0.
    dtrain = lgb.Dataset(df[train_features], target, free_raw_data=False,
                         params={'verbose': -1})
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'learning_rate': .1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'num_leaves': 31,
        'max_depth': -1,
        'seed': 13,
        'n_jobs': 4,
        'min_split_gain': .00001,
        'reg_alpha': .00001,
        'reg_lambda': .00001,
        'metric': 'auc'
    }

    # Fit the model
    # Early stopping and log silencing use callbacks, like the adversarial
    # validation code in this notebook; the ``early_stopping_rounds`` and
    # ``verbose_eval`` keyword arguments were removed in LightGBM 4.0.
    hist = lgb.cv(
        params=lgb_params,
        train_set=dtrain,
        num_boost_round=2000,
        categorical_feature=cat_feats,
        nfold=5,
        stratified=True,
        shuffle=True,
        seed=17,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )

    # The history keys are 'auc-mean'/'auc-stdv' in LightGBM 3.x and
    # 'valid auc-mean'/'valid auc-stdv' in 4.x, so match on the suffix.
    mean_key = next(key for key in hist if key.endswith('auc-mean'))
    stdv_key = next(key for key in hist if key.endswith('auc-stdv'))
    # Return the last mean / std values
    return hist[mean_key][-1], hist[stdv_key][-1]

# Baseline with every feature, for comparison:
# features = [f for f in data.columns if f not in ['SK_ID_CURR', 'TARGET']]
# score_feature_selection(df=data[features], train_features=features, target=data['TARGET'])
categorical_feats = []  # placeholder: replace with the real list of categorical column names

# Set membership keeps the categorical lookup cheap inside the sweep.
cat_feat_set = set(categorical_feats)

# Each entry of correlation_scores is (feature, split_score, gain_score).
# For every threshold, keep the features scoring at least that high and
# cross-validate on the split-based and gain-based selections separately.
for threshold in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]:
    split_feats = [_f for _f, _score, _ in correlation_scores if _score >= threshold]
    split_cat_feats = [_f for _f in split_feats if _f in cat_feat_set]

    gain_feats = [_f for _f, _, _score in correlation_scores if _score >= threshold]
    gain_cat_feats = [_f for _f in gain_feats if _f in cat_feat_set]

    print('Results for threshold %3d' % threshold)
    # A model cannot be trained on zero features, so report and skip
    # instead of aborting the whole sweep.
    if split_feats:
        split_results = score_feature_selection(df=data, train_features=split_feats,
                                                cat_feats=split_cat_feats, target=data['TARGET'])
        print('\t SPLIT : %.6f +/- %.6f' % (split_results[0], split_results[1]))
    else:
        print('\t SPLIT : skipped (no feature reaches this threshold)')

    if gain_feats:
        gain_results = score_feature_selection(df=data, train_features=gain_feats,
                                               cat_feats=gain_cat_feats, target=data['TARGET'])
        print('\t GAIN  : %.6f +/- %.6f' % (gain_results[0], gain_results[1]))
    else:
        print('\t GAIN  : skipped (no feature reaches this threshold)')