## Feature Selection

### Plot Feature Importance

In [27]:
def plot_feature_importance(
    feature_importance_df,
    feature_name='feature',
    importance_name=['split', 'gain'],
    top_k=50,
    fig_width=16,
    fig_height=8,
    fontsize=14,
):
    """Draw one bar chart per importance type for the top-ranked features.

    For each importance column, features are ranked by their mean value
    (grouped by ``feature_name``) and the ``top_k`` best are plotted in that
    order. The un-aggregated rows of those features are handed to
    ``sns.barplot``, so seaborn does the per-feature aggregation for the bars.

    Args:
        feature_importance_df: DataFrame holding ``feature_name`` and every
            column listed in ``importance_name``.
        feature_name: Column containing the feature identifiers.
        importance_name: One importance column name, or a list of them; each
            one gets its own panel, laid out side by side in a single row.
        top_k: Number of features shown per panel.
        fig_width: Figure width passed to ``plt.figure``.
        fig_height: Per-panel height; the figure height is this value times
            the number of panels.
        fontsize: Font size of the panel titles.

    Returns:
        None. The plot is drawn on a newly created matplotlib figure.
    """
    if isinstance(importance_name, str):
        metrics = [importance_name]
    else:
        metrics = importance_name

    n_panels = len(metrics)
    plt.figure(figsize=(fig_width, fig_height * n_panels))
    grid = gridspec.GridSpec(1, n_panels)

    for panel_idx, metric in enumerate(metrics):
        # Rank features by their mean importance, best first.
        mean_by_feature = (
            feature_importance_df[[feature_name, metric]]
            .groupby(feature_name)
            .mean()
        )
        ranked = mean_by_feature.sort_values(by=metric, ascending=False)
        top_features = ranked.index.values[:top_k]

        # Keep every raw row that belongs to one of the selected features.
        is_top = feature_importance_df[feature_name].isin(top_features)
        top_rows = feature_importance_df.loc[is_top]

        ax = plt.subplot(grid[0, panel_idx])
        sns.barplot(
            x=metric,
            y=feature_name,
            data=top_rows,
            order=top_features,
            ax=ax,
        )
        plt.title(
            f'Features {metric} importance (averaged/folds)',
            fontweight='bold',
            fontsize=fontsize,
        )

    plt.tight_layout()

# Alternative to plot_feature_importance: a quick horizontal bar chart straight from pandas.
# fold_importance_df.plot.barh(x='feature', y='gain', figsize=(13,20))

### Iterative Feature Elimination by LightGBM

In [28]:
class Iterative_CV:
    """Evaluate features one at a time with LightGBM cross-validation.

    Scores come from the external helpers ``lgb_kfold`` (metric ``'rmse'``)
    and ``lgb_skfold`` (metric ``'auc'``), both called with ``bayes_opt=True``.
    The ``'rmse'`` result is multiplied by -1 before use.
    NOTE(review): presumably ``lgb_kfold`` returns a negated RMSE (so that it
    can be maximised) -- confirm against its definition.

    Attributes are plain references to the constructor arguments:
        X_train_full: DataFrame with every candidate feature.
        y_train: Training target passed through to the CV helpers.
        eval_cols: Iterable of column labels to evaluate.
        metric: ``'rmse'`` (lower is better) or ``'auc'`` (higher is better).
    """

    _SUPPORTED_METRICS = ('rmse', 'auc')

    def __init__(self, X_train_full, y_train, eval_cols, metric):
        self.X_train_full = X_train_full
        self.y_train = y_train
        self.eval_cols = eval_cols
        self.metric = metric

    def _check_metric(self):
        """Raise ``ValueError`` early if ``self.metric`` is not supported."""
        if self.metric not in self._SUPPORTED_METRICS:
            raise ValueError(
                f"Unsupported metric {self.metric!r}; "
                f"expected one of {self._SUPPORTED_METRICS}."
            )

    def _cv_score(self, X_train):
        """Return the average CV score of ``X_train`` for ``self.metric``."""
        if self.metric == 'rmse':
            return -1 * lgb_kfold(X_train, self.y_train, bayes_opt=True)
        return lgb_skfold(X_train, self.y_train, bayes_opt=True)

    def iter_cv_elim(self):
        """Drop each column of ``eval_cols`` in turn and compare CV scores.

        A baseline score is computed on all columns; then, for every column
        in ``eval_cols``, the model is re-scored without that column.

        Returns:
            Tuple ``(excl_improve, excl_worse)``. Each is a list of
            ``[column, pct]`` pairs sorted by ascending ``pct``, where ``pct``
            is the absolute score change as a percentage of the baseline.
            ``excl_improve`` holds columns whose removal improved the score;
            ``excl_worse`` holds the rest (including unchanged scores).

        Raises:
            ValueError: If ``self.metric`` is neither ``'rmse'`` nor ``'auc'``.
        """
        self._check_metric()
        lower_is_better = self.metric == 'rmse'
        if lower_is_better:
            improved_word, worsened_word = 'lowered', 'raised'
        else:
            improved_word, worsened_word = 'raised', 'lowered'

        excl_improve = []
        excl_worse = []

        init_valid_avg_score = self._cv_score(self.X_train_full)
        print(f'[Iter_Feature_Elim] Current best score is {init_valid_avg_score}')

        for col in tqdm(self.eval_cols):
            # Preserve the original column order so every run sees the
            # remaining features in the same, reproducible layout.
            temp_cols = [c for c in self.X_train_full.columns if c != col]
            new_valid_avg_score = self._cv_score(self.X_train_full[temp_cols])
            degree = new_valid_avg_score - init_valid_avg_score
            pct = 100 * (abs(degree) / init_valid_avg_score)

            improved = degree < 0 if lower_is_better else degree > 0
            if improved:
                excl_improve.append([col, pct])
                print(f"[Iter_Feature_Elim] '{col}', exclusion improved ({improved_word}) avg CV by {pct}pct.")
            else:
                excl_worse.append([col, pct])
                print(f"[Iter_Feature_Elim] '{col}', exclusion worsened ({worsened_word}) avg CV by {pct}pct.")

        excl_improve.sort(key=lambda pair: pair[1])
        excl_worse.sort(key=lambda pair: pair[1])
        gc.collect()
        return excl_improve, excl_worse

    def iter_cv_rank(self):
        """Score every column of ``eval_cols`` on its own.

        Returns:
            List of ``[column, score]`` pairs sorted by ascending score.

        Raises:
            ValueError: If ``self.metric`` is neither ``'rmse'`` nor ``'auc'``.
        """
        self._check_metric()
        impt = []
        for col in tqdm(self.eval_cols):
            X_train = self.X_train_full[col]
            if X_train.ndim == 1:
                # A scalar label selects a Series; the shape check below and
                # the model input both need a one-column frame.
                X_train = X_train.to_frame()
            assert X_train.shape[1] == 1
            print(f"[Iter_Feature_Rank] '{col}', evaluation ongoing.")
            impt.append([col, self._cv_score(X_train)])

        impt.sort(key=lambda pair: pair[1])
        gc.collect()
        return impt

### Null Importance Selection

In [29]:
def _get_lgb_fimp(
    params,
    X_train,
    y_train,
    features,
    shuffle,
    seed=42,
    categorical=None,
):
    """Train one LightGBM model and return its feature importances.

    Args:
        params: Parameter dict forwarded to ``lgb.train``.
        X_train: DataFrame containing at least the columns in ``features``.
        y_train: Target Series (``.sample`` and ``.values`` are used on it).
        features: Column labels to train on.
        shuffle: If true, the target is randomly permuted before training,
            which breaks its relation to the features (a "null" run).
        seed: Seed for the permutation; only used when ``shuffle`` is true.
        categorical: Categorical feature names; ``None`` or an empty
            sequence lets LightGBM decide (``'auto'``).

    Returns:
        DataFrame with columns ``feature``, ``split`` and ``gain``, one row
        per entry of ``features``.
    """
    if shuffle:
        # Use a private RNG via random_state instead of reseeding the global
        # random / np.random generators: reseeding here would silently reset
        # any seed stream the caller is drawing from between runs.
        y = y_train.sample(frac=1.0, random_state=seed)
    else:
        y = y_train.copy()

    has_categorical = categorical is not None and len(categorical) > 0
    arg_categorical = categorical if has_categorical else 'auto'
    dtrain = lgb.Dataset(X_train[features],
                         label=y.values,
                         categorical_feature=arg_categorical)

    # Fit the model
    clf = lgb.train(params, dtrain)

    # Collect both importance flavours side by side
    imp_df = pd.DataFrame({
        'feature': list(features),
        'split': clf.feature_importance(importance_type='split'),
        'gain': clf.feature_importance(importance_type='gain'),
    })

    return imp_df


def null_importance_selection(
    params,
    X_train,
    y_train,
    features,
    seed=42,
    categorical=None,
    num_actual_run=1,
    num_null_run=40,
    eps=1e-10,
    valid_percentile=75,
):
    """Score features by comparing real importances with null importances.

    The model is trained ``num_actual_run`` times on the real target and
    ``num_null_run`` times on a shuffled target. For every feature and for
    both importance types the score is::

        log(eps + mean(actual) / (1 + percentile(null, valid_percentile)))

    Args:
        params: Parameter dict forwarded to the LightGBM training helper.
        X_train: Training features.
        y_train: Training target.
        features: Column labels to evaluate.
        seed: Seed of the private RNG that generates the per-run seeds.
        categorical: Categorical feature names (``None`` means none given).
        num_actual_run: Number of fits on the real target (>= 1).
        num_null_run: Number of fits on a shuffled target (>= 1).
        eps: Small constant that keeps the logarithm finite when the actual
            importance is zero.
        valid_percentile: Percentile of the null distribution used as the
            reference level.

    Returns:
        DataFrame with columns ``feature``, ``split_score``, ``gain_score``.

    Raises:
        ValueError: If ``num_actual_run`` or ``num_null_run`` is below 1.
    """
    if num_actual_run < 1 or num_null_run < 1:
        raise ValueError('num_actual_run and num_null_run must both be >= 1.')

    categorical = [] if categorical is None else categorical

    # A private generator keeps the seed stream independent of the global
    # NumPy RNG state, which other code may reseed between runs.
    rng = np.random.RandomState(seed)
    # Wide range so that repeated per-run seeds (i.e. identical shuffles)
    # are practically impossible.
    max_seed = np.iinfo(np.int32).max

    def _collect_runs(num_runs, shuffle):
        """Fit ``num_runs`` models and stack their importance tables."""
        frames = []
        for i in tqdm(range(num_runs)):
            run_seed = int(rng.randint(max_seed))
            imp_df = _get_lgb_fimp(params,
                                   X_train,
                                   y_train,
                                   features,
                                   shuffle=shuffle,
                                   seed=run_seed,
                                   categorical=categorical)
            imp_df['run'] = i
            frames.append(imp_df)
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(frames, axis=0)

    actual_imp_df = _collect_runs(num_actual_run, shuffle=False)
    null_imp_df = _collect_runs(num_null_run, shuffle=True)

    def _score(feature, importance_type):
        """Log-ratio of mean actual importance to the null percentile."""
        actual = actual_imp_df.loc[actual_imp_df['feature'] == feature, importance_type].mean()
        null = null_imp_df.loc[null_imp_df['feature'] == feature, importance_type].values
        return np.log(eps + actual / (1 + np.percentile(null, valid_percentile)))

    feature_scores = [
        (_f, _score(_f, 'split'), _score(_f, 'gain'))
        for _f in actual_imp_df['feature'].unique()
    ]

    scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])
    return scores_df

### Consider Multicollinearity

In [30]:
def extract_high_corr_columns(df, threshold=0.99, verbose=True):
    """List columns to drop so that no remaining pair is highly correlated.

    Works on the absolute correlation matrix of ``df``. While the largest
    off-diagonal value is at least ``threshold``, one column of that pair is
    removed from the matrix: the one whose total absolute correlation with
    all other remaining columns is larger (on a tie, the pair's second
    column is removed).

    Args:
        df: DataFrame whose columns are compared via ``df.corr()``.
        threshold: Absolute correlation at or above which a pair counts as
            highly correlated.
        verbose: If true, print one line per dropped column.

    Returns:
        List of dropped column labels, in the order they were dropped.
        ``df`` itself is not modified.
    """
    df_corr = df.corr().abs()
    delete_columns = []

    # Zero the diagonal so a column's self-correlation is never the maximum.
    for i in range(len(df_corr.columns)):
        df_corr.iloc[i, i] = 0

    # A pair needs at least two remaining columns; this guard also covers
    # empty input and thresholds low enough to exhaust the matrix.
    while df_corr.shape[1] > 1:
        column_max = df_corr.max()
        max_corr = column_max.max()

        # Written as "not >=" so a NaN maximum (no valid correlation left)
        # also ends the loop instead of reaching idxmax.
        if not max_corr >= threshold:
            break

        query_column = column_max.idxmax()
        target_column = df_corr[query_column].idxmax()

        # Drop the member of the pair that is more correlated with the rest.
        # Series.sum() skips NaN entries (e.g. from constant columns), which
        # would otherwise poison the comparison.
        if df_corr[query_column].sum() <= df_corr[target_column].sum():
            delete_column, saved_column = target_column, query_column
        else:
            delete_column, saved_column = query_column, target_column

        df_corr = df_corr.drop(index=delete_column, columns=delete_column)
        delete_columns.append(delete_column)

        if verbose:
            print('{}: Drop: {} <- Query: {}, Corr: {:.5f}'.format(
                len(delete_columns), delete_column, saved_column, max_corr
            ))

    return delete_columns