# Feature selection

In [34]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from IPython.display import display
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, DropHighPSIFeatures, MRMR, SmartCorrelatedSelection
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedKFold
from sklearn_genetic import GAFeatureSelectionCV
from boruta import BorutaPy

from models.model_utils import split_dataset_v2

In [3]:
FEATURES_VERSION = 6
SELECTION_VERSION = 10

In [4]:
FEATURES_PATH = '../data/features/features_auto_v{}.csv'.format(FEATURES_VERSION)

features_df = pd.read_csv(FEATURES_PATH)

In [5]:
clients_cols = ['client_id', 'target', 'is_train']
clients_data = features_df[clients_cols]

X = features_df.drop(columns=clients_cols)
y = features_df['target']

In [6]:
# X_train = X[features_df['is_train'] == 1]
# y_train = y[features_df['is_train'] == 1]

X_train, _, _, y_train, _, _ = split_dataset_v2(FEATURES_PATH, valid_size=0.2)

In [36]:
"""Estimators for feature selection"""

lgb_est = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    importance_type='gain',
    n_estimators=1334,
    learning_rate=0.02976,
    max_depth=3,
    num_leaves=3,
    min_child_samples=215,
    subsample=0.68114,
    colsample_bytree=0.78928,
    bagging_fraction=0.61835,
    bagging_freq=1,
    reg_alpha=5.0,
    reg_lambda=5.0,
    class_weight=None,
    verbosity=-1,
    n_jobs=6,
    random_state=42,
    predict_disable_shape_check=True,
)

xgb_est = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    booster='gbtree',
    n_estimators=1035,
    learning_rate=0.04225,
    max_depth=2,
    min_child_weight=76,
    gamma=0.52283,
    subsample=0.63927,
    colsample_bytree=0.63823,
    reg_alpha=2.0,
    reg_lambda=17.0,
    scale_pos_weight=3,
    n_jobs=6,
    random_state=42,
    # early_stopping_rounds=50,
)

## Filters

In [38]:
"""Constant features"""

drop_constant = DropConstantFeatures(tol=1)
drop_constant.fit(X_train)
X1 = drop_constant.transform(X)
X1_train = drop_constant.transform(X_train)

In [39]:
"""Quasi-constant features"""

drop_quasi = DropConstantFeatures(tol=0.99)
drop_quasi.fit(X1_train)
X2 = drop_quasi.transform(X1)
X2_train = drop_quasi.transform(X1_train)

In [40]:
"""Duplicate features"""

drop_duplicates = DropDuplicateFeatures()
drop_duplicates.fit(X2_train)
X3 = drop_duplicates.transform(X2)
X3_train = drop_duplicates.transform(X2_train)

In [None]:
"""Correlation"""

corr_selector = SmartCorrelatedSelection(
    method='pearson',
    threshold=0.7,
    selection_method='missing_values',
    # selection_method='model_performance',
    # scoring='roc_auc',
    # estimator=lgb_est,
)

corr_selector.fit(X3_train, y_train)
X_corr = corr_selector.transform(X3)
X_corr_train = corr_selector.transform(X3_train)

In [42]:
"""PCI"""

psi_selector = DropHighPSIFeatures(
    split_frac=0.5,
    # split_col='communication_month',
    # cut_off=datetime(2025, 4, 1),
    strategy='equal_frequency',
    bins=10,
    threshold=0.1,
)

psi_selector.fit(X_corr_train)
X_psi = psi_selector.transform(X_corr)
X_psi_train = psi_selector.transform(X_corr_train)

In [109]:
# """Mutual information"""
#
# mi = SelectKBest(
#     score_func=mutual_info_classif,
#     k=300,
# )
# mi.fit(X_corr, y)
# selected_mi_features = X_corr.columns[mi.get_support()]
# X_mi = X_corr[selected_mi_features]

In [None]:
# """MRMR"""
#
# mrmr_selector = MRMR(
#     method='MIQ',
#     max_features=50,
#     regression=False,
#     random_state=42,
#     n_jobs=6,
# )
# X_mrmr = mrmr_selector.fit_transform(X_corr, y)

In [None]:
X_filtered = X_psi
X_filtered_train = X_psi_train
display(X_filtered.info(verbose=True, show_counts=True))
display(X_filtered.describe())

In [44]:
FILTER_OUT_PATH = '../data/features_selected/features_v{}_filtered_v{}.csv'.format(FEATURES_VERSION, SELECTION_VERSION)

pd.concat([X_filtered, clients_data], axis=1).to_csv(FILTER_OUT_PATH, index=False)

## Boruta selection

In [45]:
X_prev = X_filtered
X_prev_train = X_filtered_train

rf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=5,
    n_jobs=6,
    random_state=42,
)
feat_selector = BorutaPy(
    rf,
    n_estimators='auto',
    perc=100,
    max_iter=100,
    verbose=2,
    random_state=42,
)

feat_selector.fit(X_prev_train.values, y_train.values)

accepted_features = X_prev_train.columns[feat_selector.support_].to_list()
tentative_features = X_prev_train.columns[feat_selector.support_weak_].to_list()

X_boruta = X_prev[accepted_features + tentative_features]
X_boruta_train = X_prev_train[accepted_features + tentative_features]

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	358
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	358
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	358
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	358
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	358
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	358
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	358
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	132
Tentative: 	92
Rejected: 	134
Iteration: 	9 / 100
Confirmed: 	132
Tentative: 	92
Rejected: 	134
Iteration: 	10 / 100
Confirmed: 	132
Tentative: 	92
Rejected: 	134
Iteration: 	11 / 100
Confirmed: 	132
Tentative: 	92
Rejected: 	134
Iteration: 	12 / 100
Confirmed: 	133
Tentative: 	64
Rejected: 	161
Iteration: 	13 / 100
Confirmed: 	133
Tentative: 	64
Rejected: 	161
Iteration: 	14 / 100
Confirmed: 	133
Tentative: 	64
Rejected: 	161
Iteration: 	15 / 100
Confirmed: 	133
Tentative: 	64
Rejected: 	161
Iteration: 	16 / 100
Conf

In [46]:
BORUTA_OUT_PATH = '../data/features_selected/features_v{}_boruta_v1.csv'.format(FEATURES_VERSION)
pd.concat([X_boruta, clients_data], axis=1).to_csv(BORUTA_OUT_PATH, index=False)

In [13]:
# Boruta import

BORUTA_IN_PATH = '../data/features_selected/features_v{}_boruta_v1.csv'.format(FEATURES_VERSION)

boruta_features_df = pd.read_csv(BORUTA_IN_PATH).drop(columns=clients_cols)
boruta_features_df = boruta_features_df.rename(columns=lambda x: re.sub(r'[,\n\[\]\{\}:"]', '__', x))
boruta_cols = boruta_features_df.columns.tolist()

X_boruta = X[boruta_cols]
X_boruta_train = X_train[boruta_cols]

## Selection with regularization

In [None]:
# """L1 regularization"""
#
# log_l1 = LogisticRegression(
#     penalty='l1',
#     solver='saga',
#     max_iter=1500,
#     class_weight='balanced',
#     n_jobs=6,
#     random_state=42,
#     verbose=1,
# )
# sfm = SelectFromModel(log_l1, threshold='median')
#
# sfm.fit(X_filtered, y)
#
# selected_l1_features = X_filtered.columns[sfm.get_support()]
# X_l1 = X_filtered[selected_l1_features]

In [167]:
"""ElasticNet regularization"""

X_prev = X_boruta
X_prev_train = X_boruta_train

log_en = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.3,
    C=1,
    max_iter=3000,
    class_weight='balanced',
    n_jobs=6,
    random_state=42,
    verbose=1,
)

log_en.fit(X_prev_train, y_train)

coef = pd.Series(np.abs(log_en.coef_[0]), index=X_prev.columns)
selected_en_features = coef[coef > 1e-6].sort_values(ascending=False).index.tolist()
X_en = X_prev[selected_en_features]
X_en_train = X_prev_train[selected_en_features]

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


max_iter reached after 167 seconds




In [172]:
ELASTICNET_OUT_PATH = '../data/features_selected/features_v{}_elasticnet_v1.csv'.format(FEATURES_VERSION)
pd.concat([X_en, clients_data], axis=1).to_csv(ELASTICNET_OUT_PATH, index=False)

## Select best features from model params

In [None]:
# """Select top features from LightGBM model"""
#
# lgb = LGBMClassifier(
#     n_estimators=500,
#     learning_rate=0.05,
#     predict_disable_shape_check=True,
#     random_state=42,
#     n_jobs=6,
# )
#
# lgb_data = X_filtered.rename(columns=lambda x: re.sub(r'[,\n\[\]\{\}:"]', '_', x))
# lgb.fit(lgb_data, y)
#
# importances = lgb.feature_importances_
# indices = np.argsort(importances)[::-1]
#
# top100 = X_filtered.columns[indices[:200]]
# X_lgb_imp = X_filtered[top100]

## Permutation importance

In [27]:
# """Permutation Importance"""
#
# perm = permutation_importance(
#     lgb, X_lgb_imp, y,
#     n_repeats=10,
#     scoring='roc_auc',
#     n_jobs=6,
#     random_state=42,
# )
#
# perm_sorted_idx = perm.importances_mean.argsort()[::-1]
# selected_perm = [top100[i] for i in perm_sorted_idx if perm.importances_mean[i] < 0]
#
# X_pi = X_lgb_imp[selected_perm]

## Stability selection

In [175]:
X_prev = X_boruta
X_prev_train = X_boruta_train

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
freq = pd.Series(0, index=X_filtered.columns)
lgb_data = X_prev_train.rename(columns=lambda x: re.sub(r'[,\n\[\]\{\}:"]', '_', x))

for train_idx, val_idx in skf.split(lgb_data, y_train):
    lgb = LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        predict_disable_shape_check=True,
        random_state=42,
        n_jobs=6,
    )
    lgb.fit(lgb_data.iloc[train_idx], y_train.iloc[train_idx])

    imp = pd.Series(lgb.feature_importances_, index=X_prev.columns)
    top = imp.sort_values(ascending=False).head(60).index
    freq[top] += 1

selected_stable_features = freq[freq >= 6].index.tolist()[:50]
X_stable = X_prev[selected_stable_features]
X_stable_train = X_prev_train[selected_stable_features]

[LightGBM] [Info] Number of positive: 4931, number of negative: 15027
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37446
[LightGBM] [Info] Number of data points in the train set: 19958, number of used features: 169
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247069 -> initscore=-1.114307
[LightGBM] [Info] Start training from score -1.114307
[LightGBM] [Info] Number of positive: 4931, number of negative: 15027
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37449
[LightGBM] [Info] Number of data points in the train set: 19958, number of used features: 169
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247069 -> initscore=-1.114307
[LightGBM] [Info] Start training from score -1.114307
[LightGBM]

In [177]:
selected_stable_features = freq[freq >= 6].index.tolist()
X_stable = X_prev[selected_stable_features]
X_stable_train = X_prev_train[selected_stable_features]

In [178]:
STABLE_OUT_PATH = '../data/features_selected/features_v{}_stable_v2.csv'.format(FEATURES_VERSION)
pd.concat([X_stable, clients_data], axis=1).to_csv(STABLE_OUT_PATH, index=False)

## RFE

In [37]:
X_prev = X_boruta
X_prev_train = X_boruta_train

# lgb = LGBMClassifier(
#     n_estimators=500,
#     learning_rate=0.05,
#     predict_disable_shape_check=True,
#     random_state=42,
#     n_jobs=6,
# )
rfe = RFE(
    estimator=lgb_est,
    n_features_to_select=50,
    step=1,
    verbose=1,
)
rfe.fit(X_prev_train, y_train)

selected_rfe_features = X_prev.columns[rfe.get_support()]
X_rfe = X_prev[selected_rfe_features]
X_rfe_train = X_prev_train[selected_rfe_features]

Fitting estimator with 169 features.
Fitting estimator with 168 features.
Fitting estimator with 167 features.
Fitting estimator with 166 features.
Fitting estimator with 165 features.
Fitting estimator with 164 features.
Fitting estimator with 163 features.
Fitting estimator with 162 features.
Fitting estimator with 161 features.
Fitting estimator with 160 features.
Fitting estimator with 159 features.
Fitting estimator with 158 features.
Fitting estimator with 157 features.
Fitting estimator with 156 features.
Fitting estimator with 155 features.
Fitting estimator with 154 features.
Fitting estimator with 153 features.
Fitting estimator with 152 features.
Fitting estimator with 151 features.
Fitting estimator with 150 features.
Fitting estimator with 149 features.
Fitting estimator with 148 features.
Fitting estimator with 147 features.
Fitting estimator with 146 features.
Fitting estimator with 145 features.
Fitting estimator with 144 features.
Fitting estimator with 143 features.
F

In [38]:
RFE_OUT_PATH = '../data/features_selected/features_v{}_rfe_lgb_v4.csv'.format(FEATURES_VERSION)
pd.concat([X_rfe, clients_data], axis=1).to_csv(RFE_OUT_PATH, index=False)

## Genetic algorithm

In [153]:
X_prev = X_boruta
X_prev_train = X_boruta_train

ga_selector = GAFeatureSelectionCV(
    estimator=LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=6,
        random_state=42,
    ),
    cv=3,
    population_size=40,
    generations=25,
    crossover_probability=0.35,
    mutation_probability=0.20,
    tournament_size=5,
    elitism=True,
    max_features=60,
    scoring='roc_auc',
    criteria='max',

    n_jobs=6,
    verbose=True,
    keep_top_k=5,
    use_cache=True,
    refit=False,
)

lgb_data = X_prev_train.rename(columns=lambda x: re.sub(r'[,\n\[\]\{\}:"]', '_', x))
ga_selector.fit(lgb_data, y_train)
selected_ga_features = X_prev.columns[ga_selector.best_features_.tolist()]
X_ga = X_prev[selected_ga_features]
X_ga_train = X_prev_train[selected_ga_features]



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	40    	-44999.6	49749.8    	0.791395   	-100000    
1  	52    	-2499.24	15612.6    	0.793863   	-100000    
2  	45    	0.788844	0.00405573 	0.794357   	0.774117   
3  	41    	0.792356	0.00177868 	0.795972   	0.788162   
4  	46    	0.79406 	0.00165947 	0.795972   	0.791395   
5  	39    	0.795648	0.000642932	0.797046   	0.793643   
6  	38    	0.796161	0.000727964	0.797661   	0.795467   
7  	43    	0.797177	0.00062382 	0.799491   	0.795691   
8  	43    	0.797909	0.000853557	0.799491   	0.797046   
9  	43    	0.798612	0.000819492	0.799491   	0.797221   
10 	43    	0.799425	0.000236423	0.799491   	0.798362   
11 	46    	0.799491	3.33067e-16	0.799491   	0.799491   
12 	43    	0.799491	3.33067e-16	0.799491   	0.799491   
13 	41    	0.799491	3.33067e-16	0.799491   	0.799491   
14 	47    	0.799491	3.33067e-16	0.799491   	0.799491   
15 	40    	0.799491	3.33067e-16	0.799491   	0.799491   
16 	44    	0.799491	3.33067e-16	0.799491   	0.79

0,1,2
,estimator,LGBMClassifie...subsample=0.8)
,cv,3
,scoring,'roc_auc'
,population_size,40
,generations,25
,crossover_probability,0.35
,mutation_probability,0.2
,tournament_size,5
,elitism,True
,max_features,60

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [158]:
GA_OUT_PATH = '../data/features_selected/features_v{}_ga_v1.csv'.format(FEATURES_VERSION)
pd.concat([X_ga, clients_data], axis=1).to_csv(GA_OUT_PATH, index=False)