# Feature selection

In [43]:
import re

import numpy as np
import pandas as pd
from IPython.display import display
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, MRMR, SmartCorrelatedSelection
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

In [2]:
# Version of the generated feature table to read (substituted into the input file name).
FEATURES_VERSION: int = 6
# Version tag of this selection run (substituted into every output file name).
SELECTION_VERSION: int = 4

In [3]:
# Resolve the input file from the feature-set version, then load the whole table.
FEATURES_PATH = '../data/features/features_auto_v{}.csv'.format(
    FEATURES_VERSION,
)
features_df = pd.read_csv(filepath_or_buffer=FEATURES_PATH)

In [4]:
# Bookkeeping columns (id, label, split flag): set aside here so they are excluded
# from the feature matrix and can be re-attached to each exported table.
clients_cols = ['client_id', 'target', 'is_train']
clients_data = features_df.loc[:, clients_cols]

In [5]:
# Label vector and feature matrix (everything except the bookkeeping columns).
# NOTE(review): rows are not filtered by `is_train`, so every selector fitted on
# X / y sees all rows of the table — confirm that this is intended.
y = features_df['target']
X = features_df.drop(clients_cols, axis=1)

## Filters

In [6]:
"""Low-Variance"""

# Three successive filters; each one is fitted on the output of the previous step.

# Step 1: strictly constant columns (tol=1).
drop_constant = DropConstantFeatures(tol=1)
X1 = drop_constant.fit_transform(X)

# Step 2: quasi-constant columns (tol=0.99, i.e. a single value dominating ~99% of rows).
drop_quasi = DropConstantFeatures(tol=0.99)
X2 = drop_quasi.fit_transform(X1)

# Step 3: columns that duplicate another column.
drop_duplicates = DropDuplicateFeatures()
X3 = drop_duplicates.fit_transform(X2)

In [23]:
"""Correlation"""

# Correlation filter: Pearson correlation with threshold=0.7, one feature kept per
# correlated group.
# NOTE(review): selection_method='missing_values' ranks group members by how few
# missing values they have. The info() summary of the result reports no nulls, so
# the feature kept within a group may be effectively arbitrary — consider
# 'variance' or 'model_performance'. Confirm before changing.
corr_params = dict(
    method='pearson',
    threshold=0.7,
    selection_method='missing_values',
)
corr_selector = SmartCorrelatedSelection(**corr_params)

# Earlier variant without the target: corr_selector.fit_transform(X3)
X_corr = corr_selector.fit_transform(X3, y)

In [8]:
# """Mutual information"""
#
# NOTE(review): disabled experiment, kept for reference. Before re-enabling, revisit `k`:
# the filtered table summarised in this notebook has 431 columns, fewer than k=1200.
# mi = SelectKBest(
#     score_func=mutual_info_classif,
#     k=1200,
# )
# mi.fit(X_corr, y)
# selected_mi_features = X_corr.columns[mi.get_support()]
# X_mi = X_corr[selected_mi_features]

In [9]:
# """MRMR"""
#
# NOTE: disabled experiment, kept for reference; X_mrmr is not consumed by any
# active cell visible in this notebook.
# mrmr_selector = MRMR(
#     method='MIQ',
#     max_features=5,
#     regression=False,
#     random_state=42,
#     n_jobs=6,
# )
# X_mrmr = mrmr_selector.fit_transform(X_corr, y)

In [24]:
# Result of the filter stage: the table that survives the correlation filter.
X_filtered = X_corr

# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only appends a stray "None" to the cell output.
X_filtered.info(verbose=True, show_counts=True)
display(X_filtered.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34964 entries, 0 to 34963
Data columns (total 431 columns):
 #    Column                                                                                       Non-Null Count  Dtype  
---   ------                                                                                       --------------  -----  
 0    COUNT(transactions)                                                                          34964 non-null  int64  
 1    ENTROPY(transactions.cat_c2)                                                                 34964 non-null  float64
 2    ENTROPY(transactions.cat_c4)                                                                 34964 non-null  float64
 3    KURTOSIS(transactions.float_c16)                                                             34964 non-null  float64
 4    KURTOSIS(transactions.float_c17)                                                             34964 non-null  float64
 5    KURTOSIS(transactions.f

None

Unnamed: 0,COUNT(transactions),ENTROPY(transactions.cat_c2),ENTROPY(transactions.cat_c4),KURTOSIS(transactions.float_c16),KURTOSIS(transactions.float_c17),KURTOSIS(transactions.float_c18),KURTOSIS(transactions.float_c20),KURTOSIS(transactions.int_c19),MAX(transactions.float_c16),MAX(transactions.float_c17),...,MODE(transactions.cat_c3)_303,MODE(transactions.cat_c3)_312,MODE(activities.cat_c6)_4,MODE(activities.cat_c6)_Rare,MODE(activities.cat_c6)_8,MODE(comms.cat_c2)_S3769,MODE(comms.cat_c2)_Rare,MODE(comms.cat_c2)_S27888,MODE(comms.cat_c2)_S3563,MODE(comms.cat_c4)_1
count,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,...,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0,34964.0
mean,447.722171,1.574397,0.105946,78.719137,132.415299,42.100321,16.052793,-0.719137,1.61992,4.57712,...,0.047763,0.020135,0.05254,0.016131,0.028515,0.030202,0.039183,0.013957,0.010439,0.945744
std,474.127701,0.402746,0.190807,146.90933,225.166766,57.951094,112.5411,2.653992,4.867832,19.215062,...,0.213268,0.140464,0.223116,0.125981,0.166441,0.171147,0.194033,0.117315,0.10164,0.226525
min,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,122.0,1.329142,0.0,0.0,0.0,11.095781,0.0,-1.807487,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,309.0,1.620685,0.015948,27.623509,34.185537,25.522798,0.0,-1.327778,0.47,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,616.0,1.855134,0.126522,93.020321,176.602358,51.168443,0.0,-0.379546,1.66,4.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,8721.0,2.720781,2.159006,3572.595778,2741.303377,1350.808942,3767.000265,259.670459,413.130005,2294.719971,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Persist the filter-stage features side by side with the client bookkeeping columns.
FILTER_OUT_PATH = '../data/features_selected/features_v{}_filtered_v{}.csv'.format(
    FEATURES_VERSION,
    SELECTION_VERSION,
)

pd.concat(
    objs=[X_filtered, clients_data],
    axis=1,
).to_csv(path_or_buf=FILTER_OUT_PATH, index=False)

## Embedded

In [14]:
# """L1 regularization"""
#
# NOTE: disabled experiment, kept for reference; X_l1 is not consumed by any
# active cell visible in this notebook.
# log_l1 = LogisticRegression(
#     penalty='l1',
#     solver='saga',
#     max_iter=500,
#     class_weight='balanced',
#     n_jobs=6,
#     random_state=42,
#     verbose=1,
# )
# sfm = SelectFromModel(log_l1, threshold='median')
#
# sfm.fit(X_filtered, y)
#
# selected_l1_features = X_filtered.columns[sfm.get_support()]
# X_l1 = X_filtered[selected_l1_features]



In [26]:
"""Select top features from LightGBM model"""

# Number of highest-importance features carried into the embedded-stage output.
N_TOP_FEATURES = 200

lgb = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    # NOTE(review): this flag lets predict() accept inputs whose column count differs
    # from the training data. No active cell here needs it — presumably it was added
    # for the disabled permutation-importance experiment. Confirm before relying on
    # predictions made on a narrower table.
    predict_disable_shape_check=True,
    random_state=42,
    n_jobs=6,
)

# Fit on a renamed copy: commas, newlines, brackets, braces, colons and double
# quotes in column names are replaced by '_' (presumably because LightGBM rejects
# such names — confirm). Column positions are unchanged, so the importance indices
# below still map onto X_filtered's original column names.
lgb_data = X_filtered.rename(columns=lambda x: re.sub(r'[,\n\[\]\{\}:"]', '_', x))
lgb.fit(lgb_data, y)

# Column positions ordered from most to least important.
importances = lgb.feature_importances_
indices = np.argsort(importances)[::-1]

top_features = X_filtered.columns[indices[:N_TOP_FEATURES]]
# `top100` is a historical name: despite it, the selection holds N_TOP_FEATURES
# columns. Kept as an alias so existing references keep working.
top100 = top_features
X_lgb_imp = X_filtered[top_features]

[LightGBM] [Info] Number of positive: 8554, number of negative: 26410
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040767 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 86878
[LightGBM] [Info] Number of data points in the train set: 34964, number of used features: 431
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244652 -> initscore=-1.127344
[LightGBM] [Info] Start training from score -1.127344


In [35]:
# Repeats the selection made at the end of the LightGBM cell (same `indices`, same
# cut-off of 200 columns); note that `top100` therefore holds 200 names, not 100.
top100 = X_filtered.columns.take(indices[:200])
X_lgb_imp = X_filtered.loc[:, top100]

In [27]:
# """Permutation Importance"""
#
# NOTE(review): disabled experiment, kept for reference. Check before re-enabling:
#   * `perm.importances_mean[i] < 0` keeps only features whose mean permutation
#     importance is negative; `> 0` looks like the intent — confirm.
#   * `lgb` was fitted on the full filtered table, while X_lgb_imp is a column subset
#     of it — confirm the scores are meaningful for that input.
# perm = permutation_importance(
#     lgb, X_lgb_imp, y,
#     n_repeats=10,
#     scoring='roc_auc',
#     n_jobs=6,
#     random_state=42,
# )
#
# perm_sorted_idx = perm.importances_mean.argsort()[::-1]
# selected_perm = [top100[i] for i in perm_sorted_idx if perm.importances_mean[i] < 0]
#
# X_pi = X_lgb_imp[selected_perm]

In [39]:
# Result of the embedded stage: the LightGBM importance-based selection.
X_embedded = X_lgb_imp

# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only appends a stray "None" to the cell output.
X_embedded.info(verbose=True, show_counts=True)
display(X_embedded.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34964 entries, 0 to 34963
Data columns (total 200 columns):
 #    Column                                                                                  Non-Null Count  Dtype  
---   ------                                                                                  --------------  -----  
 0    INTERACTION_contact_per_trans                                                           34964 non-null  float64
 1    ENTROPY(comms.cat_c3)                                                                   34964 non-null  float64
 2    MEAN(comms.contact_date_day_of_month_cos WHERE cat_c3 = 3)                              34964 non-null  float64
 3    COUNT(comms WHERE cat_c4 = 2)                                                           34964 non-null  int64  
 4    ENTROPY(comms.cat_c5)                                                                   34964 non-null  float64
 5    MEAN(comms.contact_date_day_of_month_sin WHERE cat_c4 = 2)

None

Unnamed: 0,INTERACTION_contact_per_trans,ENTROPY(comms.cat_c3),MEAN(comms.contact_date_day_of_month_cos WHERE cat_c3 = 3),COUNT(comms WHERE cat_c4 = 2),ENTROPY(comms.cat_c5),MEAN(comms.contact_date_day_of_month_sin WHERE cat_c4 = 2),MEAN(comms.contact_date_day_of_month_sin WHERE cat_c3 = 3),ENTROPY(comms.cat_c2),PERCENT_TRUE(transactions.fl_c15),COUNT(comms WHERE cat_c3 = 3),...,"tran_date_gap_days__linear_trend__attr_""slope""",activity_date_gap_days__longest_strike_below_mean,KURTOSIS(transactions.int_c19),PERCENT_TRUE(activities.cat_c10),SUM(transactions.float_c16 WHERE fl_c13 = True)__30_days,float_c18_diff__approximate_entropy__m_2__r_0.25,SUM(transactions.float_c18 WHERE cat_c3 = 303),MEAN(transactions.float_c18 WHERE fl_c14 = True),MAX(transactions.float_c18 WHERE tran_date_is_weekend = True),MIN(activities.float_c14 WHERE cat_c8 = True)
count,34964,34964,34964,34964,34964,34964,34964,34964,34964,34964,...,34964,34964,34964,34964,34964,34964,34964,34964,34964,34964
mean,1,0,0,42,0,0,0,2,0,13,...,0,18,0,0,1,0,793,17,819,0
std,1,0,0,14,0,0,0,0,0,5,...,1,11,2,0,4,0,3464,45,989,0
min,0,0,0,0,0,0,0,0,0,0,...,-14,0,-2,0,0,0,0,0,0,0
25%,1,0,0,33,0,0,0,2,0,10,...,0,12,-1,0,0,0,54,5,250,0
50%,1,0,0,42,0,0,0,2,0,13,...,0,17,-1,0,0,0,225,10,500,0
75%,1,0,0,51,0,0,0,2,0,17,...,0,23,0,0,1,0,664,19,1000,0
max,65,1,1,362,1,0,0,4,0,115,...,83,275,259,1,189,1,219808,2856,35410,16


In [40]:
# Persist the embedded-stage features side by side with the client bookkeeping columns.
EMBEDDED_OUT_PATH = '../data/features_selected/features_v{}_embedded_v{}.csv'.format(
    FEATURES_VERSION,
    SELECTION_VERSION,
)

pd.concat(
    objs=[X_embedded, clients_data],
    axis=1,
).to_csv(path_or_buf=EMBEDDED_OUT_PATH, index=False)

## Wrapper

In [52]:
# Recursive feature elimination around the LightGBM classifier, targeting 50 features.
# step=0.1 is a fractional step: RFE treats a value in (0, 1) as the share of
# features to drop per elimination round rather than an absolute count.
rfe = RFE(estimator=lgb, n_features_to_select=50, step=0.1, verbose=1)
rfe.fit(X_embedded, y)

# Boolean mask over X_embedded's columns marking the surviving features.
rfe_mask = rfe.get_support()
selected_rfe_features = X_embedded.columns[rfe_mask]
X_rfe = X_embedded.loc[:, selected_rfe_features]

Fitting estimator with 200 features.
[LightGBM] [Info] Number of positive: 8554, number of negative: 26410
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48701
[LightGBM] [Info] Number of data points in the train set: 34964, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244652 -> initscore=-1.127344
[LightGBM] [Info] Start training from score -1.127344
Fitting estimator with 195 features.
[LightGBM] [Info] Number of positive: 8554, number of negative: 26410
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47726
[LightGBM] [Info] Number of data points in the train set: 34964, number of used features: 195
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244652 -> initscore=

In [53]:
# Result of the wrapper stage: the features retained by RFE.
X_wrappered = X_rfe

# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only appends a stray "None" to the cell output.
X_wrappered.info(verbose=True, show_counts=True)
display(X_wrappered.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34964 entries, 0 to 34963
Data columns (total 50 columns):
 #   Column                                                                                 Non-Null Count  Dtype  
---  ------                                                                                 --------------  -----  
 0   INTERACTION_contact_per_trans                                                          34964 non-null  float64
 1   ENTROPY(comms.cat_c3)                                                                  34964 non-null  float64
 2   MEAN(comms.contact_date_day_of_month_cos WHERE cat_c3 = 3)                             34964 non-null  float64
 3   COUNT(comms WHERE cat_c4 = 2)                                                          34964 non-null  int64  
 4   ENTROPY(comms.cat_c5)                                                                  34964 non-null  float64
 5   MEAN(comms.contact_date_day_of_month_sin WHERE cat_c4 = 2)                

None

Unnamed: 0,INTERACTION_contact_per_trans,ENTROPY(comms.cat_c3),MEAN(comms.contact_date_day_of_month_cos WHERE cat_c3 = 3),COUNT(comms WHERE cat_c4 = 2),ENTROPY(comms.cat_c5),MEAN(comms.contact_date_day_of_month_sin WHERE cat_c4 = 2),MEAN(comms.contact_date_day_of_month_sin WHERE cat_c3 = 3),ENTROPY(comms.cat_c2),PERCENT_TRUE(transactions.fl_c15),COUNT(comms WHERE cat_c3 = 3),...,float_c18_diff__quantile__q_0.75,MAX(activities.float_c11),STD(activities.float_c12 WHERE activity_date_is_weekend = True),MEAN(activities.activity_date_day_of_month_cos)__90_days,MEAN(activities.activity_date_day_of_week_cos),MEAN(activities.activity_date_day_of_week_cos WHERE activity_date_is_weekend = True),SUM(transactions.float_c16),MEAN(transactions.tran_date_day_of_month_sin WHERE fl_c13 = True),MEAN(activities.activity_date_day_of_month_sin WHERE activity_date_is_weekend = True),MEAN(transactions.int_c19)
count,34964,34964,34964,34964,34964,34964,34964,34964,34964,34964,...,34964,34964,34964,34964,34964,34964,34964,34964,34964,34964
mean,1,0,0,42,0,0,0,2,0,13,...,27,51,21,0,0,0,13,0,0,0
std,1,0,0,14,0,0,0,0,0,5,...,58,28,5,0,0,0,28,0,0,0
min,0,0,0,0,0,0,0,0,0,0,...,-1097,0,0,0,-1,0,-6,0,0,-1
25%,1,0,0,33,0,0,0,2,0,10,...,7,25,19,0,0,0,0,0,0,0
50%,1,0,0,42,0,0,0,2,0,13,...,14,54,22,0,0,0,2,0,0,0
75%,1,0,0,51,0,0,0,2,0,17,...,28,77,25,0,0,0,15,0,0,0
max,65,1,1,362,1,0,0,4,0,115,...,3228,99,54,1,1,1,1185,0,0,1


In [54]:
# Persist the wrapper-stage features side by side with the client bookkeeping columns.
WRAPPERED_OUT_PATH = '../data/features_selected/features_v{}_wrappered_v{}.csv'.format(
    FEATURES_VERSION,
    SELECTION_VERSION,
)

pd.concat(
    objs=[X_wrappered, clients_data],
    axis=1,
).to_csv(path_or_buf=WRAPPERED_OUT_PATH, index=False)