### Features selection

This kernel explains how to select features using sklearn.
It can be improved by prefitting the models or by cross-validating and aggregating feature importances.

The idea is to see how correlation, simple model, and GBT models will vote for features.

This kernel produces a selection report, "report.csv", which can be used to select the top-k features and test them on different models.

*This is a draft work, and will be improved regularly.*


In [1]:
# Familiar imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# utilities
import os
import gc
import glob
import random
from datetime import datetime
from pathlib import Path



# helpers
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# selectors
from sklearn.feature_selection import SelectKBest, chi2,  RFE, SelectFromModel
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# base
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone

# scoring
from sklearn import metrics


In [2]:
# notebook options
# Show up to 100 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 100)
# Render matplotlib figures inside the notebook output
%matplotlib inline


# Input location and file names (relative to the notebook's working directory)
path = "../input/tabular-playground-series-sep-2021/"
train_file = "train.csv"
test_file = "test.csv"

In [3]:
# Load the training and test data; the first CSV column becomes the index.
# NOTE: `path` already ends with "/", so adding os.sep yields a doubled
# separator on POSIX systems.
train = pd.read_csv(f'{path}{os.sep}{train_file}', index_col=0)
test = pd.read_csv(f'{path}{os.sep}{test_file}', index_col=0)

# Preview the data: summary statistics, transposed so each column of
# `train` becomes a row
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
f1,942672.0,9.020086e-02,4.356374e-02,-1.499100e-01,7.022700e-02,9.013500e-02,1.165000e-01,4.151700e-01
f2,942729.0,3.459637e-01,1.462507e-01,-1.904400e-02,2.830500e-01,3.891000e-01,4.584500e-01,5.189900e-01
f3,942428.0,4.068744e+03,6.415829e+03,-9.421700e+03,4.184300e+02,1.279500e+03,4.444400e+03,3.954400e+04
f4,942359.0,2.012140e-01,2.125103e-01,-8.212200e-02,3.508650e-02,1.370000e-01,2.971000e-01,1.319900e+00
f5,942514.0,3.048693e-01,1.453425e-01,-6.989800e-03,2.405200e-01,3.277900e-01,4.128300e-01,5.547500e-01
...,...,...,...,...,...,...,...,...
f115,942360.0,1.208876e+00,1.149588e-01,9.052700e-01,1.146800e+00,1.177200e+00,1.242000e+00,1.886700e+00
f116,942330.0,4.276905e+16,6.732441e+16,-8.944400e+15,2.321100e+14,1.327500e+16,5.278700e+16,3.249900e+17
f117,942512.0,3.959205e+03,3.155992e+03,-4.152400e+02,1.306200e+03,3.228000e+03,6.137900e+03,1.315100e+04
f118,942707.0,5.592672e-01,4.084261e-01,-1.512400e-01,2.765600e-01,4.734400e-01,7.462100e-01,2.743600e+00


In [4]:
# Show 10 randomly chosen rows; no random_state is passed, so the rows
# differ from run to run
train.sample(10)

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,...,f70,f71,f72,f73,f74,f75,f76,f77,f78,f79,f80,f81,f82,f83,f84,f85,f86,f87,f88,f89,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99,f100,f101,f102,f103,f104,f105,f106,f107,f108,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
931970,0.082187,0.20847,1758.0,0.14212,0.26098,-0.16427,-46.171,505360.0,2206800000000000.0,38947.0,1.441,-10314000.0,0.069469,-0.35575,0.29225,11.01,7.2652,-0.047339,14.474,2.8764,4763.2,2.0969,109.22,1.2152,654.71,958540000000.0,92810000000.0,2563700.0,0.003231,0.79919,-0.85538,110310.0,-58163.0,-0.001168,6.517e+16,-30.275,454.68,1.4589,377.65,0.003775,48.167,0.892,6.1739,50.774,0.0008,4.1493,-0.75457,5.2106,-0.59512,0.005398,...,0.001763,2.6183,86.609,2525800000000000.0,-13256000000.0,0.008801,8.7558,1112.0,3055.0,2.8996,0.11767,6.8941,1995700000.0,69.252,-242460.0,4.1301,346980000.0,9.2081,229.63,6795.1,7.3429,0.46406,1209.3,27.491,0.82548,37.328,1083.3,1.0011,14319000000000.0,1.2497,0.29745,0.7074,1075800.0,66.032,-0.89376,0.081167,0.10761,0.001363,6089900000.0,0.26258,-7.8647,2.2242,-3.9537,-2.7836,-507.1,0.95674,1.7271e+17,82.115,0.54018,0
119154,0.082624,0.36603,359.84,0.002339,0.40406,-0.35442,324.92,282510.0,237970000000000.0,-61.53,1.2794,3631300000.0,0.34237,3.3044,-0.048113,2.9638,6.6174,5.0738,8.3545,0.092024,19091.0,3.8031,115.03,-0.30021,357.14,196140000000.0,142480000000.0,3179300.0,-0.00207,8.3502,0.93834,155170.0,1059600.0,-0.001756,1.1739e+17,-91.072,578.83,2.7619,694.68,0.008806,1859.7,-0.024263,5.8395,93.585,0.001108,4.2845,0.41081,8.9023,-0.80674,,...,0.003546,1.3223,1063.0,763350000000000.0,-14244000000.0,-0.005391,6.8368,11412.0,4418.9,-3.769,0.097579,2.4434,79798000000.0,153.92,5892400.0,-20.961,1552900000.0,-0.47591,533.12,2037.2,5.0595,0.43075,-15.477,64.536,0.86724,6.6105,64.88,0.9984,32768000000000.0,1.1294,1.0067,35.187,8929.6,7.3088,-48.287,0.22486,0.6375,0.039043,2762300000.0,0.79379,-31.743,,2.0308,4.8183,-212.2,1.3009,8.977e+16,6620.8,0.38311,0
540831,0.00945,0.4271,-223.63,0.001698,0.30234,1.7339,1448.5,4952.1,4657800000000.0,-6.7692,-0.66454,8103000.0,0.31527,-0.076824,0.084798,359.05,7.2103,0.016138,44.74,238.93,3359.8,2.173,106.39,1.2254,324.31,2293400000000.0,-16702000000.0,,0.10165,,-0.3943,-11426.0,550420.0,0.000968,141650000000000.0,-290.39,429.79,1.6579,2505.4,0.085687,177.32,0.022521,,131.66,0.009607,4.0911,0.86269,6.0436,-1.2104,0.30684,...,0.000751,1.2315,482.98,13945000000000.0,5983700000000.0,-0.002065,7.7984,42882.0,16713.0,10.138,0.49807,1.9966,99321000000.0,305.0,,22.834,-59648000.0,43.982,173.99,581.98,8.5218,0.49738,28076.0,126.33,1.0414,2.2636,1042.3,0.99905,-14949000000.0,3.5,0.68582,15.724,378690.0,-6.2524,-77.629,0.32915,-0.000352,0.009691,811120000.0,0.24442,-13.189,1.7518,132.08,-0.59273,9641.1,1.2929,32217000000000.0,360.38,0.18716,0
835172,0.070643,0.43618,7.1147,0.26679,0.4375,-0.29273,103.57,472150.0,514300000000.0,368.54,-2.7119,234160000.0,0.26541,0.38688,0.043383,7.799,8.9918,1.9697,10.991,2.6193,371450.0,2.1084,75.677,1.3279,248.12,18264000000.0,1517900000000.0,364300.0,0.010759,5.7462,0.8815,370820.0,201760.0,0.002853,2.9708e+16,2577.5,105.3,3.0762,1782.8,0.099744,983.87,1.0028,6.0392,33.428,-0.002706,3.9938,0.68893,6.9299,-0.97178,-0.001763,...,0.000645,2.3417,1272.7,1174500000000000.0,-3643600000.0,1.0225,4.832,143.19,41297.0,-1.1466,0.076443,1.4751,181570000000.0,5.9101,23712000.0,909.6,682790000.0,2.4086,0.74928,7457.3,2.1992,0.4094,5106.9,7.4189,1.0725,19.433,342.78,1.0009,36509000000000.0,1.2158,0.42662,16.142,5007.9,75.467,545.13,0.1873,0.24305,0.024998,4735200000.0,0.2843,-30.551,1.8709,-2.1989,33.07,4957.2,1.1849,2.2256e+16,790.5,0.48116,0
440246,0.087763,0.39384,11723.0,0.30166,0.32483,2.1016,61.505,57685.0,773850000000000.0,5039.4,2.3379,1282400000.0,0.30114,-0.15878,-0.097965,6.3278,6.9322,10.802,36.258,67.248,40522.0,2.9321,99.158,-2.1016,423.64,581290000000.0,2231600000000.0,32859.0,0.10491,13.726,0.94074,104840.0,161580.0,-0.000458,25325000000000.0,-220.94,495.44,2.8884,1574.8,0.012879,62.054,-0.000258,6.5455,24.736,0.004638,-1.2,0.89527,6.2542,-1.1506,0.002527,...,1.0101,1.1927,-7.849,24481000000000.0,-8715500000.0,0.087503,4.9706,10391.0,3532.7,13.065,0.20694,2.5315,9005000000.0,12.454,142480.0,907.99,43154000000.0,1.3259,81.98,5455.4,6.2204,0.49281,994.52,182.61,0.97001,5.9614,648.79,1.0001,150140000000.0,1.4404,1.0169,17.953,945820.0,444.15,1079.4,0.003656,0.074529,0.012178,2489100000.0,0.51354,-36.755,1.5259,11.223,-7.8481,107490.0,1.1814,4.404e+16,5314.9,0.28625,0
655232,0.12543,0.25065,3711.8,-0.002429,,-1.446,,-2007.9,-5818800000000.0,133.54,0.98112,2571200000.0,0.2643,5.4958,0.17945,27.188,5.7804,3.0087,26.874,33.366,-2960.1,2.7777,114.76,-1.6159,1128.1,6195200000000.0,5157800000000.0,1863300.0,0.90363,5.4225,0.92578,32683.0,48497.0,0.001904,5.0648e+16,220.8,796.04,2.9414,130.92,0.094643,261.75,0.88712,9.9256,36.886,-0.00114,4.6092,-1.1432,8.1141,-1.1552,0.38306,...,0.98866,1.4498,572.09,1051600000000.0,5665700000000.0,0.99614,7.5855,220.39,37926.0,1.7592,0.14727,1.9831,271620000000.0,361.55,31728000.0,598.45,5298300000.0,29.353,,133.57,8.7067,0.38335,431.93,,1.2005,23.683,631.62,1.0017,-68130000000.0,1.439,0.63319,43.508,1075000.0,530.95,248.52,0.041786,0.20169,0.011021,395200000.0,0.79017,,1.5962,,5.4465,351300.0,1.184,1.3866e+16,636.28,0.1951,1
238328,0.080657,0.14295,1411.8,0.049432,0.35731,-0.093218,1844.7,113950.0,-3578900000000.0,6.2841,2.4127,4538000.0,0.30298,1.6582,0.020068,466.57,7.4756,3.9222,17.608,124.91,349170.0,1.3967,102.38,1.0383,967.98,22886000000000.0,1357200000000.0,1704400.0,0.94653,13.298,1.0618,208550.0,222260.0,-0.001894,-21891000000000.0,-315.27,170.48,1.3774,269.18,0.89955,-0.45199,0.10999,7.0578,10.585,-0.000485,4.263,-0.50292,5.5019,-0.61469,0.000195,...,-0.00117,1.256,440.42,62926000000000.0,2316200000.0,1.0003,9.5033,37252.0,8695.6,1.9457,0.074653,2.4564,48645000000.0,656.75,4152000.0,1072.3,43592000000.0,6.8731,357.79,3313.0,3.4791,0.41163,20659.0,114.83,1.0468,2.9638,11090.0,0.99871,23105000000000.0,1.2763,0.49611,3.8115,544720.0,2988.2,-165.6,0.32738,0.03784,-0.001019,2875100000.0,0.60671,-6.2595,3.4167,113.62,7.543,111610.0,1.1395,4026700000000000.0,3827.1,0.24781,0
798140,0.089509,0.31986,62.801,0.25861,0.41941,1.2415,2026.6,713370.0,8078600000000000.0,5578.1,2.2709,193620000.0,0.18907,2.0192,0.14555,21.794,5.9761,14.319,20.335,3.6218,160340.0,2.4233,41.606,-1.653,137.76,16381000000.0,818690000000.0,42788.0,0.016197,9.3943,0.90714,518960.0,238800.0,0.002651,9051300000000000.0,2716.5,620.85,2.7411,4494.0,0.017986,111.38,0.91746,8.4294,105.46,0.009362,3.8694,-0.38192,7.8663,-0.76982,0.2289,...,0.000669,1.1871,1181.2,317870000000000.0,1537800000.0,0.088616,4.6065,492.01,16861.0,4.1583,0.51506,6.1264,145590000000.0,288.37,14359000.0,,7521900000.0,-0.18458,28.904,3743.9,8.3681,0.47146,2205.7,54.926,-0.72281,1.4194,-33.089,0.99869,-118770000000.0,1.275,0.57418,,49317.0,1648.5,207.32,0.20811,0.058126,,67734000.0,0.64396,-1.0459,1.4025,2.5472,10.544,4683.9,1.1497,37526000000000.0,2140.6,0.09704,1
246668,0.10105,0.3218,3128.8,0.13329,0.001847,-1.4455,1278.1,318820.0,7195200000000000.0,546.57,1.0265,5669900000.0,0.30717,1.505,-0.018332,95.657,5.6481,1.9556,12.436,8.2265,9386.0,2.5371,110.68,1.0719,364.63,1806800000000.0,22456000000.0,2438100.0,0.000712,15.503,-0.99263,25272.0,373970.0,-1.6e-05,-163010000000000.0,49.256,217.38,1.672,2148.8,0.09241,66.029,1.0152,6.439,11.313,0.038505,4.9609,-0.4961,11.341,-1.0972,0.44204,...,0.99579,1.5487,833.73,1210000000000000.0,2861200000.0,0.10724,6.6648,3193.0,12746.0,0.85038,0.35046,5.8746,142110000000.0,1.8721,9518500.0,608.64,86394000.0,33.734,353.35,9747.6,7.3401,0.48587,96.247,349.4,1.0899,3.6386,80.377,0.9993,-48744000000.0,1.5041,,5.4378,111090.0,8.841,60.964,0.052725,0.1011,0.000781,-1201400.0,0.53155,-4.5702,2.9054,0.72652,13.099,23300.0,1.1607,1.916e+17,464.54,0.4707,1
929159,0.12848,0.3808,379.68,0.005072,0.27776,0.19419,2764.5,157000.0,70633000000000.0,517.58,1.1129,4489100000.0,0.3531,0.75067,0.058046,5.5176,8.4491,5.9055,26.84,448.92,182270.0,2.6464,144.16,-2.936,879.36,3756100000000.0,108100000000.0,2663200.0,0.91454,3.0666,-0.77831,24465.0,569730.0,-0.002063,-835440000000000.0,130.85,248.69,1.5799,9239.6,0.013175,48.307,-0.002078,5.9673,0.70779,4e-05,3.1987,0.34955,5.6179,0.7124,0.41276,...,0.000915,2.3992,504.19,7456300000000.0,6148200000000.0,-0.003863,5.557,10339.0,2873.3,-7.7902,0.50263,7.0915,321230000000.0,726.12,419080.0,1327.6,175190000.0,11.345,9.6454,1757.3,5.5124,0.29171,2361.6,195.45,0.008013,0.85687,181.81,1.0005,24696000000000.0,1.2038,0.49935,11.165,535770.0,2251.5,273.19,0.36457,-0.000229,-0.00292,284990000.0,0.75564,-4.1497,1.2349,129.17,-2.2011,752.05,1.1785,3.4369e+16,799.21,0.40334,0


In [5]:
# Split the training frame into the target column and the feature matrix
target_col = 'claim'
X = train.drop(columns=[target_col])
y = train[target_col]
# X_test is the same object as `test` (no copy is made)
X_test = test

In [6]:
# Preview features
X.head().T

id,0,1,2,3,4
f1,1.085900e-01,1.009000e-01,1.780300e-01,1.523600e-01,1.162300e-01
f2,4.313900e-03,2.996100e-01,-6.980000e-03,7.259100e-03,5.029000e-01
f3,-3.756600e+01,1.182200e+04,9.072700e+02,7.801000e+02,-1.091500e+02
f4,1.736400e-02,2.765000e-01,2.721400e-01,2.517900e-02,2.979100e-01
f5,2.891500e-01,4.597000e-01,4.594800e-01,5.194700e-01,3.449000e-01
...,...,...,...,...,...
f114,4.378800e+03,9.132300e+02,4.511900e+04,4.952400e+03,3.856500e+03
f115,1.209600e+00,1.246400e+00,1.176400e+00,1.178400e+00,1.483000e+00
f116,8.613400e+14,7.575100e+15,3.218100e+14,4.533000e+12,-8.991300e+12
f117,1.401000e+02,1.861000e+03,3.838200e+03,4.889100e+03,


In [7]:
# Partition the feature columns by dtype: numeric vs. object/bool
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
non_numeric_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()

In [8]:
# Report how many columns fell into each dtype group
print(f'We have {len(numerical_cols)} numeric and {len(non_numeric_cols)} non-numeric features')

We have 118 numeric and 0 non-numeric features


In [9]:
# Full feature list: non-numeric columns first, then the numeric ones
features = non_numeric_cols + numerical_cols

In [10]:
# Styled summary statistics: bars on the 'mean' column and a colour
# gradient on the 'std' column
(
    X.describe().T.style
    .bar(subset=['mean'], color='#20c8f2')
    .background_gradient(subset=['std'], cmap='YlGn')
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
f1,942672.0,0.090201,0.043564,-0.14991,0.070227,0.090135,0.1165,0.41517
f2,942729.0,0.345964,0.146251,-0.019044,0.28305,0.3891,0.45845,0.51899
f3,942428.0,4068.744207,6415.82944,-9421.7,418.43,1279.5,4444.4,39544.0
f4,942359.0,0.201214,0.21251,-0.082122,0.035086,0.137,0.2971,1.3199
f5,942514.0,0.304869,0.145343,-0.00699,0.24052,0.32779,0.41283,0.55475
f6,942398.0,-0.071458,2.123777,-12.791,-1.1207,-0.38011,0.92194,11.202
f7,942415.0,1620.843815,1276.281403,-224.8,481.545,1446.1,2495.9,5426.6
f8,942546.0,377164.164157,345432.472849,-29843.0,91209.0,289670.0,560560.0,1913700.0
f9,942670.0,1806053749440377.5,2335204188640509.0,-1153300000000000.0,11531000000000.0,504305000000000.0,3103100000000000.0,1.0424e+16
f10,942696.0,5323.442367,10068.380032,-26404.0,75.87675,1073.2,5693.2,85622.0


In [11]:
# List every column name currently in X
print(f'Features: {list(X.columns)}')

Features: ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118']


### Feature Engineering

In [12]:
# new features
# https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm
engineered_cols = ['n_missing', 'std']

# Row statistics are computed over the original columns only, so strip the
# engineered names in case this cell has already been run once.
base_features = [col for col in features if col not in engineered_cols]

for frame in (X, X_test):
    # The imputation below removes the NaNs, so recomputing on a second run
    # would silently turn n_missing into all zeros; compute it only once.
    if 'n_missing' not in frame.columns:
        frame['n_missing'] = frame[base_features].isna().sum(axis=1)
        frame['std'] = frame[base_features].std(axis=1)

# Rebuild instead of `+=` so a re-run does not append duplicate names.
features = base_features + engineered_cols

# imputation
# Use the TRAINING means for both frames: the test set must be transformed
# with statistics learned on the training data, not with its own.
train_means = X[features].mean()
X[features] = X[features].fillna(train_means)
X_test[features] = X_test[features].fillna(train_means)



### Feature Selection
See here for a good description: 
https://mlwhiz.com/blog/2019/08/07/feature_selection/?utm_campaign=News&utm_medium=Community&utm_source=DataCamp.com

In [13]:
# user defined selector
# in this case correlation
class CorrelationSelector():
    """Keep the ``n_features`` columns most correlated with the target.

    Parameters
    ----------
    n_features : int
        Number of columns to keep. Values <= 0 keep nothing.
    corr_fnc : callable, default ``np.corrcoef``
        Called as ``corr_fnc(column, y)``; element ``[0, 1]`` of the result
        is used as the correlation score.
    """

    def __init__(self, n_features, corr_fnc=np.corrcoef):
        self.corr_fnc = corr_fnc
        self.n_features = n_features
        self.support = []
        self.__name__ = "Correlation"

    def fit(self, X, y, use_abs=True):
        """Score every column of ``X`` against ``y`` and record the selection.

        Parameters
        ----------
        X : pandas.DataFrame
            Candidate features, one per column.
        y : array-like
            Target values, aligned with the rows of ``X``.
        use_abs : bool, default True
            Rank by absolute correlation, so strong negative correlations
            count as much as strong positive ones.

        Returns
        -------
        self
            The fitted selector, so calls can be chained.
        """
        scores = {}
        for col in X.columns:
            corr = self.corr_fnc(X[col], y)[0, 1]
            # An undefined (nan) correlation is treated as no correlation
            scores[col] = 0 if np.isnan(corr) else corr

        if use_abs:
            scores = {col: abs(value) for col, value in scores.items()}

        # Ascending order; sorted() is stable, so ties keep column order
        ranked = sorted(scores, key=scores.get)

        # Guard n_features <= 0: ranked[-0:] would be the WHOLE list and
        # silently select every feature.
        if self.n_features > 0:
            selected = set(ranked[-self.n_features:])
        else:
            selected = set()

        self.support = [col in selected for col in X.columns]
        return self

    def get_support(self):
        """Return the boolean mask (one entry per column of the fitted ``X``)."""
        return self.support

In [14]:

class FeatureSelector():
    """Run a dictionary of selectors and produce a DataFrame vote report.

    Parameters
    ----------
    selectors : dict, optional
        Maps a selector name to ``{"model": selector, "params": {...}}``.
        ``model`` must expose ``fit(X, y, **params)`` and ``get_support()``;
        ``params`` are extra keyword arguments forwarded to ``fit``.
        Defaults to an empty dictionary.
    """

    def __init__(self, selectors=None):
        # None instead of a shared mutable default argument
        self.selectors = {} if selectors is None else selectors
        self.report = {}

    def fit(self, X, y, verbose=True):
        """Fit every selector on ``(X, y)`` and store its support mask.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix; its column names label the report rows.
        y : array-like
            Target values.
        verbose : bool, default True
            Print a banner before each selector is fitted.

        Returns
        -------
        self
        """
        # Start from a clean report so columns from an earlier fit cannot
        # linger with a mismatching length.
        self.report = {"Feature": X.columns.tolist()}

        # Iterate the selectors given to THIS instance, not a global variable
        for name, selector in self.selectors.items():
            if verbose:
                print('-' * 50)
                print(f'Selector: {name}')
                print('-' * 50)
            selector["model"].fit(X, y, **selector.get("params", {}))
            self.report[name] = selector["model"].get_support()

        return self

    def get_report(self,
                   sort=True):
        """Return the votes as a DataFrame.

        The frame has a ``Feature`` column, one boolean column per selector
        and a ``Total`` column counting the selectors that kept the feature.
        With ``sort=True`` rows are ordered by descending ``Total``.
        """
        df = pd.DataFrame(self.report)
        # Every column after "Feature" is a selector's boolean mask
        df['Total'] = df.iloc[:, 1:].sum(axis=1)
        if sort:
            df = df.sort_values(['Total'], ascending=False)

        return df

In [15]:
# let us define some models,

# One shared seed: the boosters below subsample rows and columns, so without
# a fixed seed the feature votes change from run to run.
SEED = 42

lr_params = {
    'max_iter': 1000,
    'random_state': SEED
}

# from optuna studies here https://www.kaggle.com/mlanhenke/tps-09-simple-blend-stacking-xgb-lgbm-catb

xgb_params = dict(
    eval_metric='auc',
    max_depth=3,
    subsample=0.5,
    colsample_bytree=0.5,
    learning_rate=0.01187431306013263,
    n_estimators=10000,
    n_jobs=-1,
    use_label_encoder=False,
    objective='binary:logistic',
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    random_state=SEED)

lgbm_params = {
    'metric' : 'auc',
    'objective' : 'binary',
    'device_type': 'gpu', 
    'n_estimators': 10000, 
    'learning_rate': 0.12230165751633416, 
    'num_leaves': 1400, 
    'max_depth': 8, 
    'min_child_samples': 3100, 
    'reg_alpha': 10, 
    'reg_lambda': 65, 
    'min_split_gain': 5.157818977461183, 
    'subsample': 0.5, 
    'subsample_freq': 1, 
    'colsample_bytree': 0.2,
    'n_jobs': 4,
    'random_state': SEED
}

catb_params = {
    'eval_metric' : 'AUC',
    'depth' : 5,
    'grow_policy' : 'SymmetricTree',
    'l2_leaf_reg' : 3.0,
    'random_strength' : 1.0,
    'learning_rate' : 0.1,
    'iterations' : 10000,
    'loss_function' : 'CrossEntropy',
    'task_type' : 'GPU',
    'devices' : '0',
    'verbose' : 0,
    'thread_count':4,
    'random_seed': SEED
}

# Unfitted estimators; the selectors built from them are fitted later
lr = LogisticRegression(**lr_params)
xgb = XGBClassifier(**xgb_params)
lgbm = LGBMClassifier(**lgbm_params)
cat = CatBoostClassifier(**catb_params)

In [16]:
# create some selectors
n_features = 100  # number of features each selector is asked for
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# correlation
corr_selector = CorrelationSelector(n_features=n_features)

# Recursive feature elimination: same settings for every estimator
_rfe_kwargs = dict(n_features_to_select=n_features, step=10, verbose=5)
rfe_lr_selector = RFE(estimator=lr, **_rfe_kwargs)
rfe_xgb_selector = RFE(estimator=xgb, **_rfe_kwargs)
rfe_lgbm_selector = RFE(estimator=lgbm, **_rfe_kwargs)
rfe_cat_selector = RFE(estimator=cat, **_rfe_kwargs)

# SelectFromModel: max_features caps how many features are kept; the
# default importance threshold still applies (see the scikit-learn docs)
from_model_lr_selector = SelectFromModel(lr, max_features=n_features)
from_model_xgb_selector = SelectFromModel(xgb, max_features=n_features)
from_model_lgbm_selector = SelectFromModel(lgbm, max_features=n_features)
from_model_cat_selector = SelectFromModel(cat, max_features=n_features)

# Example of fit-time params that could be supplied for a selector:
# "params":{'eval_set':[(valid_X, valid_y)], 'early_stopping_rounds': 300, 'verbose': 1000}},

# Order matters: it is the order in which the selectors are processed
_named_selectors = [
    ("corr_selector", corr_selector),
    ("rfe_xgb_selector", rfe_xgb_selector),
    ("from_model_xgb_selector", from_model_xgb_selector),
    ("rfe_lgbm_selector", rfe_lgbm_selector),
    ("from_model_lgbm_selector", from_model_lgbm_selector),
    ("rfe_cat_selector", rfe_cat_selector),
    ("from_model_cat_selector", from_model_cat_selector),
    ("rfe_lr_selector", rfe_lr_selector),
    ("from_model_lr_selector", from_model_lr_selector),
]

# Each entry pairs the selector with (here empty) extra fit() arguments
selectors = {
    name: {"model": model, "params": {}}
    for name, model in _named_selectors
}

In [17]:
# run the selection task
# Fit the configured selectors on the training split, then build the report:
# a 'Feature' column, one column per selector, and a 'Total' vote count
# (rows sorted by descending 'Total').
featruer_selector = FeatureSelector(selectors)
featruer_selector.fit(train_X, train_y)
report = featruer_selector.get_report()

--------------------------------------------------
Selector: corr_selector
--------------------------------------------------
--------------------------------------------------
Selector: rfe_xgb_selector
--------------------------------------------------
Fitting estimator with 120 features.
Fitting estimator with 110 features.
--------------------------------------------------
Selector: from_model_xgb_selector
--------------------------------------------------
--------------------------------------------------
Selector: rfe_lgbm_selector
--------------------------------------------------
Fitting estimator with 120 features.
Fitting estimator with 110 features.
--------------------------------------------------
Selector: from_model_lgbm_selector
--------------------------------------------------
--------------------------------------------------
Selector: rfe_cat_selector
--------------------------------------------------
Fitting estimator with 120 features.
Fitting estimator with 110 f

In [18]:
# First 20 rows of the report, i.e. the features with the most votes
report.head(20)

Unnamed: 0,Feature,corr_selector,rfe_xgb_selector,from_model_xgb_selector,rfe_lgbm_selector,from_model_lgbm_selector,rfe_cat_selector,from_model_cat_selector,rfe_lr_selector,from_model_lr_selector,Total
118,n_missing,True,True,True,True,False,True,True,True,False,7
64,f65,True,True,False,True,True,True,True,True,False,7
41,f42,True,True,False,True,True,True,True,True,False,7
46,f47,True,True,False,True,True,True,True,True,False,7
0,f1,True,True,False,True,True,True,True,False,False,6
53,f54,True,True,False,True,True,True,False,True,False,6
30,f31,True,True,False,True,True,True,False,True,False,6
31,f32,True,True,False,True,True,True,False,True,False,6
109,f110,True,True,False,True,True,True,False,True,False,6
34,f35,True,True,False,True,False,True,False,True,True,6


In [19]:
# Save the full report as report.csv in the current working directory
report.to_csv('report.csv')

In [20]:
# simple vote-thresholding
# Keep a feature only when at least `min_votes` selectors voted for it
min_votes = 2
has_enough_votes = report['Total'] >= min_votes
report.loc[has_enough_votes, 'Feature']

118    n_missing
64           f65
41           f42
46           f47
0             f1
         ...    
18           f19
57           f58
40           f41
58           f59
100         f101
Name: Feature, Length: 119, dtype: object