Instalando e testando o Boruta

In [None]:
# Install the `boruta` package (source of the BorutaPy import used below) into the kernel's environment.
# NOTE(review): the version is not pinned, so a fresh install may behave differently — consider pinning it.
%pip install boruta

In [None]:
# https://github.com/gabrielhpr/FeatureSelectionWithBoruta/blob/master/Feature_selection_with_boruta.ipynb

In [32]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import binom
from IPython.display         import Image
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [54]:
# Read the behaviour sample from the CSV stored next to the notebook tree
# and show it as the cell output.
sample_behavior = pd.read_csv(filepath_or_buffer="../bases/AmostraBehavior.csv")
sample_behavior

Unnamed: 0,Id,Perf_final,UltPercLimit,Idade,N_Atraso30_59Dias,RazaoGastos,RendaMensal,N_EmeprestimosAbertos,N_atrasos_Ult90Dias,N_emprestimos,...,CAT_lnRazaoGastos,WOE_N_atrasos_Ult90Dias,WOE_N_emprestimos,WOE_N_Atraso60_89Dias,WOE_N_dependentes,WOE_CAT_UltPercLimit,WOE_CAT_Idade,WOE_CAT_RazaoGastos,WOE_CAT_lnRazaoGastos,WOE_CAT_N_EmeprestimosAbertos
0,18744,1,1.007054,50,3,0.141472,3300.0,4,0,5,...,2,0.396398,-0.249568,-2.741548,-0.296127,-1.251949,-0.039102,0.198561,0.198561,-0.360182
1,106506,0,0.014666,56,0,0.293676,5454.0,5,0,2,...,3,0.396398,0.199208,0.290109,-0.193107,1.270830,-0.039102,0.081338,0.081338,0.159293
2,133947,0,0.597323,42,0,0.138962,25000.0,11,0,2,...,2,0.396398,0.199208,0.290109,-0.296127,-0.034538,-0.254073,0.198561,0.198561,0.129020
3,81867,0,0.025595,50,0,0.342331,8520.0,12,0,2,...,3,0.396398,0.199208,0.290109,0.147208,1.478014,-0.039102,0.081338,0.081338,0.129020
4,108256,0,0.105306,80,0,0.143105,5764.0,14,0,5,...,2,0.396398,-0.249568,0.290109,-0.091080,0.875943,1.020199,0.198561,0.198561,-0.000105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7346,1576,0,0.000054,41,0,0.108462,5577.0,7,0,5,...,1,0.396398,-0.249568,0.290109,-0.091080,1.270830,-0.254073,0.062296,0.062296,0.159293
7347,28740,0,0.102461,66,0,0.686345,4166.0,18,0,2,...,4,0.396398,0.199208,0.290109,-0.193107,0.875943,1.020199,-0.423260,-0.423260,-0.000105
7348,147913,1,0.835482,46,1,0.291097,4166.0,6,0,5,...,3,0.396398,-0.249568,-1.859418,-0.193107,-1.251949,-0.254073,0.081338,0.081338,0.159293
7349,110681,1,0.006804,79,2,0.003998,2500.0,12,0,5,...,1,0.396398,-0.249568,-1.859418,0.147208,1.270830,1.020199,0.062296,0.062296,0.129020


In [34]:
# Every column from position 2 onward is treated as a covariate.
# NOTE(review): assumes the first two columns are `Id` and `Perf_final` — confirm against the CSV.
labels = sample_behavior.columns[2:]

X = sample_behavior.loc[:, labels]  # covariates
y = sample_behavior["Perf_final"]  # target

In [35]:
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=999,
)

# Boruta

O algoritmo Boruta é um Wrapper Method, que realiza várias manipulações e combinações entre as features para gerar os resultados de interesse.
O Boruta utiliza variáveis sombra e a distribuição binomial para realizar a seleção de features. Cada feature é comparada com versões aleatórias de si mesma: o banco de dados é duplicado e os valores de cada coluna duplicada são embaralhados, criando assim uma variável sombra para cada coluna.

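Em seguida, verifica-se se cada variável observada tem importância maior que a das variáveis sombra; quando isso ocorre, chamamos de acerto.
O processo é repetido por várias iterações: o embaralhamento remove a correlação das variáveis sombra com a variável resposta, e o número de acertos de cada variável é avaliado por um teste binomial para decidir se ela é confirmada ou rejeitada.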

In [37]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# 1-D NumPy view of the training target, the array form BorutaPy's examples use.
# `Series.ravel()` is deprecated in recent pandas releases; `to_numpy()` is the
# supported way to obtain the underlying array (a Series is already 1-D).
# NOTE: this rebinds `y` to the training target only, so the cell that builds
# X / y must be re-run before the train/test split cell is executed again.
y = y_train.to_numpy()

In [38]:
# Base estimator handed to Boruta: a random forest that uses every CPU core
# (n_jobs=-1), reweights classes with the "balanced" scheme and draws
# bootstrap samples for each tree.
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
)

- class_weight: {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y.
- n_jobs: The number of jobs to run in parallel. fit, predict, decision_path and apply are all parallelized over the trees. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.

- bootstrap: default=True
Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

In [39]:
# Boruta selector wrapped around the forest above; the tree count is left to
# BorutaPy ("auto") and the seed is fixed so the selection can be repeated.
feat_selector = BorutaPy(
    rf,
    n_estimators="auto",
    random_state=1,
)

- n_estimators: the number of trees to be used in the forest. With "auto", BorutaPy determines this number automatically based on the size of the dataset.

- random_state: int, RandomState instance or None, default=None
Controls both the randomness of the bootstrapping of the samples used when building trees (if bootstrap=True) and the sampling of the features to consider when looking for the best split at each node (if max_features < n_features). See Glossary for details.

In [40]:
# Run the Boruta selection on the training split only.
# Both inputs are converted to plain NumPy arrays, matching BorutaPy's
# documented usage; the target was previously passed as a pandas Series with a
# shuffled index left over from the split.
result_boruta = feat_selector.fit(X_train.to_numpy(), y_train.to_numpy())

In [41]:
# Display the object returned by `feat_selector.fit(...)`.
result_boruta

In [42]:
# Boolean mask with one entry per column of X_train (True = feature kept by Boruta).
feat_selector.support_

array([ True, False,  True, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False])

In [45]:
# Names of the columns flagged in `support_`.
X_train.columns[feat_selector.support_].tolist()

['UltPercLimit', 'N_Atraso30_59Dias', 'lnRazaoGastos', 'WOE_CAT_UltPercLimit']

In [46]:
# Names of the columns flagged in `support_weak_`.
X_train.columns[feat_selector.support_weak_].tolist()

['RendaMensal']

In [47]:
# Show the features Boruta kept: first the ones in `support_`, then the ones
# in `support_weak_`, framed by separator lines.
accept = X_train.columns[feat_selector.support_].to_list()
irresolution = X_train.columns[feat_selector.support_weak_].to_list()

separator = "************************"
print("Selecionadas:", separator, accept, irresolution, separator, sep="\n")

Selecionadas:
************************
['UltPercLimit', 'N_Atraso30_59Dias', 'lnRazaoGastos', 'WOE_CAT_UltPercLimit']
['RendaMensal']
************************


In [None]:
# `support_`: boolean mask over the columns of X_train, read here from the
# object returned by `fit` (True = feature kept by Boruta).
result_boruta.support_

In [None]:
# `ranking_`: one integer rank per column of X_train; in the output printed
# further down, the kept features carry rank 1.
result_boruta.ranking_

In [49]:
# Pair each candidate column with its Boruta rank and its keep/drop flag,
# then print one line per feature.
feature_ranks = [
    (column, rank, keep)
    for column, rank, keep in zip(
        X_train.columns,
        result_boruta.ranking_,
        result_boruta.support_,
    )
]

for column, rank, keep in feature_ranks:
    print(f'Feature: {column} Rank: {rank},  Manter: {keep}')

Feature: UltPercLimit Rank: 1,  Manter: True
Feature: Idade Rank: 3,  Manter: False
Feature: N_Atraso30_59Dias Rank: 1,  Manter: True
Feature: RazaoGastos Rank: 6,  Manter: False
Feature: RendaMensal Rank: 2,  Manter: False
Feature: N_EmeprestimosAbertos Rank: 10,  Manter: False
Feature: N_atrasos_Ult90Dias Rank: 4,  Manter: False
Feature: N_emprestimos Rank: 19,  Manter: False
Feature: N_Atraso60_89Dias Rank: 6,  Manter: False
Feature: N_dependentes Rank: 15,  Manter: False
Feature: lnRazaoGastos Rank: 1,  Manter: True
Feature: SqrtRazaoGastos Rank: 9,  Manter: False
Feature: CAT_UltPercLimit Rank: 3,  Manter: False
Feature: CAT_Idade Rank: 12,  Manter: False
Feature: CAT_RazaoGastos Rank: 22,  Manter: False
Feature: CAT_RendaMensal Rank: 16,  Manter: False
Feature: CAT_N_EmeprestimosAbertos Rank: 17,  Manter: False
Feature: CAT_lnRazaoGastos Rank: 23,  Manter: False
Feature: WOE_N_atrasos_Ult90Dias Rank: 6,  Manter: False
Feature: WOE_N_emprestimos Rank: 18,  Manter: False
Feature: W

### Teste 01: Todas as Variáveis

In [50]:
# Test 01 — baseline: random forest trained on every covariate and scored on
# the hold-out split.
# The forest is seeded (same value as the train/test split) so the reported
# accuracy is reproducible; without a seed it changes on every execution.
rf1 = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
    random_state=999,
)
rf1.fit(X_train, y_train)
y_test_pred1 = rf1.predict(X_test)
accuracy_score(y_test, y_test_pred1)

0.9356300997280145

### Teste 02: Variáveis em support_

In [52]:
# Test 02 — random forest trained only on the features in `support_`.
# The column list is read from the fitted selector instead of being typed by
# hand, so it cannot go stale when the Boruta step is re-run.
cols_select = X_train.columns[feat_selector.support_].to_list()

# Seeded (same value as the train/test split) so the accuracy is reproducible
# and comparable with the other tests.
rf2 = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
    random_state=999,
)
rf2.fit(X_train[cols_select], y_train)
y_test_pred2 = rf2.predict(X_test[cols_select])
accuracy_score(y_test, y_test_pred2)

0.9233907524932004

### Teste 03: Variáveis em support_ + support_weak_

In [53]:
# Test 03 — random forest trained on the features in `support_` plus the
# ones in `support_weak_`.
# The columns are read from the fitted selector (kept first, weak ones appended)
# instead of a hand-typed list, and this test uses its own names so the model
# and predictions of Test 02 are not overwritten.
cols_select_weak = (
    X_train.columns[feat_selector.support_].to_list()
    + X_train.columns[feat_selector.support_weak_].to_list()
)

# Seeded (same value as the train/test split) so the accuracy is reproducible
# and comparable with the other tests.
rf3 = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
    random_state=999,
)
rf3.fit(X_train[cols_select_weak], y_train)
y_test_pred3 = rf3.predict(X_test[cols_select_weak])
accuracy_score(y_test, y_test_pred3)

0.9347234814143246