Instalando e testando o Boruta

In [None]:
# Install the boruta package into the running kernel's environment (no version is pinned here).
%pip install boruta

In [None]:
# https://github.com/gabrielhpr/FeatureSelectionWithBoruta/blob/master/Feature_selection_with_boruta.ipynb

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import binom
from IPython.display         import Image
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [None]:
# Load the behaviour sample from the CSV file in the sibling "bases" directory.
sample_behavior = pd.read_csv(filepath_or_buffer="../bases/AmostraBehavior.csv")

In [None]:
# Every column from the third one onward is used as a covariate.
# NOTE(review): assumes "Perf_final" is one of the first two columns; if it is
# not, the target would also be present in X -- confirm against the CSV.
labels = sample_behavior.columns[2:]

X = sample_behavior[labels]  # covariates
y = sample_behavior["Perf_final"]  # target

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=999,
)

# Boruta

O algoritmo Boruta é um Wrapper Method, que realiza várias manipulações e combinações entre as features para gerar os resultados de interesse.
O Boruta utiliza features sombra (shadow features) e a distribuição binomial para realizar a seleção de features. Cada feature é comparada com uma versão aleatória de si mesma: o banco de dados é duplicado e os valores das colunas duplicadas são embaralhados, criando assim uma variável sombra para cada coluna.

Em seguida, é verificado se cada variável observada tem uma importância maior que a das variáveis sombra; quando isso ocorre, chamamos de acerto.
As iterações continuam acumulando acertos até que cada variável seja confirmada ou rejeitada, com o intuito de remover as variáveis sem relação relevante com a variável resposta.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# Plain 1-D numpy view of the training target.
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same array.
# NOTE(review): this rebinds the module-level `y`, which previously held the
# full target column; re-running the train/test split cell after this one
# would therefore split against the training-only array.
y = y_train.to_numpy()

In [None]:
# Base estimator handed to Boruta: a random forest that runs on all cores
# (n_jobs=-1), weights classes with class_weight="balanced" and builds each
# tree from a bootstrap sample.
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
)

- class_weight: {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y.
- n_jobs: The number of jobs to run in parallel. fit, predict, decision_path and apply are all parallelized over the trees. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.

- bootstrap: default=True
Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

In [None]:
# Wrap the forest in a Boruta selector; n_estimators="auto" and a fixed
# random_state are passed straight to BorutaPy.
feat_selector = BorutaPy(
    rf,
    n_estimators="auto",
    random_state=1,
)

- n_estimators: The number of trees to be used in the forest. With "auto", BorutaPy determines the number automatically based on the size of the dataset.

- random_state: int, RandomState instance or None, default=None
Controls both the randomness of the bootstrapping of the samples used when building trees (if bootstrap=True) and the sampling of the features to consider when looking for the best split at each node (if max_features < n_features). See Glossary for details.

In [None]:
# Run Boruta on the training split.
# Both X and y are handed over as numpy arrays: the BorutaPy README documents
# numpy-array inputs only, and X was already being converted with .values.
result_boruta = feat_selector.fit(X_train.values, y_train.to_numpy())

In [None]:
# Display the object returned by feat_selector.fit(...).
result_boruta

In [None]:
# Inspect the outcome of the selection.
# support_ marks the columns Boruta kept; support_weak_ marks the columns it
# left undecided. Both are used to index the columns of X_train.
accept = X_train.columns[feat_selector.support_].to_list()
irresolution = X_train.columns[feat_selector.support_weak_].to_list()

# Each list is printed with its own label so undecided features are not
# mistaken for selected ones under the "Selecionadas:" header.
print("Selecionadas:")
print("************************")
print("Confirmadas (support_):", accept)
print("Indecisas (support_weak_):", irresolution)
print("************************")

In [None]:
# Show support_, the array this notebook uses to index X_train.columns when
# listing the selected features.
result_boruta.support_

In [None]:
# Show ranking_, which this notebook pairs element-by-element with
# X_train.columns when building the per-feature rank table.
result_boruta.ranking_

In [None]:
# One (column name, rank, keep flag) triple per column of X_train.
feature_ranks = list(
    zip(X_train.columns, result_boruta.ranking_, result_boruta.support_)
)

# Print the triples one per line.
for feat in feature_ranks:
    print('Feature: {} Rank: {},  Manter: {}'.format(*feat))

### Teste 01: Todas as Variáveis

In [None]:
# Baseline: a forest with the same configuration, trained on every covariate.
# random_state pins the forest (same seed value as the train/test split) so the
# accuracy shown below is repeatable from one run to the next.
rf1 = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
    random_state=999,
)
rf1.fit(X_train, y_train)
y_test_pred1 = rf1.predict(X_test)
accuracy_score(y_test, y_test_pred1)

### Teste 02: Variáveis em support_

In [None]:
# Train only on the columns flagged by support_.
# The list is derived from the fitted selector rather than copied by hand, so
# this test cannot drift from the Boruta result when the data or seed changes.
cols_select = X_train.columns[feat_selector.support_].to_list()

# random_state pins the forest (same seed value as the train/test split) so the
# accuracy shown below is repeatable from one run to the next.
rf2 = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
    random_state=999,
)
rf2.fit(X_train[cols_select], y_train)
y_test_pred2 = rf2.predict(X_test[cols_select])
accuracy_score(y_test, y_test_pred2)

### Teste 03: Variáveis em support_weak_

In [None]:
# Train on the columns flagged by support_ followed by those flagged by
# support_weak_. Both lists are derived from the fitted selector rather than
# copied by hand, so this test cannot drift from the Boruta result.
cols_select = (
    X_train.columns[feat_selector.support_].to_list()
    + X_train.columns[feat_selector.support_weak_].to_list()
)

# NOTE(review): rf2 and y_test_pred2 are rebound here, replacing the objects
# from the support_-only test; the names are kept so existing references work.
# random_state pins the forest (same seed value as the train/test split) so the
# accuracy shown below is repeatable from one run to the next.
rf2 = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    bootstrap=True,
    random_state=999,
)
rf2.fit(X_train[cols_select], y_train)
y_test_pred2 = rf2.predict(X_test[cols_select])
accuracy_score(y_test, y_test_pred2)