# Feature Selection

In [29]:
import numpy as np

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

# [ I ] Univariate Feature Selection

<b><i> Conservation de 50% des features les plus significativement liés à la variable cible </i></b>

In [None]:
from sklearn.feature_selection import SelectPercentile

In [2]:
cancer = load_breast_cancer()

### Ajout de bruit

In [4]:
rng = np.random.RandomState( 42 )

In [6]:
noise = rng.normal( size = ( len( cancer.data ) , 50 ) )

In [11]:
X_w_noise = np.hstack( [ cancer.data , noise ] )

### Train / test split

In [12]:
X_train , X_test , y_train , y_test = train_test_split( X_w_noise , cancer.target , random_state = 0 , test_size = 0.5 )

### Feature selection

In [14]:
select = SelectPercentile( percentile = 50 )

select.fit( X_train , y_train )

X_train_selected = select.transform( X_train )

In [53]:
X_train.shape

(284, 80)

In [54]:
X_train_selected.shape

(284, 40)

### Features selectionnés

In [17]:
select.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False,  True, False,  True,
       False, False,  True, False, False, False, False,  True, False,
       False,  True, False, False,  True, False,  True, False, False,
       False, False, False, False,  True, False,  True, False, False,
       False, False,  True, False,  True, False, False, False, False,
        True,  True, False,  True, False, False, False, False])

### Performance du modèle sur ( X_train , X_test ) vs.( X_train_selected , X_test_selected )

In [48]:
mdl_LR = LogisticRegression()

In [49]:
mdl_LR.fit( X_train , y_train )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
mdl_LR.score( X_test , y_test )

0.9298245614035088

In [51]:
mdl_LR.fit( X_train_selected , y_train )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [52]:
X_test_selected = select.transform( X_test )

mdl_LR.score( X_test_selected , y_test )

0.3543859649122807

# [ II ] Model_based feature selection

In [19]:
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier

### Feature selection

In [55]:
# threshold = 'median' : la moitié des features seront sélectionnés

select = SelectFromModel( RandomForestClassifier( n_estimators = 100 , random_state = 42 ) , threshold = 'median' )

select.fit( X_train , y_train )

X_train_selected = select.transform( X_train )

In [56]:
X_train.shape

(284, 80)

In [57]:
X_train_selected.shape

(284, 40)

### Features sélectionnés

In [58]:
select.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False, False,  True, False,
       False, False, False, False,  True, False, False,  True, False,
        True,  True, False, False, False,  True, False, False,  True,
        True, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
       False,  True, False, False, False, False, False, False])

### Performance du modèle sur ( X_train , X_test ) vs.( X_train_selected , X_test_selected )

In [59]:
mdl_LR = LogisticRegression()

In [60]:
mdl_LR.fit( X_train , y_train )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [61]:
mdl_LR.score( X_test , y_test )

0.9298245614035088

In [62]:
mdl_LR.fit( X_train_selected , y_train )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [63]:
X_test_selected = select.transform( X_test )

mdl_LR.score( X_test_selected , y_test )

0.9508771929824561

# [ III ] Iterative Feature Selection

In [76]:
from sklearn.feature_selection import RFE

### Feature Selection

In [77]:
select = RFE( RandomForestClassifier( n_estimators = 100 , random_state = 42 ) , n_features_to_select = 40 )

select.fit( X_train , y_train )

X_train_selected = select.transform( X_train )

In [78]:
X_train.shape

(284, 80)

In [79]:
X_train_selected.shape

(284, 40)

### Features sélectionnés

In [80]:
select.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False,  True, False, False,  True,
        True, False, False, False,  True, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False,  True, False, False, False,  True, False, False])

### Performance du modèle sur ( X_train , X_test ) vs.( X_train_selected , X_test_selected )

In [81]:
mdl_LR = LogisticRegression()

In [82]:
mdl_LR.fit( X_train , y_train )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [83]:
mdl_LR.score( X_test , y_test )

0.9298245614035088

In [84]:
mdl_LR.fit( X_train_selected , y_train )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [85]:
X_test_selected = select.transform( X_test )

mdl_LR.score( X_test_selected , y_test )

0.9508771929824561