# Feature Selection Using Permutation Importances

In [1]:
import pandas as pd

# Load the mushroom dataset from a CSV file in the working directory.
DATA_PATH = 'mushrooms_mapped.csv'
df = pd.read_csv(DATA_PATH)

In [2]:
# Carve the data into three partitions: train, val, and test.
# No test_size/train_size is passed, so both splits use the library defaults.
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42

# Hold out the test set first, then split the remainder into train / val.
train_val, test = train_test_split(df, random_state=SPLIT_SEED)
train, val = train_test_split(train_val, random_state=SPLIT_SEED)

# Show the (rows, columns) shape of each partition.
tuple(part.shape for part in (train, val, test))

((4569, 24), (1524, 24), (2031, 24))

In [3]:
# Label column, and the columns kept out of the feature matrix.
# NOTE(review): 'Unnamed: 0' looks like a saved index column — confirm.
target = 'class'
excluded_columns = [target, 'Unnamed: 0', 'veil_type']
features = train.columns.drop(excluded_columns)

# Feature matrix and label vector for the train and validation partitions.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]

In [6]:
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

# Preprocessing is a single-step pipeline: ordinal-encode the features.
transformers = make_pipeline(ce.OrdinalEncoder())

# Fit the encoder on the training features, then reuse it on validation.
X_train_transformed = transformers.fit_transform(X_train, y_train)
X_val_transformed = transformers.transform(X_val)

# Seeded random forest of 100 trees; n_jobs=-1 is passed through unchanged.
forest_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**forest_params)
model.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [7]:
import eli5
from eli5.sklearn import PermutationImportance

# Wrap the forest fitted above; importances are scored with accuracy,
# using n_iter=5 and a fixed seed so the result is repeatable.
permuter = PermutationImportance(model, scoring='accuracy', n_iter=5, random_state=42)

# Compute the importances on the encoded validation split.
permuter.fit(X_val_transformed, y_val)

PermutationImportance(cv='prefit',
                      estimator=RandomForestClassifier(bootstrap=True,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=100,
                                                     

In [9]:
# Column labels of the validation features, used to label the importances.
feature_names = list(X_val.columns)

In [10]:
# top=None: show permutation importances for all features.
eli5.show_weights(permuter, feature_names=feature_names, top=None)

Weight,Feature
0.0759  ± 0.0062,odor
0.0173  ± 0.0090,gill_size
0.0146  ± 0.0035,spore_print_color
0  ± 0.0000,habitat
0  ± 0.0000,stalk_shape
0  ± 0.0000,cap_surface
0  ± 0.0000,cap_color
0  ± 0.0000,bruises
0  ± 0.0000,gill_attachment
0  ± 0.0000,gill_spacing


## Without `odor`, the Model Depends on More Than Three Features

In [11]:
# Rebuild the feature set, this time also leaving out 'odor'.
excluded_columns = [target, 'Unnamed: 0', 'veil_type', 'odor']
features = train.columns.drop(excluded_columns)

# Feature matrix and label vector for the train and validation partitions.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]

In [12]:
# Preprocessing is a single-step pipeline: ordinal-encode the features.
transformers = make_pipeline(ce.OrdinalEncoder())

# Fit the encoder on the training features, then reuse it on validation.
X_train_transformed = transformers.fit_transform(X_train, y_train)
X_val_transformed = transformers.transform(X_val)

# Seeded random forest of 100 trees; n_jobs=-1 is passed through unchanged.
forest_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**forest_params)
model.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [13]:
# Wrap the refitted forest; importances are scored with accuracy,
# using n_iter=5 and a fixed seed so the result is repeatable.
permuter = PermutationImportance(model, scoring='accuracy', n_iter=5, random_state=42)

# Compute the importances on the encoded validation split.
permuter.fit(X_val_transformed, y_val)

PermutationImportance(cv='prefit',
                      estimator=RandomForestClassifier(bootstrap=True,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=100,
                                                     

In [14]:
# Column labels of the validation features, used to label the importances.
feature_names = list(X_val.columns)

In [15]:
# top=None: show permutation importances for all features.
eli5.show_weights(permuter, feature_names=feature_names, top=None)

Weight,Feature
0.0786  ± 0.0030,gill_size
0.0409  ± 0.0070,spore_print_color
0.0024  ± 0.0013,cap_surface
0.0018  ± 0.0005,gill_spacing
0.0016  ± 0.0013,bruises
0.0009  ± 0.0006,habitat
0.0005  ± 0.0010,stalk_root
0.0005  ± 0.0005,stalk_shape
0.0001  ± 0.0005,ring_type
0  ± 0.0000,stalk_color_above_ring


In [17]:
# Keep only the features whose permutation importance is strictly above the threshold.
minimum_importance = 0
importances = permuter.feature_importances_
mask = importances > minimum_importance

# Names of the surviving columns, in their original column order.
list(X_train.columns[mask])

['cap_surface',
 'bruises',
 'gill_spacing',
 'gill_size',
 'stalk_shape',
 'stalk_root',
 'ring_type',
 'spore_print_color',
 'habitat']