In [1]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

from sklearn.model_selection import train_test_split

# Variable Importance

Feature importance refers to a class of techniques that assign a score to each input feature of a predictive model, indicating the relative importance of that feature when making a prediction.

Feature importance scores can be calculated for problems that involve predicting a numerical value, called regression, and those problems that involve predicting a class label, called classification.

The scores are useful and can be used in a range of situations in a predictive modeling problem, such as:

    - Better understanding the data.
    - Better understanding a model.
    - Reducing the number of input features.
    


## Load Data

In [2]:
# Colab-only step: ask the user to upload a file and keep the result in
# `uploaded`. The following cell reads the 'mtcars.csv' entry from it.
# NOTE(review): this import only resolves inside Google Colab; outside Colab
# the data has to be loaded some other way.
from google.colab import files
uploaded = files.upload()

Saving mtcars.csv to mtcars.csv


In [3]:
import io

# `uploaded` is indexed by file name; wrap the uploaded content in an
# in-memory buffer so pandas can parse it as if it were a file on disk.
dat = pd.read_csv(
    filepath_or_buffer = io.BytesIO(uploaded['mtcars.csv']),
    sep = ",",
)

# Quick look at the first rows to confirm the load worked.
dat.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


Let's use only numeric variables as input and *mpg* as target.

In [4]:
# Inputs: every column except the target (`mpg`) and the text label (`model`).
X = dat.drop(columns = ['mpg', 'model'])

# Target: the `mpg` column.
y = dat['mpg']

Let's split the data into training and validation sets for the purposes of this notebook.

In [5]:
# Hold out 30% of the rows for validation; the fixed seed makes the split
# reproducible from one run to the next.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 1,
)


## Decision Trees Variable Importance

Decision trees have their own method to compute variable importance, based on the reduction in the criterion used to select split points, such as Gini or entropy for classification, or mean squared error for regression (the case in this notebook).

This same approach can be used for ensembles of decision trees, such as the random forest.

In [6]:
from sklearn.tree import DecisionTreeRegressor

In [7]:
# Build and fit a single regression tree on the training split
# (fixed seed so the fitted tree is reproducible).
model = DecisionTreeRegressor(random_state = 0).fit(X_train, y_train)

# Rank the variables: divide each impurity-based importance by the largest
# one so the top variable scores 1.0, then sort from most to least important.
importance = (
    pd.DataFrame({
        'variable' : X.columns.values,
        'imp' : model.feature_importances_ / np.max(model.feature_importances_),
    })
    .sort_values(by = 'imp', ascending = False)
)
importance

Unnamed: 0,variable,imp
4,wt,1.0
2,hp,0.5062278
5,qsec,0.02469548
1,disp,0.003689802
3,drat,0.0008806037
8,gear,0.0003929967
9,carb,4.964283e-16
0,cyl,0.0
6,vs,0.0
7,am,0.0


If we want to reduce our dataset to only the three most important input variables, we can do the following.

In [8]:
# `importance` is already sorted from most to least important, so the first
# `n_vars` rows hold the variables to keep.
n_vars = 3
important_vars = importance['variable'].head(n_vars).tolist()
important_vars

['wt', 'hp', 'qsec']

**Important note:** This means that these are the most important variables for predicting *mpg* **for this decision tree**. You cannot assume that this is a generic importance that could be applied to other models.

## Generic Variable Importance

Permutation feature importance is a technique for calculating relative importance scores that is independent of the model used.

First, a model, as basic and data-agnostic as possible, is fit on the dataset. Then the model is used to make predictions on a dataset in which the values of a single feature have been randomly shuffled, and the resulting drop in the model's score is recorded as that feature's importance. This is repeated for each feature in the dataset. Then this whole process is repeated 3, 5, 10 or more times. The result is a mean importance score for each input feature.

In [11]:
from sklearn.inspection import permutation_importance
# IPython help syntax: display the documentation of `permutation_importance`.
# This is only useful while exploring interactively and produces no value
# that later cells depend on.
?permutation_importance

In [12]:
from sklearn.linear_model import LinearRegression as model_constructor

In [13]:
# Build the model from the constructor imported above and fit it on the
# training split.
model = model_constructor().fit(X_train, y_train)

# Score each variable on the validation split by permuting it; the fixed
# seed keeps the permutations reproducible.
importance = permutation_importance(
    model,
    X_val,
    y_val,
    random_state = 1,
)

# Rank the variables: take the absolute mean importance, divide by the
# largest one so the top variable scores 1.0, then sort from most to least
# important.
importance = (
    pd.DataFrame({
        'variable' : X.columns.values,
        'imp' : np.abs(importance.importances_mean) / np.max(np.abs(importance.importances_mean)),
    })
    .sort_values(by = 'imp', ascending = False)
)
importance

Unnamed: 0,variable,imp
4,wt,1.0
2,hp,0.554354
9,carb,0.361234
8,gear,0.3127
1,disp,0.274731
3,drat,0.137531
6,vs,0.125779
5,qsec,0.115322
0,cyl,0.032733
7,am,0.030779


If we want to reduce our dataset to only the three most important input variables, we can do the following.

In [14]:
# The permutation-based ranking is sorted from most to least important, so
# the first `n_vars` entries are the variables to keep.
n_vars = 3
important_vars = importance['variable'].iloc[:n_vars].tolist()
important_vars

['wt', 'hp', 'carb']

Different! Permutation importance selects `carb` instead of `qsec` as the third variable. This is a more generic variable importance.

## Define Custom Function

In [19]:
def important_vars(X_train, y_train, model_constructor,
                   X_val = None, y_val = None,
                   method = 'generic', p = 0.5):
    """Keep only the most important input variables of a dataset.

    The variables are ranked either by permutation importance
    (``method='generic'``) or by the importances of a decision tree
    (``method='tree'``), and the top fraction ``p`` of them is kept.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training inputs. Its columns are the candidate variables.
    y_train : array-like
        Training target.
    model_constructor : callable or None
        Zero-argument callable returning an unfitted estimator. Only used
        when ``method='generic'``; it may be ``None`` for ``method='tree'``.
    X_val : pandas.DataFrame, optional
        Validation inputs. Required when ``method='generic'``. When given,
        it is reduced to the same columns as ``X_train``.
    y_val : array-like, optional
        Validation target. Required when ``method='generic'``.
    method : {'generic', 'tree'}, default 'generic'
        ``'generic'`` fits ``model_constructor()`` on the training data and
        ranks variables by the absolute mean permutation importance measured
        on the validation data. ``'tree'`` fits a ``DecisionTreeRegressor``
        on the training data and ranks by its ``feature_importances_``.
    p : float, default 0.5
        Fraction of variables to keep; ``int(n_columns * p)`` columns are
        selected (truncated towards zero).

    Returns
    -------
    tuple
        ``(X_train, X_val)`` restricted to the selected columns. ``X_val``
        is returned as ``None`` when it was not provided.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values, or if
        ``method='generic'`` is requested without validation data.
    """
    # Validate the request up front so a bad call fails with a clear message
    # before any model is built or fitted.
    if method not in ('generic', 'tree'):
        raise ValueError(
            "method must be 'generic' or 'tree', got {!r}".format(method)
        )
    if method == 'generic' and (X_val is None or y_val is None):
        raise ValueError(
            "method='generic' needs both X_val and y_val to compute "
            "permutation importance"
        )

    if method == 'generic':
        # Fit the caller-supplied model on the training data.
        model = model_constructor()
        model.fit(X_train, y_train)

        # Permutation importance is measured on the validation data; the
        # fixed seed keeps the permutations reproducible.
        perm_result = permutation_importance(model, X_val, y_val, random_state = 1)
        scores = np.abs(perm_result.importances_mean)
    else:
        # Fit a single regression tree (fixed seed) and use its own
        # importances.
        model = DecisionTreeRegressor(random_state = 0)
        model.fit(X_train, y_train)
        scores = model.feature_importances_

    # Scale so the top variable scores 1.0. Skip the division when every
    # score is zero, which would otherwise turn the whole column into NaN.
    top_score = np.max(scores)
    if top_score > 0:
        scores = scores / top_score

    # Take the variable names from the frame that was passed in, not from a
    # notebook-level global, so the function works on any dataset.
    importance = pd.DataFrame({'variable' : X_train.columns.values,
                               'imp' : scores}).sort_values(by = 'imp', ascending = False)

    # Keep the requested fraction of the ranked variables.
    n_keep = int(X_train.shape[1] * p)
    selected = importance.variable.values[0:n_keep].tolist()

    # Apply the same column selection to both splits.
    X_train = X_train[selected]
    if X_val is not None:
        X_val = X_val[selected]

    return X_train, X_val

In [20]:
from sklearn.linear_model import LinearRegression as model_constructor

In [21]:
# Keep half of the variables (p = 0.5), ranked by permutation importance
# computed with the model built by `model_constructor`.
X_train_new, X_val_new = important_vars(
    X_train,
    y_train,
    model_constructor = model_constructor,
    X_val = X_val,
    y_val = y_val,
    method = 'generic',
    p = 0.5,
)
X_train_new

Unnamed: 0,wt,hp,carb,gear,disp
24,3.845,175,2,3,400.0
10,3.44,123,4,4,167.6
20,2.465,97,1,3,120.1
26,2.14,91,2,5,120.3
4,3.44,175,2,3,360.0
2,2.32,93,1,4,108.0
25,1.935,66,1,4,79.0
6,3.57,245,4,3,360.0
13,3.78,180,3,3,275.8
7,3.19,62,2,4,146.7


In [22]:
# Keep half of the variables (p = 0.5), ranked by decision-tree importance.
# The 'tree' method builds its own DecisionTreeRegressor, so no model
# constructor is passed.
X_train_new, X_val_new = important_vars(
    X_train,
    y_train,
    model_constructor = None,
    X_val = X_val,
    y_val = y_val,
    method = 'tree',
    p = 0.5,
)
X_train_new

Unnamed: 0,wt,hp,qsec,disp,drat
24,3.845,175,17.05,400.0,3.08
10,3.44,123,18.9,167.6,3.92
20,2.465,97,20.01,120.1,3.7
26,2.14,91,16.7,120.3,4.43
4,3.44,175,17.02,360.0,3.15
2,2.32,93,18.61,108.0,3.85
25,1.935,66,18.9,79.0,4.08
6,3.57,245,15.84,360.0,3.21
13,3.78,180,18.0,275.8,3.07
7,3.19,62,20.0,146.7,3.69
