In [3]:
## For data handling
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
from seaborn import set_style

## This sets the plot style
## to have a grid on a white background
set_style("whitegrid")

## Random forest for feature importances

Random forests can also provide feature importance scores. 

The `sklearn` algorithm measures importance in the following way. For each feature it looks at every tree and identifies the nodes using that feature to make a cut. It then measures how much those cuts reduced impurity and averages that value over all the trees in the forest. After getting the average impurity reduction for each feature, `sklearn` scales the results so that the sum of all feature importances is equal to $1$.

We will demonstrate this on the `iris` data set.

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [5]:
# load the data
data_test = pd.read_csv('SVM/midFeaturesTrainFinalWithChars.csv')

data_test

Unnamed: 0,FileID,actorID,Emotion,SentenceID,Age,Sex,Race,Ethnicity,zcr_mean,energy_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
0,1001_DFA_ANG_XX,1001,ANG,DFA,51,Male,Caucasian,Not Hispanic,0.159956,0.012981,...,0.024107,0.014803,0.017961,0.013412,0.008655,0.010352,0.009738,0.010600,0.004328,0.009167
1,1001_DFA_DIS_XX,1001,DIS,DFA,51,Male,Caucasian,Not Hispanic,0.175069,0.006502,...,0.022395,0.015510,0.008768,0.014533,0.009661,0.002533,0.004223,0.007513,0.003662,0.007296
2,1001_DFA_FEA_XX,1001,FEA,DFA,51,Male,Caucasian,Not Hispanic,0.199849,0.016796,...,0.007043,0.003129,0.006915,0.007791,0.013899,0.005247,0.003474,0.014306,0.005781,0.008250
3,1001_DFA_HAP_XX,1001,HAP,DFA,51,Male,Caucasian,Not Hispanic,0.148663,0.007960,...,0.021737,0.005675,0.009277,0.026797,0.010147,0.010658,0.017229,0.013203,0.010011,0.007488
4,1001_DFA_NEU_XX,1001,NEU,DFA,51,Male,Caucasian,Not Hispanic,0.174283,0.010704,...,0.031970,0.012929,0.017969,0.037496,0.013379,0.008354,0.005615,0.008907,0.007483,0.013592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4871,1091_WSI_DIS_XX,1091,DIS,WSI,29,Female,Asian,Not Hispanic,0.115959,0.008173,...,0.026035,0.012841,0.008842,0.007892,0.005362,0.017915,0.016780,0.020343,0.015986,0.008793
4872,1091_WSI_FEA_XX,1091,FEA,WSI,29,Female,Asian,Not Hispanic,0.222025,0.007925,...,0.017277,0.025758,0.029299,0.006912,0.001970,0.006664,0.012525,0.030749,0.019316,0.012804
4873,1091_WSI_HAP_XX,1091,HAP,WSI,29,Female,Asian,Not Hispanic,0.139819,0.017985,...,0.015223,0.017986,0.012653,0.018009,0.012961,0.006186,0.015287,0.032246,0.015019,0.010603
4874,1091_WSI_NEU_XX,1091,NEU,WSI,29,Female,Asian,Not Hispanic,0.114266,0.004263,...,0.003359,0.017843,0.018368,0.020915,0.010360,0.021104,0.021054,0.024364,0.011053,0.008904


In [13]:
iris = load_iris(as_frame=True)

X = iris['data']
X = X.rename(columns={'sepal length (cm)':'sepal_length',
                         'sepal width (cm)':'sepal_width',
                         'petal length (cm)':'petal_length',
                         'petal width (cm)':'petal_width'})
y = iris['target']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X.copy(),y.copy(),
                                                       shuffle=True,
                                                       random_state=153,
                                                       stratify=y,
                                                       test_size=.2)

In [17]:
forest = RandomForestClassifier(n_estimators=500, 
                                max_depth=4,
                                random_state = 8973489)

forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=4, n_estimators=500, random_state=8973489)

The `sklearn` scaled impurity reduction can be found with `feature_importances_`.

In [16]:
forest.feature_importances_

array([0.09352778, 0.01868163, 0.45875146, 0.42903913])

We can make it a little more readable with a dataframe.

In [18]:
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': forest.feature_importances_})

score_df.sort_values('importance_score',ascending=False)

Unnamed: 0,feature,importance_score
2,petal_length,0.458751
3,petal_width,0.429039
0,sepal_length,0.093528
1,sepal_width,0.018682


This is a nice feature of random forests, it allows us to understand what variables are most important, which can help us explain the algorithm. It is also useful as another method for feature selection.

##### Extra Trees

Extra trees classifiers also has the ability to be used for feature importance scores.

In [18]:
et = ExtraTreesClassifier(n_estimators=500, 
                          max_depth=4,
                         random_state =38383)

et.fit(X_train, y_train)

ExtraTreesClassifier(max_depth=4, n_estimators=500, random_state=38383)

In [19]:
et_score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': et.feature_importances_})

et_score_df.sort_values('importance_score',ascending=False)

Unnamed: 0,feature,importance_score
2,petal_length,0.439881
3,petal_width,0.40762
0,sepal_length,0.102791
1,sepal_width,0.049709


--------------------------

This notebook was written for the Erd&#337;s Institute C&#337;de Data Science Boot Camp by Matthew Osborne, Ph. D., 2022.

Any potential redistributors must seek and receive permission from Matthew Tyler Osborne, Ph.D. prior to redistribution. Redistribution of the material contained in this repository is conditional on acknowledgement of Matthew Tyler Osborne, Ph.D.'s original authorship and sponsorship of the Erdős Institute as subject to the license (see License.md)