In [None]:
import numpy as np
import pandas as pd 

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later dropped the old names; use whichever name this installation has.
talk_style = 'seaborn-talk' if 'seaborn-talk' in plt.style.available else 'seaborn-v0_8-talk'
plt.style.use(talk_style)
# Applied second, so 'bmh' settings override any overlapping 'talk' settings.
plt.style.use('bmh')
plt.rcParams['font.weight'] = 'medium'
#plt.rcParams['figure.figsize'] = 10,7
# Named handles for the first six colours of seaborn's 'colorblind' palette.
blue, green, red, purple, gold, teal = sns.color_palette('colorblind', 6)

import os
print(f"Current dir: {os.getcwd()}")
# Later cells import from the `src` package, which must be visible from the
# working directory. Step up one level only when `src` is not already here,
# so re-running this cell does not climb one directory further each time.
if not os.path.isdir('src'):
    os.chdir('..')
print(f"Current dir: {os.getcwd()}")

# Chapter 8: Feature Importance
___

## Exercises

**8.1** Using the code presented in Section 8.6:
- **(a)** Generate a dataset $(X, y)$.
- **(b)** Apply a PCA transformation on $X$, which we denote $\dot{X}$.
- **(c)** Compute MDI, MDA, and SFI feature importance on $(\dot{X}, y)$, where the base estimator is RF.
- **(d)** Do the three methods agree on what features are important? Why?

**8.2** From exercise 1, generate a new dataset $(\ddot{X}, y)$, where $\ddot{X}$ is a feature union of $X$ and $\dot{X}$.
- **(a)** Compute MDI, MDA, and SFI feature importance on $(\ddot{X}, y)$, where the base estimator is RF.
- **(b)** Do the three methods agree on the important features? Why?

**8.3** Take the results from exercise 2:
- **(a)** Drop the most important features according to each method, resulting in a features matrix $\dddot{X}$.
- **(b)** Compute MDI, MDA, and SFI feature importance on $(\dddot{X}, y)$, where the base estimator is RF.
- **(c)** Do you appreciate significant changes in the rankings of important features, relative to the results from exercise 2?

**8.4** Using the code presented in Section 8.6:
- **(a)**  Generate a dataset $(X, y)$ of 1E6 observations, where 5 features are informative, 5 are redundant and 10 are noise.
- **(b)**  Split $(X, y)$ into 10 datasets $\{(X_i, y_i)\}_{i=1,...,10}$, each of 1E5 observations.
- **(c)**  Compute the parallelized feature importance (Section 8.5) on each of the 10 datasets, $\{(X_i, y_i)\}_{i=1,...,10}$.
- **(d)**  Compute the stacked feature importance on the combined dataset $(X, y)$.
- **(e)**  What causes the discrepancy between the two? Which one is more reliable?

**8.5** Repeat all MDI calculations from exercises 1–4, but this time allow for masking effects. That means, do not set max_features=int(1) in Snippet 8.2. How do results differ as a consequence of this change? Why?

In [None]:
from src.snippets.ch8 import getTestData

My own SNIPPETS 8.8, 8.9 and 8.10

In [None]:
# Arguments handed positionally to getTestData below.
n_features = 40
n_informative = 10
n_redundant = 10
n_samples = 10000

# Same values as the n_estimators / cv defaults of featImportance_no_para.
n_estimators = 1000
cv = 10

trnsX, cont = getTestData(n_features, n_informative, n_redundant, n_samples)

In [None]:
from src.snippets.ch8 import featImpMDI
from src.snippets.ch8 import featImpMDA
from src.snippets.ch8 import auxFeatImpSFI

from src.snippets.ch7 import PurgedKFold
from src.snippets.ch7 import cvScore

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

def featImportance_no_para(trnsX, cont, n_estimators=1000, cv=10, max_samples=1., numThreads=24,
                   pctEmbargo=0, scoring='accuracy', method='SFI', minWLeaf=0., **kargs):
    """Fit a bagged decision-tree ensemble and compute feature importance.

    Parameters
    ----------
    trnsX : DataFrame
        Feature matrix; its ``columns`` are used as the feature names.
    cont : mapping / DataFrame
        Must provide ``'bin'`` (labels), ``'w'`` (sample weights) and
        ``'t1'`` (passed to the purged cross-validation helpers).
    n_estimators : int
        Number of trees in the bagging ensemble.
    cv : int
        Number of cross-validation folds.
    max_samples : float
        Forwarded to ``BaggingClassifier(max_samples=...)``.
    numThreads : int
        Any value above 1 makes the ensemble use ``n_jobs=-1``; otherwise 1.
    pctEmbargo : float
        Forwarded to the cross-validation helpers.
    scoring : str
        Scoring name forwarded to the cross-validation helpers.
    method : {'MDI', 'MDA', 'SFI'}
        Which feature-importance method to run.
    minWLeaf : float
        Forwarded to ``DecisionTreeClassifier(min_weight_fraction_leaf=...)``.
    **kargs
        Accepted for call compatibility and ignored.

    Returns
    -------
    (imp, oob, oos)
        ``imp`` is whatever the selected project helper returns
        (featImpMDI / featImpMDA / auxFeatImpSFI), ``oob`` is the ensemble's
        out-of-bag score, and ``oos`` is the cross-validated score (the mean
        over folds for 'MDI' and 'SFI'; as returned by featImpMDA for 'MDA').

    Raises
    ------
    ValueError
        If ``method`` is not one of 'MDI', 'MDA', 'SFI'.
    """
    import inspect

    # Fail fast: previously an unknown method trained the whole ensemble and
    # only then crashed with UnboundLocalError on the return statement.
    if method not in ('MDI', 'MDA', 'SFI'):
        raise ValueError(f"Unknown method {method!r}; expected 'MDI', 'MDA' or 'SFI'")

    n_jobs = (-1 if numThreads > 1 else 1)
    #1) prepare classifier,cv. max_features=1, to prevent masking
    tree = DecisionTreeClassifier(criterion='entropy', max_features=1,
                                  class_weight='balanced', min_weight_fraction_leaf=minWLeaf)
    # scikit-learn renamed BaggingClassifier's `base_estimator` argument to
    # `estimator` (deprecated in 1.2, removed in 1.4); use whichever keyword
    # the installed version accepts.
    bagging_params = inspect.signature(BaggingClassifier.__init__).parameters
    tree_kw = 'estimator' if 'estimator' in bagging_params else 'base_estimator'
    clf = BaggingClassifier(n_estimators=n_estimators,
                            max_features=1., max_samples=max_samples,
                            oob_score=True, n_jobs=n_jobs, **{tree_kw: tree})
    fit = clf.fit(X=trnsX, y=cont['bin'], sample_weight=cont['w'].values)
    oob = fit.oob_score_

    print(f'oob: {oob} , {clf.oob_score_}')
    # In-sample score on the training data.
    print(f"score: {clf.score(X=trnsX, y=cont['bin'])}")

    if method == 'MDI':
        imp = featImpMDI(fit, featNames=trnsX.columns)
        oos = cvScore(clf, X=trnsX, y=cont['bin'], cv=cv, sample_weight=cont['w'],
                      t1=cont['t1'], pctEmbargo=pctEmbargo, scoring=scoring)
        print(f'fold acc: {oos}')
        oos = oos.mean()
    elif method == 'MDA':
        imp, oos = featImpMDA(clf, X=trnsX, y=cont['bin'], cv=cv, sample_weight=cont['w'],
                              t1=cont['t1'], pctEmbargo=pctEmbargo, scoring=scoring)
    else:  # method == 'SFI' (validated above)
        cvGen = PurgedKFold(n = trnsX.shape[0],
                            n_folds=cv,
                            t1=cont['t1'],
                            pctEmbargo=pctEmbargo)

        oos = cvScore(clf,
                      X=trnsX,
                      y=cont['bin'],
                      sample_weight=cont['w'],
                      scoring=scoring,
                      cvGen=cvGen).mean()

        # The per-feature scoring below uses the classifier single-threaded.
        clf.n_jobs = 1
        imp = auxFeatImpSFI(trnsX.columns, clf=clf, trnsX=trnsX, cont=cont, scoring=scoring, cvGen=cvGen)
    return imp, oob, oos

In [None]:
def plotFeatImportance(imp, oob, oos, method, tag=0, simNum=0, **kargs):
    """Horizontal bar chart of mean feature importance with error bars.

    Based on Snippet 8.10 (feature importance plotting function).

    ``imp`` must expose ``'mean'`` and ``'std'`` columns; its index values are
    written inside the bars as labels. ``oob``, ``oos``, ``tag`` and ``simNum``
    only appear in the title. For ``method == 'MDI'`` the x-axis is clipped to
    ``[0, max row sum of imp]`` and a dotted line is drawn at ``1 / imp.shape[0]``.
    Extra keyword arguments are accepted and ignored.
    """
    n_rows = imp.shape[0]
    plt.figure(figsize=(10, n_rows / 5.))

    # Ascending sort: the largest mean ends up as the top bar.
    ranked = imp.sort_values('mean', ascending=True)
    bar_kwargs = dict(kind='barh', color='b', alpha=.25,
                      xerr=ranked['std'], error_kw={'ecolor': 'r'})
    ax = ranked['mean'].plot(**bar_kwargs)

    if method == 'MDI':
        upper = ranked.sum(axis=1).max()
        plt.xlim([0, upper])
        plt.axvline(1. / n_rows, linewidth=1, color='r', linestyle='dotted')

    # Hide the y-axis; each label is drawn at the centre of its bar instead.
    ax.get_yaxis().set_visible(False)
    for bar, label in zip(ax.patches, ranked.index):
        x_mid = bar.get_width() / 2
        y_mid = bar.get_y() + bar.get_height() / 2
        ax.text(x_mid, y_mid, label, ha='center', va='center', color='black')

    plt.title(f'tag={tag} | simNum={simNum} | oob={oob:.{2}} | oos={oos:.{2}}')

In [None]:
# MDI run. Forward the notebook-level n_estimators / cv so that editing the
# configuration cell actually changes this run (they were silently ignored).
method = 'MDI'
imp, oob, oos = featImportance_no_para(trnsX=trnsX, cont=cont,
                                       n_estimators=n_estimators, cv=cv,
                                       method=method)
plotFeatImportance(imp=imp, oob=oob, oos=oos, method=method)

In [None]:
# MDA run. Forward the notebook-level n_estimators / cv so that editing the
# configuration cell actually changes this run (they were silently ignored).
method = 'MDA'
imp, oob, oos = featImportance_no_para(trnsX=trnsX, cont=cont,
                                       n_estimators=n_estimators, cv=cv,
                                       method=method)
plotFeatImportance(imp=imp, oob=oob, oos=oos, method=method)

In [None]:
# SFI run. Forward the notebook-level n_estimators / cv so that editing the
# configuration cell actually changes this run (they were silently ignored).
method = 'SFI'
imp, oob, oos = featImportance_no_para(trnsX=trnsX, cont=cont,
                                       n_estimators=n_estimators, cv=cv,
                                       method=method)

In [None]:
# Plot the importance computed by the most recent featImportance_no_para call.
plotFeatImportance(
    imp=imp,
    oob=oob,
    oos=oos,
    method=method,
)