In [None]:
# This notebook requires the stability_selection package from:
# https://github.com/scikit-learn-contrib/stability-selection
# Please follow the instructions there for installation

## Comparison of feature-importance estimates from various models

Inspired by Ando Saabas's discussion of different ways of picking important features:
https://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/

### Produces Table 2 in the paper

Note that this notebook assumes that the saved best-fit LASSO and RF models were trained on the polynomial basis. In other words:

RUN rfandlasso_vertical_poly.ipynb first to generate
 - bst_lasso.joblib
 - bst_rf.joblib

If you ran rfandlasso_vertical_nopoly.ipynb, then these models will be making predictions/giving feature importances for the linear basis. This notebook assumes all models operate on the polynomial basis.

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, Ridge, Lasso#, RandomizedLasso is no longer part of sklearn
from stability_selection import RandomizedLasso
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

from minepy import MINE

from joblib import dump, load



## Download data for Rm = $1.5 \times 10^4$

In [None]:
# Download the remote mfields.npz archive and store it locally as
# mfields_R15e3.npz, the file name read by gen_df_tave in the loading cell.
!wget -O mfields_R15e3.npz https://sid.erda.dk/public/archives/0cf0c2b6d34e20da8971304b06d8f913/pencil/alpha2/shock_k10_R15000_256_xyaver_alpha_eta/mfields.npz

## Convert numpy arrays to dataframe

In [3]:
# Project-local helpers.
# NOTE(review): the star import hides where names come from. The helpers used
# in this notebook (gen_df_tave, gen_df_poly, scale_df, train_test_seq) are
# presumably defined in preprocess -- confirm, and consider importing them
# explicitly.
from preprocess import *

In [4]:
# Load the downloaded fields into a DataFrame.
# NOTE(review): t1/t2 presumably bound the time window that is averaged over
# (t2=-1 meaning "up to the last sample") -- confirm in preprocess.gen_df_tave.
df15 = gen_df_tave(
    fname='mfields_R15e3.npz',
    t1=1500,
    t2=-1,
    verbose=0,
)

In [5]:
# Generate the polynomial basis from df15. Per the cell output, the resulting
# frame holds the targets (Ex, Ey) together with the candidate features
# Bx, By, Bx By, B^2, B^2 Bx and B^2 By.
df15_poly = gen_df_poly(df15)
# Preview the first rows of the expanded frame.
df15_poly.head()

Feature names: ['Bx', 'By', 'Bx By', 'Ex', 'Ey', 'B^2', 'B^2 Bx', 'B^2 By']
Feature array shape: (256, 8)


Unnamed: 0,Ex,Ey,B^2,B^2 Bx,B^2 By,Bx,By,Bx By
0,0.000467,0.000502,0.989009,0.692821,0.698131,0.70052,0.70589,0.49449
1,0.000453,0.000515,0.987386,0.675544,0.711528,0.684174,0.720619,0.493029
2,0.000439,0.000527,0.98517,0.656874,0.724351,0.666762,0.735254,0.490239
3,0.000425,0.000538,0.982811,0.637277,0.737016,0.648423,0.749906,0.486256
4,0.000411,0.000551,0.980822,0.617161,0.750117,0.629228,0.764784,0.481224


### Train Test split

In [6]:
# Rescale the polynomial-basis frame; `scl` is the second value returned by
# scale_df (presumably the fitted scaler -- confirm in preprocess).
df15_ss, scl = scale_df(df15_poly)

tst_sz = 0.2          # fraction of the data held out for testing

flds = ['Ex', 'Ey']   # field columns that must not be used as features
fld = 'Ex'            # target column

# The split is performed on the scaled frame (df15_ss), not on the raw
# polynomial frame (df15_poly).
features_scaled = df15_ss.drop(columns=flds)
target_scaled = df15_ss[fld]
X_train, X_test, Y_train, Y_test = train_test_seq(
    features_scaled,
    target_scaled,
    test_size=tst_sz,
)

Test size: 0.2


In [7]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,B^2,B^2 Bx,B^2 By,Bx,By,Bx By
0,0.419213,1.021195,1.003289,0.996046,1.018508,1.445872
1,0.37075,0.995885,1.02296,0.972804,1.03976,1.441586
2,0.304606,0.968535,1.041787,0.948046,1.060878,1.433406
3,0.234177,0.939827,1.060384,0.92197,1.082018,1.421723
4,0.174791,0.910358,1.079619,0.894679,1.103484,1.406963


## Compute correlations

In [8]:
# Work on a copy so that adding the target column does not modify X_train.
df_train = X_train.copy()
df_train['target'] = Y_train

# Pearson correlation of each feature column with the target.
# Despite its name, pcorr_df is a Series indexed by feature name.
feature_cols = [col for col in df_train.columns if col != "target"]
pcorr_df = df_train[feature_cols].apply(
    lambda feature: feature.corr(df_train['target'])
)

# Convert to a plain NumPy array.
pcorr_df_np = pcorr_df.to_numpy()

## Construct the feature rankings

In [9]:
# Empty table with one row per feature; each ranking method adds one column.
ranks = pd.DataFrame(index=X_train.columns)

In [10]:
# Display the (still column-less) table to check the feature index.
ranks

B^2
B^2 Bx
B^2 By
Bx
By
Bx By


In [11]:
# Fill `ranks` with one importance score per feature for each method.
# Larger always means "more important".

n_features = X_train.shape[1]

# --- Linear regression (no intercept): |coefficient| as importance ----------
lr = LinearRegression(fit_intercept=False)
lr.fit(X_train, Y_train)
ranks["Linear reg"] = np.abs(lr.coef_)

# --- LASSO: saved best-fit model, refit on this training split --------------
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files you generated yourself.
lasso = load('bst_lasso.joblib')
lasso.fit(X_train, Y_train)
ranks["Lasso"] = np.abs(lasso.coef_)

# --- Randomized LASSO --------------------------------------------------------
# NOTE(review): no random_state is passed here, so this column may change
# between runs; this is also a single randomized fit rather than a full
# stability-selection loop -- confirm that this is intended.
rlasso = RandomizedLasso(alpha=0.04, fit_intercept=False)
rlasso.fit(X_train, Y_train)
ranks["Stability"] = np.abs(rlasso.coef_)

# --- Random forest: saved optimized model, refit on this training split -----
rf = load('bst_rf.joblib')
rf.fit(X_train, Y_train)
ranks["RF"] = rf.feature_importances_

# --- Recursive feature elimination (RFE) ------------------------------------
# With n_features_to_select=1, ranking_ assigns 1 to the last surviving
# feature and n_features to the first one eliminated. Flip it so that the best
# feature scores n_features and the worst scores 1. The offset is derived from
# the data instead of being hard-coded for six features.

# 1. Linear regression
rfe = RFE(lr, n_features_to_select=1)
rfe.fit(X_train, Y_train)
ranks["RFE"] = (n_features + 1) - rfe.ranking_.astype(float)

# 2. Random forest
rfe_rf = RFE(rf, n_features_to_select=1)
rfe_rf.fit(X_train, Y_train)
ranks["RFE_rf"] = (n_features + 1) - rfe_rf.ranking_.astype(float)

# --- Maximal information coefficient (MIC) ----------------------------------
# DO NOT install minepy using conda, use: pip install minepy
mine = MINE()
X_train_np = X_train.to_numpy()  # convert once instead of on every iteration
mic_scores = []
for i in range(n_features):
    mine.compute_score(X_train_np[:, i], Y_train)
    mic_scores.append(mine.mic())

ranks["MIC"] = np.array(mic_scores)

  positive)


In [12]:
# Display the raw (unnormalized) scores from every method.
ranks

Unnamed: 0,Linear reg,Lasso,Stability,RF,RFE,RFE_rf,MIC
B^2,0.037653,0.00374,0.0,0.003645,2.0,4.0,0.48249
B^2 Bx,1.72006,0.200741,0.933395,0.690994,5.0,6.0,1.0
B^2 By,0.251227,0.018714,0.0,0.000267,4.0,1.0,0.688393
Bx,2.700802,0.779052,0.0,0.30445,6.0,5.0,1.0
By,0.236876,0.0,0.0,0.000278,3.0,3.0,0.691966
Bx By,0.01737,0.011295,0.0,0.000365,1.0,2.0,0.5906


## Make sure all values are between 0 and 1

In [13]:
# Rescale every method's column independently to the [0, 1] range so that the
# scores of different methods become comparable.
minmax = MinMaxScaler()
scaled_scores = minmax.fit_transform(ranks)

# fit_transform returns a bare array; restore the feature and method labels.
ranks_norm = pd.DataFrame(
    scaled_scores,
    index=X_train.columns,
    columns=ranks.columns,
)

In [14]:
# Display the min-max normalized scores.
ranks_norm

Unnamed: 0,Linear reg,Lasso,Stability,RF,RFE,RFE_rf,MIC
B^2,0.007559,0.004801,0.0,0.004891,0.2,0.6,0.0
B^2 Bx,0.634519,0.257674,1.0,1.0,0.8,1.0,1.0
B^2 By,0.087148,0.024022,0.0,0.0,0.6,0.0,0.397872
Bx,1.0,1.0,0.0,0.440381,1.0,0.8,1.0
By,0.0818,0.0,0.0,1.5e-05,0.4,0.4,0.404777
Bx By,0.0,0.014499,0.0,0.000142,0.0,0.2,0.208905


## Pearson correlation coefficients

In [15]:
# Add the absolute Pearson correlation with the target as another score column.
# |r| already lies in [0, 1], so it is appended without min-max scaling.
ranks_norm['Corr'] = pcorr_df.abs()

## Take mean across all columns for each row (variable)

In [16]:
# Average all score columns for each feature (row).
# Any existing 'mean' column is excluded so that re-running this cell does not
# fold the previous mean into the new one (the cell is now idempotent).
score_cols = ranks_norm.columns.drop('mean', errors='ignore')
ranks_norm['mean'] = ranks_norm[score_cols].mean(axis="columns")

In [17]:
# Display the normalized scores together with the Corr and mean columns.
ranks_norm

Unnamed: 0,Linear reg,Lasso,Stability,RF,RFE,RFE_rf,MIC,Corr,mean
B^2,0.007559,0.004801,0.0,0.004891,0.2,0.6,0.0,0.101946,0.1149
B^2 Bx,0.634519,0.257674,1.0,1.0,0.8,1.0,1.0,0.997166,0.83617
B^2 By,0.087148,0.024022,0.0,0.0,0.6,0.0,0.397872,0.120997,0.153755
Bx,1.0,1.0,0.0,0.440381,1.0,0.8,1.0,0.998248,0.779829
By,0.0818,0.0,0.0,1.5e-05,0.4,0.4,0.404777,0.117886,0.17556
Bx By,0.0,0.014499,0.0,0.000142,0.0,0.2,0.208905,0.136582,0.070016


In [18]:
# Render the final table as LaTeX source (per the notebook introduction, this
# is the table used as Table 2 in the paper).
ranks_norm.to_latex()

'\\begin{tabular}{lrrrrrrrrr}\n\\toprule\n{} &  Linear reg &     Lasso &  Stability &        RF &  RFE &  RFE\\_rf &       MIC &      Corr &      mean \\\\\n\\midrule\nB\\textasciicircum 2    &    0.007559 &  0.004801 &        0.0 &  0.004891 &  0.2 &     0.6 &  0.000000 &  0.101946 &  0.114900 \\\\\nB\\textasciicircum 2 Bx &    0.634519 &  0.257674 &        1.0 &  1.000000 &  0.8 &     1.0 &  1.000000 &  0.997166 &  0.836170 \\\\\nB\\textasciicircum 2 By &    0.087148 &  0.024022 &        0.0 &  0.000000 &  0.6 &     0.0 &  0.397872 &  0.120997 &  0.153755 \\\\\nBx     &    1.000000 &  1.000000 &        0.0 &  0.440381 &  1.0 &     0.8 &  1.000000 &  0.998248 &  0.779829 \\\\\nBy     &    0.081800 &  0.000000 &        0.0 &  0.000015 &  0.4 &     0.4 &  0.404777 &  0.117886 &  0.175560 \\\\\nBx By  &    0.000000 &  0.014499 &        0.0 &  0.000142 &  0.0 &     0.2 &  0.208905 &  0.136582 &  0.070016 \\\\\n\\bottomrule\n\\end{tabular}\n'

## Comparison of Mutual Information with Pearson and Spearman correlation

In [28]:
# Toy example: x2 = x**2 on a symmetric interval is fully determined by x but
# is not monotonic, so MIC is close to 1 while the Pearson and Spearman
# correlations are close to 0.
rng = np.random.RandomState(0)  # fixed seed so the printed numbers are reproducible
x = rng.uniform(-1, +1, 1000)
x2 = x**2
df_dummy = pd.DataFrame({'x': x, 'x2': x2})

# Reuses the MINE instance created in the feature-ranking cell.
mine.compute_score(x, x2)
print(mine.mic())
print(df_dummy.corr())                   # Pearson
print(df_dummy.corr(method='spearman'))  # Spearman

1.0000000000000002
          x       x2
x   1.00000  0.00672
x2  0.00672  1.00000
           x        x2
x   1.000000 -0.023768
x2 -0.023768  1.000000
