In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.datasets import make_regression
from feature_selection.mdi import mdi_feature_importance
from feature_selection.sfi import single_feature_importance_cv

In [2]:
# Synthetic regression problem: 4 features, only 3 of which are informative.
n_features = 4
n_informative = 3

# coef=True makes make_regression also return the coefficients it used to
# generate y; they serve as the reference importances below.
X, y, coef = make_regression(
    n_samples=5000,
    n_features=n_features,
    n_informative=n_informative,
    coef=True,
    random_state=1233,
)

# Turn y into a single column so it can be placed to the right of X.
y = y.reshape(-1, 1)

feature_names = ["f" + str(i) for i in range(n_features)]
columns = feature_names + ["target_return"]

# One frame holding every feature column followed by the target column.
df = pd.DataFrame(np.concatenate([X, y], axis=1), columns=columns)

# Reference ranking: features ordered by generating coefficient, largest first.
true_imp = pd.DataFrame({"feature": feature_names, "feature_score": coef})
true_imp = true_imp.sort_values("feature_score", ascending=False)
true_imp = true_imp.reset_index(drop=True)
true_imp

Unnamed: 0,feature,feature_score
0,f3,93.679364
1,f1,90.393254
2,f2,51.60985
3,f0,0.0


In [3]:
# Score every feature with the project's MDI helper
# (presumably mean-decrease-impurity -- confirm in feature_selection.mdi).
mdi_result = mdi_feature_importance(
    df=df,
    feature_names=feature_names,
    target_name="target_return",
    random_state=12,
)

# Rank by the "mean" column (highest first), keep only that column, and move
# the index into a regular column so the frame lines up with true_imp.
mdi_result = (
    mdi_result
    .sort_values("mean", ascending=False)["mean"]
    .reset_index()
)
mdi_result.columns = ["feature", "feature_score"]

# The MDI ranking must reproduce the ordering of the reference ranking.
assert (true_imp["feature"] == mdi_result["feature"]).all()
mdi_result

Unnamed: 0,feature,feature_score
0,f3,0.428592
1,f1,0.377964
2,f2,0.149161
3,f0,0.044282


In [4]:
# Evaluate each feature on its own with 5-split cross-validation and record
# the mean of the returned scores (presumably per-fold R^2 -- confirm in
# feature_selection.sfi).
sfi_rows = []
for f in feature_names:
    r2_arr = single_feature_importance_cv(
        df=df,
        feature_name=f,
        target_name="target_return",
        n_splits=5,
    )
    sfi_rows.append({"feature": f, "feature_score": np.mean(r2_arr)})

# Build the result table once, ranked from best to worst mean score.
sfi_results = (
    pd.DataFrame(sfi_rows, columns=["feature", "feature_score"])
    .sort_values("feature_score", ascending=False)
    .reset_index(drop=True)
)

# The SFI ranking must reproduce the ordering of the reference ranking.
assert (true_imp["feature"] == sfi_results["feature"]).all()
sfi_results

Unnamed: 0,feature,feature_score
0,f3,0.472493
1,f1,0.407644
2,f2,0.127875
3,f0,-0.000757
