# MDI+: Example Usages

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, roc_auc_score

from imodels.importance import RandomForestPlusRegressor, RandomForestPlusClassifier, \
    RidgeRegressorPPM, LassoRegressorPPM, IdentityTransformer
from imodels.importance.rf_plus import _fast_r2_score

In [2]:
def neg_mae(y_true, y_pred, **kwargs):
    """
    Negated mean absolute error, so that larger values indicate a better fit.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.
    **kwargs
        Forwarded unchanged to ``mean_absolute_error``.

    Returns
    -------
    The result of ``mean_absolute_error(y_true, y_pred, **kwargs)`` with its sign flipped.
    """
    mae = mean_absolute_error(y_true, y_pred, **kwargs)
    return -mae

In [3]:
# helper variables: base random forests handed to the RF+ models below via `rf_model=`
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    max_features=0.33,
    random_state=331,
)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=331,
)

## 1. Regression Example

In [4]:
# generate data from linear model: y = x1 + x2 + N(0, 1)
# Seed NumPy's global RNG first so the simulated data are identical on every
# Restart & Run All (previously only the train/test split was seeded).
np.random.seed(331)
n = 200  # number of samples
p = 10  # number of features
s = 2  # number of signal features (the first s columns of X)
X = np.random.normal(size=(n, p))
# coefficient vector: 1 for the first s features, 0 for the remaining p - s
beta = np.concatenate((np.ones(s), np.zeros(p - s)))
y = np.matmul(X, beta) + np.random.normal(size=n)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12345)

### 1.1 MDI+ with default settings for regression

In [5]:
# fit RF+ on the training data; only the base forest (`rf_regressor`) is supplied,
# every other RF+ option is left at its default
rf_plus_model = RandomForestPlusRegressor(rf_model=rf_regressor)
rf_plus_model.fit(X_train, y_train)

In [6]:
# make predictions with RF+ on the held-out test set
preds = rf_plus_model.predict(X_test)
# test-set r^2 (displayed as the cell output)
r2_score(y_test, preds)

0.602898866954644

In [7]:
# get MDI+ scores (higher r^2 value = greater importance), computed from the training data
mdi_plus_scores = rf_plus_model.get_mdi_plus_scores(X_train, y_train)
# display features from most to least important
mdi_plus_scores.sort_values("importance", ascending=False)

Unnamed: 0,var,importance
0,0,0.354405
1,1,0.321326
8,8,0.026648
3,3,0.0001
2,2,-0.002104
9,9,-0.003035
6,6,-0.00328
5,5,-0.00415
4,4,-0.00529
7,7,-0.005371


### 1.2 MDI+ with custom partial prediction model and evaluation metric(s)

In [8]:
# fit RF+ with custom partial prediction model (here: LassoRegressorPPM)
rf_plus_model = RandomForestPlusRegressor(rf_model=rf_regressor, prediction_model=LassoRegressorPPM())
rf_plus_model.fit(X_train, y_train)

In [9]:
# get MDI+ scores with custom evaluation metrics/scorers;
# each key of `scoring_fns` becomes a column of the returned table
mdi_plus_scores = rf_plus_model.get_mdi_plus_scores(
    X_train,
    y_train,
    scoring_fns={
        "r2_score": _fast_r2_score,
        "negative_mae": neg_mae,
    },
)
mdi_plus_scores.sort_values("r2_score", ascending=False)

Unnamed: 0,var,r2_score,negative_mae
0,0,0.35849,-1.140341
1,1,0.328313,-1.159828
8,8,0.021085,-1.423189
3,3,-0.001271,-1.445924
2,2,-0.002602,-1.447519
9,9,-0.002837,-1.446741
5,5,-0.003045,-1.447727
6,6,-0.003163,-1.447022
7,7,-0.00355,-1.447024
4,4,-0.003911,-1.447655


### 1.3 MDI+ with custom transformer

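The example below is equivalent to running RF+ with `include_raw=True`: the built-in raw features are switched off (`include_raw=False`) and then added back through an `IdentityTransformer`.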

In [10]:
# fit RF+ with custom transformer: raw features are disabled and an
# IdentityTransformer is supplied through `add_transformers` instead
rf_plus_model = RandomForestPlusRegressor(
    rf_model=rf_regressor,
    include_raw=False,
    add_transformers=[IdentityTransformer()],
)
rf_plus_model.fit(X_train, y_train)

In [11]:
# get MDI+ scores from the training data and display features from most to least important
mdi_plus_scores = rf_plus_model.get_mdi_plus_scores(X_train, y_train)
mdi_plus_scores.sort_values("importance", ascending=False)

Unnamed: 0,var,importance
0,0,0.355724
1,1,0.325702
8,8,0.025157
3,3,0.000388
2,2,-0.00216
9,9,-0.003352
6,6,-0.003477
5,5,-0.004867
7,7,-0.005865
4,4,-0.0059


### 1.4 Choosing the GLM and scoring metric via stability score

There are many choices of GLMs and scoring metrics that can be made within the MDI+ framework.

One way to select the GLM and scoring metric in MDI+ is by evaluating the stability of the feature importances/rankings for each choice of GLM/metric and taking the GLM/metric that is the most stable across different bootstrap samples of trees. For example, we can take the GLM and metric with the highest stability score, as measured by RBO (rank-biased overlap) below.

In [12]:
# number of bootstrap samples, passed as `B` to get_mdi_plus_stability_scores
n_bootstraps = 25
# candidate partial prediction models (GLMs) and scoring metrics to compare
prediction_models = {"ridge": RidgeRegressorPPM(), "lasso": LassoRegressorPPM()}
scoring_fns = {"r2": _fast_r2_score, "neg_mae": neg_mae}
# maps each prediction model name to its table of stability scores
stability_dict = {}
for model_name, prediction_model in prediction_models.items():
    # fit RF+
    rf_plus_model = RandomForestPlusRegressor(rf_model=rf_regressor, prediction_model=prediction_model)
    rf_plus_model.fit(X_train, y_train)
    # get MDI+ scores
    # NOTE(review): the returned table is not used in this cell; presumably the call is
    # required so that the stability scores below have MDI+ scores to work from -- confirm
    mdi_plus_scores = rf_plus_model.get_mdi_plus_scores(X_train, y_train, scoring_fns=scoring_fns)
    # get MDI+ stability scores
    mdi_plus_stability_scores = rf_plus_model.get_mdi_plus_stability_scores(B=n_bootstraps)
    stability_dict[model_name] = mdi_plus_stability_scores

In [13]:
# stack the per-model stability tables, label each row with its prediction model
# ("ppm"), and list the most stable (highest RBO) combination first
(
    pd.concat(stability_dict, axis=0)
    .reset_index()
    .rename(columns={"level_0": "ppm"})
    .drop(columns=["level_1"])
    .sort_values("RBO", ascending=False)
)

Unnamed: 0,ppm,scorer,RBO,tauAP
0,ridge,r2,0.943672,0.901981
2,lasso,r2,0.902463,0.811391
3,lasso,neg_mae,0.900443,0.784878
1,ridge,neg_mae,0.899285,0.851119


### 1.5 Aggregating multiple MDI+ rankings in an ensemble

Instead of choosing a single GLM and metric to use in MDI+, it may be preferable in some cases to aggregate MDI+ feature importances/rankings across multiple choices of GLMs and metrics.

One naive method for doing this ensembling is to take the median rank across each choice of GLM and metric (as shown below). However, more creative ensembling schemes can also be explored.

In [14]:
# partial prediction models (GLMs) and scoring metrics to aggregate over
prediction_models = {
    "ridge": RidgeRegressorPPM(),
    "lasso": LassoRegressorPPM(),
}
scoring_fns = {
    "r2": _fast_r2_score,
    "neg_mae": neg_mae,
}
# maps each prediction model name to its MDI+ score table
mdi_plus_scores_dict = {}
for model_name, prediction_model in prediction_models.items():
    # fit RF+
    rf_plus_model = RandomForestPlusRegressor(rf_model=rf_regressor, prediction_model=prediction_model)
    rf_plus_model.fit(X_train, y_train)
    # get MDI+ scores
    mdi_plus_scores = rf_plus_model.get_mdi_plus_scores(X_train, y_train, scoring_fns=scoring_fns)
    # prefix every score column (everything except "var") with the model name
    # so the tables can be placed side by side without column clashes
    mdi_plus_scores = mdi_plus_scores.rename(
        columns={
            col: model_name + "_" + col
            for col in mdi_plus_scores.columns
            if col != "var"
        }
    )
    mdi_plus_scores_dict[model_name] = mdi_plus_scores

In [15]:
# one row per feature ("var"), one column per (prediction model, metric) pair
mdi_plus_scores_df = pd.concat(
    [scores.set_index("var") for scores in mdi_plus_scores_dict.values()],
    axis=1,
)
# rank features within each column (rank 1 = highest score), then take the
# median rank of each feature across columns
mdi_plus_ranks_df = (
    mdi_plus_scores_df
    .rank(ascending=False)
    .median(axis=1)
    .to_frame(name="median_rank")
    .reset_index()
)
mdi_plus_ranks_df.sort_values("median_rank")

Unnamed: 0,var,median_rank
0,0,1.0
1,1,2.0
8,8,3.0
3,3,4.0
2,2,6.0
9,9,6.0
6,6,6.5
7,7,8.5
4,4,9.0
5,5,9.0


## 2. Classification Example

In [16]:
# generate data from logistic model: logit(E[Y|X]) = x1 + x2
# Seed NumPy's global RNG first so the simulated data are identical on every
# Restart & Run All (previously only the train/test split was seeded).
np.random.seed(332)
n = 200  # number of samples
p = 10  # number of features
s = 2  # number of signal features (the first s columns of X)
X = np.random.normal(size=(n, p))
# coefficient vector: 1 for the first s features, 0 for the remaining p - s
beta = np.concatenate((np.ones(s), np.zeros(p - s)))
# inverse logit of the linear predictor gives P(Y = 1 | X)
probs = 1 / (1 + np.exp(-np.matmul(X, beta)))
# draw a 0/1 label per sample: 1 where a uniform draw falls below its probability
y = (np.random.uniform(size=n) < probs) * 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12345)

### 2.1 MDI+ with default classification settings

In [17]:
# fit RF+ on the training data; only the base forest (`rf_classifier`) is supplied,
# every other RF+ option is left at its default
rf_plus_model = RandomForestPlusClassifier(rf_model=rf_classifier)
rf_plus_model.fit(X_train, y_train)

In [18]:
# make predictions with RF+ on the held-out test set
preds = rf_plus_model.predict(X_test)
prob_preds = rf_plus_model.predict_proba(X_test)
# test accuracy from the hard predictions, and ROC AUC from column 1 of the
# predicted probabilities (presumably the probability of class 1 -- confirm)
accuracy_score(y_test, preds), roc_auc_score(y_test, prob_preds[:, 1])

(0.7878787878787878, 0.8585858585858586)

In [19]:
# get MDI+ scores (higher negative log-loss value = greater importance), computed from the training data
mdi_plus_scores = rf_plus_model.get_mdi_plus_scores(X_train, y_train)
# display features from most to least important
mdi_plus_scores.sort_values("importance", ascending=False)

Unnamed: 0,var,importance
0,0,-0.643608
1,1,-0.664461
4,4,-0.681521
9,9,-0.693593
3,3,-0.695175
6,6,-0.695259
7,7,-0.698201
2,2,-0.698987
8,8,-0.700452
5,5,-0.701411
