# GMDI: Example Usages

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, roc_auc_score

from imodels.importance import RandomForestPlusRegressor, RandomForestPlusClassifier, \
    RidgeRegressorPPM, LassoRegressorPPM, IdentityTransformer
from imodels.importance.rf_plus import _fast_r2_score

In [2]:
def neg_mae(y_true, y_pred, **kwargs):
    """
    Negated mean absolute error.

    Forwards ``y_true``, ``y_pred`` and any extra keyword arguments to
    ``mean_absolute_error`` and flips the sign of the result, so that a
    larger value corresponds to a smaller absolute error.
    """
    mae = mean_absolute_error(y_true, y_pred, **kwargs)
    return -mae

In [3]:
# Base random forests handed to the RF+ wrappers in the examples below.
# Both use a fixed random_state.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    max_features=0.33,
    random_state=331,
)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=331,
)

## 1. Regression Example

In [4]:
# generate data from linear model: y = x1 + x2 + N(0, 1)
# A seeded generator is used so the simulated data (and therefore every
# downstream score) is identical after Restart Kernel -> Run All.
rng = np.random.default_rng(12345)
n = 200  # number of samples
p = 10   # number of features
s = 2    # number of leading features with coefficient 1 (the rest are 0)
X = rng.normal(size=(n, p))
beta = np.concatenate((np.ones(s), np.zeros(p - s)))
y = np.matmul(X, beta) + rng.normal(size=n)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12345)

### 1.1 GMDI with default settings for regression

In [5]:
# wrap the base regression forest in RF+ and train it on the training split
rf_plus_model = RandomForestPlusRegressor(
    rf_model=rf_regressor,
)
rf_plus_model.fit(X_train, y_train)

In [6]:
# predict on the held-out split and report the test-set r^2
preds = rf_plus_model.predict(X_test)
r2_score(y_true=y_test, y_pred=preds)

0.6610271198828321

In [7]:
# GMDI feature importances on the training data, most important first
# (higher r^2 value = greater importance)
gmdi_scores = rf_plus_model.get_gmdi_scores(X_train, y_train)
gmdi_scores.sort_values(by="importance", ascending=False)

Unnamed: 0,var,importance
1,1,0.363404
0,0,0.270952
4,4,0.026317
7,7,0.005177
3,3,0.00152
5,5,0.000985
8,8,0.000197
2,2,-0.000164
6,6,-0.003904
9,9,-0.004322


### 1.2 GMDI with custom partial prediction model and evaluation metric(s)

In [8]:
# same base forest, but with a Lasso partial prediction model passed explicitly
rf_plus_model = RandomForestPlusRegressor(
    rf_model=rf_regressor,
    prediction_model=LassoRegressorPPM(),
)
rf_plus_model.fit(X_train, y_train)

In [9]:
# score each feature with two metrics at once; each dict key becomes a column
# of the returned frame (see the "r2_score" / "negative_mae" columns)
custom_scorers = {
    "r2_score": _fast_r2_score,
    "negative_mae": neg_mae,
}
gmdi_scores = rf_plus_model.get_gmdi_scores(X_train, y_train, scoring_fns=custom_scorers)
gmdi_scores.sort_values(by="r2_score", ascending=False)

Unnamed: 0,var,r2_score,negative_mae
1,1,0.365616,-1.112907
0,0,0.279965,-1.193934
4,4,0.016703,-1.416082
7,7,0.003432,-1.417898
3,3,0.000254,-1.424907
8,8,-0.00116,-1.428309
5,5,-0.001201,-1.426389
2,2,-0.002066,-1.430047
9,9,-0.002336,-1.429353
6,6,-0.002501,-1.429563


### 1.3 GMDI with custom transformer

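The example below is equivalent to running RF+ with `include_raw=True`: the built-in raw features are switched off (`include_raw=False`) and then added back through a custom `IdentityTransformer`.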

In [10]:
# turn off the built-in raw features and supply them via an identity transformer
rf_plus_model = RandomForestPlusRegressor(
    rf_model=rf_regressor,
    include_raw=False,
    add_transformers=[IdentityTransformer()],
)
rf_plus_model.fit(X_train, y_train)

In [11]:
# GMDI feature importances for the custom-transformer model, most important first
gmdi_scores = rf_plus_model.get_gmdi_scores(X_train, y_train)
gmdi_scores.sort_values(by="importance", ascending=False)

Unnamed: 0,var,importance
1,1,0.365055
0,0,0.275122
4,4,0.027359
7,7,0.004221
3,3,0.000404
5,5,0.000225
2,2,-0.000336
8,8,-0.00039
6,6,-0.004234
9,9,-0.005086


### 1.4 Choosing the GLM and scoring metric via stability score

There are many choices of GLMs and scoring metrics that can be made within the GMDI framework.

One way to select the GLM and scoring metric in GMDI is by evaluating the stability of the feature importances/rankings for each choice of GLM/metric and taking the GLM/metric that is the most stable across different bootstrap samples of trees. For example, we can take the GLM and metric with the highest stability score, as measured by RBO below.

In [12]:
n_bootstraps = 25
# candidate partial prediction models and scoring metrics to compare
prediction_models = {
    "ridge": RidgeRegressorPPM(),
    "lasso": LassoRegressorPPM(),
}
scoring_fns = {
    "r2": _fast_r2_score,
    "neg_mae": neg_mae,
}
stability_dict = {}
for model_name, prediction_model in prediction_models.items():
    # one RF+ fit per candidate partial prediction model
    rf_plus_model = RandomForestPlusRegressor(
        rf_model=rf_regressor,
        prediction_model=prediction_model,
    )
    rf_plus_model.fit(X_train, y_train)
    # GMDI scores under every metric; the returned frame is not used below.
    # NOTE(review): presumably this call must precede the stability computation - confirm
    gmdi_scores = rf_plus_model.get_gmdi_scores(X_train, y_train, scoring_fns=scoring_fns)
    # stability of the scores across n_bootstraps bootstrap samples
    gmdi_stability_scores = rf_plus_model.get_gmdi_stability_scores(B=n_bootstraps)
    stability_dict[model_name] = gmdi_stability_scores

In [13]:
pd.concat(stability_dict, axis=0).reset_index().rename(columns={"level_0": "ppm"}).drop(columns=["level_1"]).sort_values("RBO", ascending=False)

Unnamed: 0,ppm,metric,RBO,tauAP
3,lasso,neg_mae,0.965753,0.919002
2,lasso,r2,0.956417,0.879412
1,ridge,neg_mae,0.942471,0.916095
0,ridge,r2,0.915209,0.892579


### 1.5 Aggregating multiple GMDI rankings in an ensemble

Instead of choosing a single GLM and metric to use in GMDI, it may be preferable in some cases to aggregate GMDI feature importances/rankings across multiple choices of GLMs and metrics.

One naive method for doing this ensembling is to take the median rank across each choice of GLM and metric (as shown below). However, more creative ensembling schemes can also be explored.

In [14]:
prediction_models = {
    "ridge": RidgeRegressorPPM(),
    "lasso": LassoRegressorPPM(),
}
scoring_fns = {
    "r2": _fast_r2_score,
    "neg_mae": neg_mae,
}
gmdi_scores_dict = {}
for model_name, prediction_model in prediction_models.items():
    # one RF+ fit per candidate partial prediction model
    rf_plus_model = RandomForestPlusRegressor(
        rf_model=rf_regressor,
        prediction_model=prediction_model,
    )
    rf_plus_model.fit(X_train, y_train)
    # GMDI scores under every metric
    gmdi_scores = rf_plus_model.get_gmdi_scores(X_train, y_train, scoring_fns=scoring_fns)
    # prefix each score column with the model name so the frames can be placed
    # side by side later; "var" keeps its name
    prefixed_names = {
        col: model_name + "_" + col
        for col in gmdi_scores.columns
        if col != "var"
    }
    gmdi_scores = gmdi_scores.rename(columns=prefixed_names)
    gmdi_scores_dict[model_name] = gmdi_scores

In [15]:
# align the per-model score frames on "var" and place them side by side
frames_by_var = [scores.set_index("var") for scores in gmdi_scores_dict.values()]
gmdi_scores_df = pd.concat(frames_by_var, axis=1)
# rank features within each (model, metric) column (rank 1 = highest score),
# then take the median rank across columns for every feature
gmdi_ranks_df = (
    gmdi_scores_df
    .rank(ascending=False)
    .median(axis=1)
    .to_frame(name="median_rank")
    .reset_index()
)
gmdi_ranks_df.sort_values(by="median_rank")

Unnamed: 0,var,median_rank
1,1,1.0
0,0,2.0
4,4,3.0
7,7,4.0
3,3,5.0
5,5,6.0
8,8,7.0
2,2,8.0
9,9,9.0
6,6,9.5


## 2. Classification Example

In [16]:
# generate data from logistic model: logit(E[Y|X]) = x1 + x2
# A seeded generator is used so the simulated data (and therefore every
# downstream score) is identical after Restart Kernel -> Run All.
rng = np.random.default_rng(331)
n = 200  # number of samples
p = 10   # number of features
s = 2    # number of leading features with coefficient 1 (the rest are 0)
X = rng.normal(size=(n, p))
beta = np.concatenate((np.ones(s), np.zeros(p - s)))
# inverse-logit of the linear predictor gives P(Y = 1 | X)
probs = 1 / (1 + np.exp(-np.matmul(X, beta)))
# Bernoulli draw: compare a uniform variate to probs, cast bool -> 0/1 via * 1
y = (rng.uniform(size=n) < probs) * 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12345)

### 2.1 GMDI with default classification settings

In [17]:
# wrap the base classification forest in RF+ and train it on the training split
rf_plus_model = RandomForestPlusClassifier(
    rf_model=rf_classifier,
)
rf_plus_model.fit(X_train, y_train)

In [18]:
# hard labels and class probabilities for the held-out split
preds = rf_plus_model.predict(X_test)
prob_preds = rf_plus_model.predict_proba(X_test)
# (accuracy, ROC AUC); column 1 of prob_preds is used as the score
# NOTE(review): presumably column 1 is the positive class (label 1) - confirm
(
    accuracy_score(y_true=y_test, y_pred=preds),
    roc_auc_score(y_true=y_test, y_score=prob_preds[:, 1]),
)

(0.5909090909090909, 0.7027777777777777)

In [19]:
# GMDI feature importances on the training data, highest score first.
# NOTE(review): the scores displayed below are all negative, so the default
# classification metric is presumably a negated loss rather than r^2 - confirm
# against the imodels documentation.
gmdi_scores = rf_plus_model.get_gmdi_scores(X_train, y_train)
gmdi_scores.sort_values(by="importance", ascending=False)

Unnamed: 0,var,importance
0,0,-0.596457
1,1,-0.626334
8,8,-0.686463
2,2,-0.695486
3,3,-0.698196
4,4,-0.699199
6,6,-0.699878
7,7,-0.701878
9,9,-0.70196
5,5,-0.704134
