In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [41]:
## Simulate Data

n_obs = 10
n_cols = 20

X = pd.DataFrame(np.random.rand(n_obs, n_cols), columns=range(n_cols))
coefs = np.random.rand(n_cols)
coefs[3] = 1000

probs = 1 / (1 + np.exp(-(-300 + (coefs*X).sum(axis=1) + 0.1*np.random.normal(size=n_obs))))
y = pd.Series([0]*len(probs))
y[probs>0.5] = 1
print("Percent y=1:\t" + str(np.sum(y) / float(len(y))))
print("Percent y=0:\t" + str(1 - (np.sum(y) / float(len(y)))))

Percent y=1:	0.8
Percent y=0:	0.2


In [42]:
## This is the marginal effect for varying from 0.05 to 0.95 percentile
## For categorical variable it is effect of varying from 0 to 1
def local_marginal_effects(model, X, y):

    def determine_categorical_or_continuous(X):
        out_dict = {}
        for col in X.columns:
            if((X[col].max() == 1) & (X[col].min() == 0)):
                out_dict[col] = True
            else:
                out_dict[col] = False
        return out_dict

    def calculate_effects(row, model, mean_X, categorical_dict):
        row_indices = row.index
        row = row.values.reshape(1,-1)
        out_series = pd.Series(np.nan, index=row_indices)
        prediction = rf_model.predict_proba(row)[:,1]
        for i, col in enumerate(row_indices.values):
            row_old = row.copy()
            if(categorical_dict[col]):
                row[:,i] = 1-row[:,i]
                out_series[col] = prediction - model.predict_proba(row)[:,1]
            else:
                row[:,i] = mean_X[col]
                out_series[col] = prediction - model.predict_proba(row)[:,1]
            row = row_old
        return out_series

    dydx = pd.DataFrame(np.nan, index=X.index, columns=X.columns)
    mean_X = X.mean()
    categorical_dict = determine_categorical_or_continuous(X)
    out_df = X.apply(calculate_effects, axis=1, model=model, mean_X=mean_X, categorical_dict=categorical_dict)       

    return out_df

In [43]:
## Random Forest

%time rf_model = RandomForestClassifier(n_estimators=500, n_jobs=12)
%time rf_model.fit(X, y)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 46 µs
CPU times: user 1.37 s, sys: 304 ms, total: 1.68 s
Wall time: 1.63 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=12, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [44]:
%time out_df = local_marginal_effects(rf_model, X, y)

CPU times: user 1min 21s, sys: 28.6 s, total: 1min 50s
Wall time: 1min 53s


In [45]:
out_df.mean()

0    -0.0254
1    -0.0074
2    -0.0164
3    -0.0390
4    -0.0038
5     0.0012
6     0.0002
7    -0.0008
8    -0.0132
9    -0.0106
10   -0.0050
11   -0.0026
12   -0.0094
13    0.0022
14   -0.0078
15   -0.0022
16   -0.0208
17   -0.0102
18   -0.0036
19   -0.0004
dtype: float64

In [7]:
from multiprocessing import Pool

def parallelize_dataframe(df, func, num_cores=12, num_partitions=10, **kwargs):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split, **kwargs))
    pool.close()
    pool.join()
    return df

## This is the marginal effect for varying from 0.05 to 0.95 percentile
## For categorical variable it is effect of varying from 0 to 1
def local_marginal_effects(model, X, y):

    def determine_categorical_or_continuous(X):
        out_dict = {}
        for col in X.columns:
            if((X[col].max() == 1) & (X[col].min() == 0)):
                out_dict[col] = True
            else:
                out_dict[col] = False
        return out_dict

    def calculate_effects(row, model, mean_X, categorical_dict):
        row_indices = row.index
        row = row.values.reshape(1,-1)
        out_series = pd.Series(np.nan, index=row_indices)
        prediction = rf_model.predict_proba(row)[:,1]
        for i, col in enumerate(row_indices.values):
            row_old = row.copy()
            if(categorical_dict[col]):
                row[:,i] = 1-row[:,i]
                out_series[col] = prediction - model.predict_proba(row)[:,1]
            else:
                row[:,i] = mean_X[col]
                out_series[col] = prediction - model.predict_proba(row)[:,1]
            row = row_old
        return out_series

    dydx = pd.DataFrame(np.nan, index=X.index, columns=X.columns)
    mean_X = X.mean()
    categorical_dict = determine_categorical_or_continuous(X)
    #out_df = X.apply(calculate_effects, axis=1, model=model, mean_X=mean_X, categorical_dict=categorical_dict)       
    out_df = parallelize_dataframe(X, calculate_effects, model=model, mean_X=mean_X, categorical_dict=categorical_dict)       


    return out_df