# # Decomposing Random Forest predictions with treeinterpreter

### Let's take a sample dataset, train a random forest model, predict some values on the test set, and then decompose the predictions

In [None]:
## load libraries
from treeinterpreter import treeinterpreter as ti
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [None]:
## Load the Boston housing dataset and fit a random forest regressor
## on its first 300 rows.
from sklearn.datasets import load_boston

boston = load_boston()
train_rows = slice(0, 300)
rf = RandomForestRegressor()
rf.fit(boston.data[train_rows], boston.target[train_rows])

In [None]:
## Pick two arbitrary rows (300 and 309) that yield two different predictions.
instances = boston.data[[300, 309]]
for i, row in enumerate(instances):
    # predict() expects a 2-D (n_samples, n_features) array, so hand it each
    # row as a one-row matrix rather than a bare 1-D feature vector.
    print("Instance %d prediction: %s" % (i, rf.predict(row.reshape(1, -1))))

In [None]:
#### Predictions that the random forest model made for the two data points are quite different. 
#### But why? We can now decompose the predictions into the bias term (which is just the trainset mean) and individual feature contributions, 
#### so we see which features contributed to the difference and by how much.
#### We can simply call the treeinterpreter predict method with the model and the data.

In [None]:
## Decompose the forest's predictions for the selected instances into the
## predicted value, the bias term and the per-feature contributions.
decomposition = ti.predict(rf, instances)
prediction, bias, contributions = decomposition

In [None]:
## For every instance, print its bias followed by the feature contributions,
## ordered from largest to smallest absolute contribution.
for i, instance_contributions in enumerate(contributions):
    # Label with the actual instance index and read that instance's own bias
    # entry (previously hard-coded to "Instance 0" / bias[0]).
    print("Instance %d" % i)
    print("Bias (trainset mean) %s" % (bias[i],))
    print("Feature Contributions:")
    ranked = sorted(zip(instance_contributions, boston.feature_names),
                    key=lambda pair: -abs(pair[0]))
    for c, feature in ranked:
        print("%s %s" % (feature, round(c, 2)))
    # One separator per instance, not one after every feature line.
    print("-" * 10)

In [None]:
#### The feature contributions are sorted by their absolute impact. We can see that in the first instance,
#### where the prediction was high, most of the positive contributions came from the RM, LSTAT and PTRATIO features.
#### In the second instance the predicted value is much lower, since the RM feature actually has a very large negative
#### impact that is not offset by the positive impact of other features, thus taking the prediction below the dataset mean.
###### But is the decomposition actually correct? This is easy to check: the bias and contributions need to sum up to the predictions.

In [None]:
## Sanity check: the bias plus the row-wise sum of the feature contributions
## should reproduce the predictions printed on the first line.
print(prediction)
reconstructed = bias + np.sum(contributions, axis=1)
print(reconstructed)

In [None]:
###### Note that when summing up the contributions we are dealing with floating point numbers, so the values can differ slightly due to rounding errors.
#### Let's split the dataset into two test datasets and compute the average estimated price for each.

In [None]:
## Two test sets: rows 300-399 and rows 400 onward.
ds1, ds2 = boston.data[300:400], boston.data[400:]

## Average predicted value for each test set, one per line.
for test_set in (ds1, ds2):
    mean_prediction = np.mean(rf.predict(test_set))
    print(mean_prediction)

In [None]:
###### We can see that the average predicted prices for the houses in the two datasets are quite different.
###### We can now break down the contributors to this difference: which features contribute to it and by how much.

prediction1, bias1, contributions1 = ti.predict(rf, ds1)
prediction2, bias2, contributions2 = ti.predict(rf, ds2)

###### Now we can calculate the mean contribution of each feature in each dataset.

totalC1 = np.mean(contributions1, axis=0)
totalC2 = np.mean(contributions2, axis=0)

###### Since the biases are equal (the training dataset is the same), the difference in the predicted values can come
###### only from the feature contributions. In other words, the sum of the feature contribution differences should be
###### equal to the difference in average prediction values.

contribution_gap = totalC1 - totalC2
print(np.sum(contribution_gap))
print(np.mean(prediction1) - np.mean(prediction2))

###### Finally, we can print out the differences of feature contributions in the two datasets. The sum of these is
###### exactly the difference between the average prediction values.

ranked_gap = sorted(zip(contribution_gap, boston.feature_names), reverse=True)
for c, feature in ranked_gap:
    print("%s %s" % (feature, round(c, 2)))
    print("-" * 5)



In [None]:
# Classification trees and random forest
#### In classification trees, the features contribute to the estimated probabilities of a given class.
#### We can see this on the iris dataset.

### Load the required libraries and the data

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
iris = load_iris()

rf = RandomForestClassifier(max_depth=4)

#### Shuffle the row indices, then train on the first 100 shuffled rows

idx = list(range(len(iris.target)))
np.random.shuffle(idx)

shuffled_data = iris.data[idx]
shuffled_target = iris.target[idx]
rf.fit(shuffled_data[:100], shuffled_target[:100])

#### Predict class probabilities for a single row that was not used for training

instance = shuffled_data[100:101]
print(rf.predict_proba(instance))

##### Breakdown of feature contributions

prediction, bias, contribution = ti.predict(rf, instance)
print("Prediction: %s" % (prediction,))
print("Bias (trainset mean): %s" % (bias,))
print("Feature Contributions:")
for c, feature in zip(contribution[0], iris.feature_names):
    print("%s %s" % (feature, c))
