/
h2o_explainers.py
170 lines (139 loc) · 5.68 KB
/
h2o_explainers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
class H2OPartialDependenceExplainer:
"""
Partial Dependence explanation for binary classification H2O models.
Works for both numerical and categorical (enum) features.
Parameters
----------
h2o_model : H2OEstimator
H2O Model that was already fitted on the data.
Attributes
----------
feature_name_ : str
The input feature_name to the .fit unmodified, will
be used in subsequent method.
is_cat_col_ : bool
Whether the feature we're aiming to explain is a
categorical feature or not.
partial_dep_ : DataFrame
A pandas dataframe that contains three columns, the
feature's value and their corresponding mean prediction
and standard deviation of the prediction. e.g.
feature_name mean_response stddev_response
3000.000000 0.284140 0.120659
318631.578947 0.134414 0.076054
634263.157895 0.142961 0.083630
The feature_name column will be the actual feature_name that
we pass to the .fit method whereas the mean_response and
stddev_response column will be fixed columns generated.
"""
def __init__(self, h2o_model):
self.h2o_model = h2o_model
def fit(self, data, feature_name, n_bins=20):
"""
Obtain the partial dependence result.
Parameters
----------
data : H2OFrame, shape [n_samples, n_features]
Input data to the H2O estimator/model.
feature_name : str
Feature name in the data what we wish to explain.
n_bins : int, default 20
Number of bins used. For categorical columns, we will make sure the number
of bins exceed the distinct level count.
Returns
-------
self
"""
self.is_cat_col_ = data[feature_name].isfactor()[0]
if self.is_cat_col_:
n_levels = len(data[feature_name].levels()[0])
n_bins = max(n_levels, n_bins)
partial_dep = self.h2o_model.partial_plot(data, cols=[feature_name],
nbins=n_bins, plot=False)
self.feature_name_ = feature_name
self.partial_dep_ = partial_dep[0].as_data_frame()
return self
def plot(self, centered=True, plot_stddev=True):
"""
Use the partial dependence result to generate
a partial dependence plot (using matplotlib).
Parameters
----------
centered : bool, default True
Center the partial dependence plot by subtacting every partial
dependence result table's column value with the value of the first
column, i.e. first column's value will serve as the baseline
(centered at 0) for all other values.
plot_stddev : bool, default True
Apart from plotting the mean partial dependence, also show the
standard deviation as a fill between.
Returns
-------
matplotlib figure
"""
figure = GridSpec(5, 1)
ax1 = plt.subplot(figure[0, :])
self._plot_title(ax1)
ax2 = plt.subplot(figure[1:, :])
self._plot_content(ax2, centered, plot_stddev)
return figure
def _plot_title(self, ax):
font_family = 'Arial'
title = "Partial Dependence Plot for '{}' feature".format(self.feature_name_)
subtitle = 'Number of unique grid points: {}'.format(self.partial_dep_.shape[0])
title_fontsize = 15
subtitle_fontsize = 12
ax.set_facecolor('white')
ax.text(
0, 0.7, title,
fontsize=title_fontsize, fontname=font_family)
ax.text(
0, 0.4, subtitle, color='grey',
fontsize=subtitle_fontsize, fontname=font_family)
ax.axis('off')
def _plot_content(self, ax, centered, plot_stddev):
# pd (partial dependence)
pd_linewidth = 2
pd_markersize = 5
pd_color = '#1A4E5D'
fill_alpha = 0.2
fill_color = '#66C2D7'
zero_linewidth = 1.5
zero_color = '#E75438'
xlabel_fontsize = 10
pd_mean = self.partial_dep_['mean_response']
if centered:
# center the partial dependence plot by subtacting every value
# with the value of the first column, i.e. first column's value
# will serve as the baseline (centered at 0) for all other values
pd_mean -= pd_mean[0]
std = self.partial_dep_['stddev_response']
upper = pd_mean + std
lower = pd_mean - std
x = self.partial_dep_[self.feature_name_]
ax.plot(
x, pd_mean, color=pd_color, linewidth=pd_linewidth,
marker='o', markersize=pd_markersize)
ax.plot(
x, [0] * pd_mean.size, color=zero_color,
linestyle='--', linewidth=zero_linewidth)
if plot_stddev:
ax.fill_between(x, upper, lower, alpha=fill_alpha, color=fill_color)
ax.set_xlabel(self.feature_name_, fontsize=xlabel_fontsize)
self._modify_axis(ax)
def _modify_axis(self, ax):
tick_labelsize = 8
tick_colors = '#9E9E9E'
tick_labelcolor = '#424242'
ax.tick_params(
axis='both', which='major', colors=tick_colors,
labelsize=tick_labelsize, labelcolor=tick_labelcolor)
ax.set_facecolor('white')
ax.get_yaxis().tick_left()
ax.get_xaxis().tick_bottom()
for direction in ('top', 'left', 'right', 'bottom'):
ax.spines[direction].set_visible(False)
for axis in ('x', 'y'):
ax.grid(True, 'major', axis, ls='--', lw=.5, c='k', alpha=.3)