-
Notifications
You must be signed in to change notification settings - Fork 94
/
historic_mean.py
95 lines (80 loc) · 3.48 KB
/
historic_mean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""Historic Mean for Time-Series problems. Predicts the mean of the target for each time
group for regression problems."""
import datatable as dt
import numpy as np
import pandas as pd
from h2oaicore.models import CustomTimeSeriesModel
class HistoricMeanModel(CustomTimeSeriesModel):
    """Baseline time-series regressor that predicts the historic target mean.

    During ``fit`` the mean of the target is computed for every combination of
    the time group columns (``tgc``) excluding the time column itself. During
    ``predict`` each row receives the mean of its group; rows whose group was
    not seen in training (or all rows, when there are no group columns)
    receive the global training mean.
    """

    # Class-level recipe flags; presumably read by the hosting framework --
    # their exact semantics are defined by CustomTimeSeriesModel, not here.
    _can_handle_non_numeric = True
    _regression = True
    _display_name = "HistoricMean"
    _description = "Historic Mean"
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail

    @staticmethod
    def do_acceptance_test():
        """Return True: this recipe opts in to acceptance testing."""
        return True

    @staticmethod
    def is_enabled():
        """Return True: this recipe is always enabled."""
        return True

    def _group_columns(self):
        """Return the time group columns without the time column.

        ``np.setdiff1d`` yields the sorted, de-duplicated difference, so the
        column order is deterministic and identical in ``fit`` and ``predict``.
        An unset (``None``) ``tgc`` is treated as "no group columns".
        """
        if self.tgc is None:
            return []
        return list(np.setdiff1d(self.tgc, self.time_column))

    def _constant_prediction(self, n_rows):
        """Return a 1-D float64 array of length ``n_rows`` filled with the global mean."""
        return np.full(n_rows, self.nan_value, dtype=np.float64)

    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """Compute the per-group target means used for prediction.

        Args:
            X: datatable Frame with the features; it must contain every column
                listed in ``self.params_base['tgc']``.
            y: array-like target values, one per row of ``X``.
            sample_weight, eval_set, sample_weight_eval_set, **kwargs:
                accepted for interface compatibility and ignored.

        Raises:
            RuntimeError: if any time group column is missing from ``X``.
        """
        self.tgc = self.params_base['tgc']
        self.time_column = self.params_base['time_column']
        self.encoder = self.params_base.get('encoder')
        # Global mean: fallback for unseen groups and for the no-group case.
        # np.mean (rather than y.mean()) also accepts plain sequences.
        self.nan_value = np.mean(y)
        self.means = {}

        required_cols = self.tgc if self.tgc is not None else []
        available_cols = set(X.names)
        if not all(col in available_cols for col in required_cols):
            raise RuntimeError(
                "Internal error: need all time group cols (%s) in X, but only got %s" % (self.tgc, X.names))

        group_cols = self._group_columns()

        # NOTE: the same aggregation can be expressed in pandas as
        # groupby(group_cols)["y"].mean().reset_index(); datatable is used here.
        if group_cols:
            self.ntrain = X.shape[0]
            # Work on a copy so the caller's frame does not gain a "y" column.
            frame_with_target = X.copy()
            frame_with_target.cbind(dt.Frame({"y": y}))
            self.group_means = frame_with_target[:, dt.mean(dt.f.y), dt.by(*group_cols)]
            # Have meaningful column names
            self.group_means.names = group_cols + ["yhat"]
        else:
            # No grouping possible: the model is just the global mean.
            self.group_means = self.nan_value

    def predict(self, X, **kwargs):
        """Predict the historic mean for every row of ``X``.

        Args:
            X: datatable Frame to score.
            **kwargs: accepted for interface compatibility and ignored.

        Returns:
            1-D float numpy array with one prediction per row of ``X``. Every
            code path returns the same shape, so callers never receive a
            column vector that would broadcast against a 1-D target.
        """
        n_rows = X.shape[0]

        # Without the group columns no lookup is possible: use the global mean.
        if self.tgc is None or not all(col in X.names for col in self.tgc):
            return self._constant_prediction(n_rows)

        group_cols = self._group_columns()
        if not group_cols:
            # If no groups are available then just return the target average.
            return self._constant_prediction(n_rows)

        # Join the average per group to the input frame; datatable requires
        # the joined frame to be keyed on the join columns.
        self.group_means.key = group_cols
        # Predictions for unknown groups will be None (missing) after the join.
        yhat_dt = X[:, :, dt.join(self.group_means)][:, "yhat"]
        # Replace the missing values with the global mean; the replacement is
        # cast to float64 so that it matches the type of the "yhat" column.
        yhat_dt.replace(None, np.float64(self.nan_value))
        return yhat_dt.to_numpy()[:, 0]