# test_incremental.py — tests for dask_ml.wrappers.Incremental
import dask.array as da
import numpy as np
import pytest
import sklearn.model_selection
import sklearn.datasets
from dask.array.utils import assert_eq
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from dask_ml.wrappers import Incremental
from dask_ml.utils import assert_estimator_equal
import dask_ml.metrics
from dask_ml.metrics.scorer import check_scoring
def test_get_params():
    """get_params exposes the wrapped estimator's params under ``estimator__``."""
    inc = Incremental(SGDClassifier())
    params = inc.get_params()
    assert 'estimator__alpha' in params
    assert params['scoring'] is None
def test_set_params():
    """set_params routes double-underscore keys through to the wrapped estimator."""
    inc = Incremental(SGDClassifier())
    inc.set_params(scoring='accuracy', estimator__alpha=0.1)
    params = inc.get_params()
    assert params['estimator__alpha'] == 0.1
    assert params['scoring'] == 'accuracy'
def test_incremental_basic(scheduler, xy_classification):
    """Incremental.fit matches a manual chunk-by-chunk partial_fit loop.

    Also checks that predict returns a lazy dask array, that score agrees
    with the manually fit estimator, and that a single partial_fit call
    over the whole array reaches the same fitted state.
    """
    X, y = xy_classification
    with scheduler() as (s, [_, _]):
        base = SGDClassifier(random_state=0, tol=1e-3)
        manual = clone(base)
        inc = Incremental(base)

        fitted = inc.fit(X, y, classes=[0, 1])
        # Replicate the fit by hand: one partial_fit per dask chunk.
        for block in da.core.slices_from_chunks(X.chunks):
            manual.partial_fit(X[block], y[block[0]], classes=[0, 1])

        assert fitted is inc
        assert isinstance(fitted.estimator_.coef_, np.ndarray)
        np.testing.assert_array_almost_equal(fitted.estimator_.coef_,
                                             manual.coef_)
        assert_estimator_equal(inc.estimator_, manual,
                               exclude=['loss_function_'])

        # Predictions come back lazily as a dask array.
        pred = inc.predict(X)
        assert isinstance(pred, da.Array)
        assert_eq(pred, manual.predict(X))

        # Scores agree with the manually fit estimator.
        assert_eq(inc.score(X, y), manual.score(X, y))

        # One partial_fit over the whole array reaches the same state.
        inc2 = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        inc2.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(inc2.estimator_, manual,
                               exclude=['loss_function_'])
def test_in_gridsearch(scheduler, xy_classification):
    """Incremental can be used as the estimator inside GridSearchCV,
    with fit kwargs (``classes``) forwarded through to partial_fit.
    """
    X, y = xy_classification
    clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
    param_grid = {'estimator__alpha': [0.1, 10]}
    # ``iid`` was deprecated in scikit-learn 0.22 and removed in 0.24;
    # the default behavior since 0.22 matches the old ``iid=False``.
    gs = sklearn.model_selection.GridSearchCV(clf, param_grid)
    with scheduler() as (s, [a, b]):
        gs.fit(X, y, classes=[0, 1])
def test_scoring(scheduler, xy_classification,
                 scoring=dask_ml.metrics.accuracy_score):
    """Passing a raw metric function (not a scorer) to ``scoring`` raises."""
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        expected_msg = 'metric function rather than a scorer'
        with pytest.raises(ValueError, match=expected_msg):
            clf.fit(X, y, classes=np.unique(y))
@pytest.mark.parametrize("scoring", [
"accuracy", "neg_mean_squared_error", "r2", None
])
def test_scoring_string(scheduler, xy_classification, scoring):
X, y = xy_classification
with scheduler() as (s, [a, b]):
clf = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
if scoring:
assert (dask_ml.metrics.scorer.SCORERS[scoring] ==
check_scoring(clf, scoring=scoring))
assert callable(check_scoring(clf, scoring=scoring))
clf.fit(X, y, classes=np.unique(y))
clf.score(X, y)
def test_fit_ndarrays():
    """partial_fit accepts plain NumPy arrays (no dask collections needed)."""
    X = np.ones((10, 5))
    y = np.concatenate([np.zeros(5), np.ones(5)])
    base = SGDClassifier(tol=1e-3)
    inc = Incremental(base)

    inc.partial_fit(X, y, classes=[0, 1])
    base.fit(X, y)

    # The original (unfitted-at-wrap-time) estimator is kept by reference.
    assert inc.estimator is base
    # Fitted attributes are delegated to the fitted clone.
    assert_eq(inc.coef_, inc.estimator_.coef_)
def test_score_ndarrays():
    """score works on both NumPy and dask inputs after fitting."""
    X = np.ones((10, 5))
    y = np.ones(10)
    inc = Incremental(SGDClassifier(tol=1e-3), scoring='accuracy')

    inc.partial_fit(X, y, classes=[0, 1])
    inc.fit(X, y, classes=[0, 1])
    assert inc.score(X, y) == 1

    # Same data wrapped as dask arrays scores identically.
    dX = da.from_array(X, chunks=(2, 5))
    dy = da.from_array(y, chunks=2)
    assert inc.score(dX, dy) == 1