-
-
Notifications
You must be signed in to change notification settings - Fork 254
/
conftest.py
182 lines (139 loc) · 4.32 KB
/
conftest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import contextlib
import string
import dask.dataframe as dd
import numpy as np
import pytest
from distributed.utils_test import cluster
from dask_ml.datasets import (
make_blobs,
make_classification,
make_counts,
make_regression,
)
# pytest.register_assert_rewrite('dask_ml.utils')
@pytest.fixture
def xy_classification():
    """Dask-array (X, y) pair for a classification problem."""
    # make_classification already returns the (X, y) tuple; pass it straight through.
    return make_classification(chunks=(10, 20), random_state=0)
@pytest.fixture
def xy_classification_pandas(xy_classification):
    """Dask DataFrame (Series) X, y pair for classification."""
    data, target = xy_classification
    # Label the feature columns 'a', 'b', ... to get a named DataFrame.
    n_cols = data.shape[1]
    frame = dd.from_dask_array(data, columns=list(string.ascii_letters[:n_cols]))
    series = dd.from_dask_array(target, "target")
    return frame, series
@pytest.fixture
def xy_regression():
    """X, y pair for regression"""
    X, y = make_regression(chunks=(10, 20), random_state=0)
    return X, y
@pytest.fixture
def xy_counts():
    """X, y pair for predicting counts."""
    # make_counts returns the (X, y) tuple directly.
    return make_counts(n_samples=100, n_features=5, chunks=10)
@pytest.fixture
def Xl_blobs():
    """
    Tuple of (X, labels) for a classification task; both
    elements are dask arrays.
    """
    # 'labels' instead of the ambiguous single-letter 'l'.
    X, labels = make_classification(
        n_samples=1000, n_features=4, chunks=500, random_state=1
    )
    return X, labels
@pytest.fixture
def Xl_blobs_easy():
    """
    Tuple of (X, labels) for clustering.

    The blob centers are far apart relative to the tiny cluster
    std, so the clustering problem is easy.
    """
    spread_out_centers = np.array([[-7, -7], [0, 0], [7, 7]])
    return make_blobs(
        cluster_std=0.1, centers=spread_out_centers, chunks=50, random_state=0
    )
@pytest.fixture
def X_blobs(Xl_blobs):
    """The feature matrix from the ``Xl_blobs`` fixture."""
    X, _ = Xl_blobs
    return X
@pytest.fixture
def single_chunk_classification():
    """X, y pair for classification.

    Both arrays consist of a single block (chunksize 100), which is
    useful for testing ``partial_fit`` methods.
    """
    return make_classification(chunks=100, random_state=0)
@pytest.fixture
def single_chunk_regression():
    """X, y pair for regression.

    Both arrays consist of a single block (chunksize 100), which is
    useful for testing ``partial_fit`` methods.
    """
    return make_regression(chunks=100, random_state=0)
@pytest.fixture
def single_chunk_count_classification():
    """X, y pair for classification where X contains count data.

    Both arrays consist of a single block (chunksize 100), which is
    useful for testing ``partial_fit`` methods.
    """
    features, labels = make_classification(chunks=100, random_state=0)
    # Scale up the magnitudes and truncate to ints to mimic counts.
    counts = (abs(features) * 10).astype(int)
    return counts, labels
@pytest.fixture
def single_chunk_binary_classification():
    """X, y pair for classification where X contains binary features.

    Both arrays consist of a single block (chunksize 100), which is
    useful for testing ``partial_fit`` methods.
    """
    features, labels = make_classification(chunks=100, random_state=0)
    # Threshold magnitudes at zero to produce 0/1 features.
    binary_features = (abs(features) > 0).astype(int)
    return binary_features, labels
@pytest.fixture
def single_chunk_blobs():
    """X, y pair for clustering.

    Both arrays consist of a single block (chunksize 100), which is
    useful for testing ``partial_fit`` methods.
    """
    return make_blobs(chunks=100, random_state=0)
@contextlib.contextmanager
def not_cluster(nworkers=2, **kwargs):
    """A no-op stand-in for ``distributed.utils_test.cluster``.

    Yields a ``(scheduler, workers)``-shaped tuple of placeholders so
    tests parametrized over schedulers can unpack the same structure
    whether or not a real cluster is in play. Extra keyword arguments
    are accepted and ignored for signature compatibility.
    """
    placeholder_workers = [None for _ in range(nworkers)]
    yield (None, placeholder_workers)
@pytest.fixture(scope="module", params=["threads", "distributed"])
def scheduler(request):
    """Yield a cluster context manager for the parametrized scheduler.

    The real ``distributed`` test cluster is used for the
    "distributed" param; otherwise the no-op ``not_cluster`` stand-in.
    """
    yield cluster if request.param == "distributed" else not_cluster
@pytest.fixture
def medium_size_regression():
    """X, y pair for regression with N >> p.

    Samples greatly outnumber features, which helps the stability of
    solutions under test.
    """
    n_features = 100
    return make_regression(
        chunks=100,
        n_samples=500,
        n_features=n_features,
        n_informative=n_features,
        random_state=0,
    )
@pytest.fixture
def medium_size_counts():
    """X, y pair of count data with N >> p.

    Samples outnumber the total features, giving more stable
    solutions. Useful for testing the accuracy of solvers.
    """
    n_samples, n_features = 500, 100
    # Shrink the signal with the feature count to keep counts moderate.
    noise_scale = 1 / np.sqrt(n_features)
    return make_counts(
        chunks=100,
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features,
        random_state=0,
        scale=noise_scale,
    )