forked from numpy/numpy
-
Notifications
You must be signed in to change notification settings - Fork 1
/
bench_io.py
244 lines (188 loc) · 7.46 KB
/
bench_io.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
from .common import Benchmark, get_squares
import numpy as np
from io import StringIO
class Copy(Benchmark):
params = ["int8", "int16", "float32", "float64",
"complex64", "complex128"]
param_names = ['type']
def setup(self, typename):
dtype = np.dtype(typename)
self.d = np.arange((50 * 500), dtype=dtype).reshape((500, 50))
self.e = np.arange((50 * 500), dtype=dtype).reshape((50, 500))
self.e_d = self.e.reshape(self.d.shape)
self.dflat = np.arange((50 * 500), dtype=dtype)
def time_memcpy(self, typename):
self.d[...] = self.e_d
def time_memcpy_large_out_of_place(self, typename):
l = np.ones(1024**2, dtype=np.dtype(typename))
l.copy()
def time_cont_assign(self, typename):
self.d[...] = 1
def time_strided_copy(self, typename):
self.d[...] = self.e.T
def time_strided_assign(self, typename):
self.dflat[::2] = 2
class CopyTo(Benchmark):
def setup(self):
self.d = np.ones(50000)
self.e = self.d.copy()
self.m = (self.d == 1)
self.im = (~ self.m)
self.m8 = self.m.copy()
self.m8[::8] = (~ self.m[::8])
self.im8 = (~ self.m8)
def time_copyto(self):
np.copyto(self.d, self.e)
def time_copyto_sparse(self):
np.copyto(self.d, self.e, where=self.m)
def time_copyto_dense(self):
np.copyto(self.d, self.e, where=self.im)
def time_copyto_8_sparse(self):
np.copyto(self.d, self.e, where=self.m8)
def time_copyto_8_dense(self):
np.copyto(self.d, self.e, where=self.im8)
class Savez(Benchmark):
def setup(self):
self.squares = get_squares()
def time_vb_savez_squares(self):
np.savez('tmp.npz', **self.squares)
class LoadtxtCSVComments(Benchmark):
# benchmarks for np.loadtxt comment handling
# when reading in CSV files
params = [10, int(1e2), int(1e4), int(1e5)]
param_names = ['num_lines']
def setup(self, num_lines):
data = [u'1,2,3 # comment'] * num_lines
# unfortunately, timeit will only run setup()
# between repeat events, but not for iterations
# within repeats, so the StringIO object
# will have to be rewinded in the benchmark proper
self.data_comments = StringIO(u'\n'.join(data))
def time_comment_loadtxt_csv(self, num_lines):
# benchmark handling of lines with comments
# when loading in from csv files
# inspired by similar benchmark in pandas
# for read_csv
# need to rewind StringIO object (unfortunately
# confounding timing result somewhat) for every
# call to timing test proper
np.loadtxt(self.data_comments,
delimiter=u',')
self.data_comments.seek(0)
class LoadtxtCSVdtypes(Benchmark):
# benchmarks for np.loadtxt operating with
# different dtypes parsed / cast from CSV files
params = (['float32', 'float64', 'int32', 'int64',
'complex128', 'str', 'object'],
[10, int(1e2), int(1e4), int(1e5)])
param_names = ['dtype', 'num_lines']
def setup(self, dtype, num_lines):
data = [u'5, 7, 888'] * num_lines
self.csv_data = StringIO(u'\n'.join(data))
def time_loadtxt_dtypes_csv(self, dtype, num_lines):
# benchmark loading arrays of various dtypes
# from csv files
# state-dependent timing benchmark requires
# rewind of StringIO object
np.loadtxt(self.csv_data,
delimiter=u',',
dtype=dtype)
self.csv_data.seek(0)
class LoadtxtCSVStructured(Benchmark):
# benchmarks for np.loadtxt operating with
# a structured data type & CSV file
def setup(self):
num_lines = 50000
data = [u"M, 21, 72, X, 155"] * num_lines
self.csv_data = StringIO(u'\n'.join(data))
def time_loadtxt_csv_struct_dtype(self):
# obligate rewind of StringIO object
# between iterations of a repeat:
np.loadtxt(self.csv_data,
delimiter=u',',
dtype=[('category_1', 'S1'),
('category_2', 'i4'),
('category_3', 'f8'),
('category_4', 'S1'),
('category_5', 'f8')])
self.csv_data.seek(0)
class LoadtxtCSVSkipRows(Benchmark):
# benchmarks for loadtxt row skipping when
# reading in csv file data; a similar benchmark
# is present in the pandas asv suite
params = [0, 500, 10000]
param_names = ['skiprows']
def setup(self, skiprows):
np.random.seed(123)
test_array = np.random.rand(100000, 3)
self.fname = 'test_array.csv'
np.savetxt(fname=self.fname,
X=test_array,
delimiter=',')
def time_skiprows_csv(self, skiprows):
np.loadtxt(self.fname,
delimiter=',',
skiprows=skiprows)
class LoadtxtReadUint64Integers(Benchmark):
# pandas has a similar CSV reading benchmark
# modified to suit np.loadtxt
params = [550, 1000, 10000]
param_names = ['size']
def setup(self, size):
arr = np.arange(size).astype('uint64') + 2**63
self.data1 = StringIO(u'\n'.join(arr.astype(str).tolist()))
arr = arr.astype(object)
arr[500] = -1
self.data2 = StringIO(u'\n'.join(arr.astype(str).tolist()))
def time_read_uint64(self, size):
# mandatory rewind of StringIO object
# between iterations of a repeat:
np.loadtxt(self.data1)
self.data1.seek(0)
def time_read_uint64_neg_values(self, size):
# mandatory rewind of StringIO object
# between iterations of a repeat:
np.loadtxt(self.data2)
self.data2.seek(0)
class LoadtxtUseColsCSV(Benchmark):
# benchmark selective column reading from CSV files
# using np.loadtxt
params = [2, [1, 3], [1, 3, 5, 7]]
param_names = ['usecols']
def setup(self, usecols):
num_lines = 5000
data = [u'0, 1, 2, 3, 4, 5, 6, 7, 8, 9'] * num_lines
self.csv_data = StringIO(u'\n'.join(data))
def time_loadtxt_usecols_csv(self, usecols):
# must rewind StringIO because of state
# dependence of file reading
np.loadtxt(self.csv_data,
delimiter=u',',
usecols=usecols)
self.csv_data.seek(0)
class LoadtxtCSVDateTime(Benchmark):
# benchmarks for np.loadtxt operating with
# datetime data in a CSV file
params = [20, 200, 2000, 20000]
param_names = ['num_lines']
def setup(self, num_lines):
# create the equivalent of a two-column CSV file
# with date strings in the first column and random
# floating point data in the second column
dates = np.arange('today', 20, dtype=np.datetime64)
np.random.seed(123)
values = np.random.rand(20)
date_line = u''
for date, value in zip(dates, values):
date_line += (str(date) + ',' + str(value) + '\n')
# expand data to specified number of lines
data = date_line * (num_lines // 20)
self.csv_data = StringIO(data)
def time_loadtxt_csv_datetime(self, num_lines):
# rewind StringIO object -- the timing iterations
# are state-dependent
X = np.loadtxt(self.csv_data,
delimiter=u',',
dtype=([('dates', 'M8[us]'),
('values', 'float64')]))
self.csv_data.seek(0)