-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
pickle_dataset.py
175 lines (121 loc) · 4.61 KB
/
pickle_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import threading
import six
import six.moves.cPickle as pickle
from chainer.dataset import dataset_mixin
class PickleDatasetWriter(object):

    """Writer that serializes examples for a :class:`PickleDataset`.

    A user prepares the data consumed by :class:`PickleDataset` through
    this class: each call to :meth:`write` appends one pickled example
    to the underlying storage.

    Args:
        writer: File like object that supports ``write`` and ``tell`` methods.
        protocol (int): Valid protocol for :mod:`pickle`.

    .. seealso: chainer.datasets.PickleDataset

    """

    def __init__(self, writer, protocol=pickle.HIGHEST_PROTOCOL):
        self._writer = writer
        self._protocol = protocol
        # Byte offset at which each written example starts.
        self._positions = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self._writer.close()

    def write(self, x):
        # Record where this example begins before serializing it, so the
        # offset list always points at the start of a pickle stream.
        start = self._writer.tell()
        pickle.dump(x, self._writer, protocol=self._protocol)
        self._positions.append(start)

    def flush(self):
        # Some writer objects do not expose ``flush``; skip silently then.
        if hasattr(self._writer, 'flush'):
            self._writer.flush()
class PickleDataset(dataset_mixin.DatasetMixin):

    """Dataset stored in a storage using pickle.

    :mod:`pickle` is the default serialization library of Python.
    This dataset keeps arbitrary pickled objects in a storage. Even a
    dataset too large for memory can live on a big storage such as an
    HDD, while individual examples stay randomly accessible.

    .. testsetup::

        import tempfile
        fs, path_to_data = tempfile.mkstemp()

    >>> with chainer.datasets.open_pickle_dataset_writer(path_to_data) as w:
    ...     w.write((1, 2.0, 'hello'))
    ...     w.write((2, 3.0, 'good-bye'))
    ...
    >>> with chainer.datasets.open_pickle_dataset(path_to_data) as dataset:
    ...     print(dataset[1])
    ...
    (2, 3.0, 'good-bye')

    .. testcleanup::

        import os
        os.close(fs)

    Args:
        reader: File like object. `reader` must support random access.

    """

    def __init__(self, reader):
        # Only py3 supports `seekable` method
        if six.PY3 and not reader.seekable():
            raise ValueError('reader must support random access')
        self._reader = reader
        # Scan the stream once up front, remembering the byte offset at
        # which every pickled example begins; EOFError marks the end.
        offsets = []
        reader.seek(0)
        while True:
            offset = reader.tell()
            try:
                pickle.load(reader)
            except EOFError:
                break
            offsets.append(offset)
        self._positions = offsets
        # Guards the seek+load pair so concurrent reads do not interleave.
        self._lock = threading.RLock()

    def close(self):
        """Closes a file reader.

        After a user calls this method, the dataset will no longer be
        accessible..
        """
        with self._lock:
            self._reader.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __len__(self):
        return len(self._positions)

    def get_example(self, index):
        # seek and load must happen atomically w.r.t. other threads.
        with self._lock:
            self._reader.seek(self._positions[index])
            return pickle.load(self._reader)
def open_pickle_dataset(path):
    """Opens a dataset stored in a given path.

    This is a helper function to open :class:`PickleDataset`. It opens a given
    file in binary mode, and creates a :class:`PickleDataset` instance.

    This method does not close the opened file. A user needs to call
    :func:`PickleDataset.close` or use `with`:

    .. code-block:: python

        with chainer.datasets.open_pickle_dataset('path') as dataset:
            pass  # use dataset

    Args:
        path (str): Path to a dataset.

    Returns:
        chainer.datasets.PickleDataset: Opened dataset.

    .. seealso: chainer.datasets.PickleDataset

    """
    reader = open(path, 'rb')
    try:
        return PickleDataset(reader)
    except Exception:
        # PickleDataset.__init__ may raise (e.g. ValueError for a reader
        # without random access); close the handle instead of leaking it.
        reader.close()
        raise
def open_pickle_dataset_writer(path, protocol=pickle.HIGHEST_PROTOCOL):
    """Opens a writer to make a PickleDataset.

    This is a helper function to open :class:`PickleDatasetWriter`. It opens a
    given file in binary mode and creates a :class:`PickleDatasetWriter`
    instance.

    This method does not close the opened file. A user needs to call
    :func:`PickleDatasetWriter.close` or use `with`:

    .. code-block:: python

        with chainer.datasets.open_pickle_dataset_writer('path') as writer:
            pass  # use writer

    Args:
        path (str): Path to a dataset.
        protocol (int): Valid protocol for :mod:`pickle`.

    Returns:
        chainer.datasets.PickleDatasetWriter: Opened writer.

    .. seealso: chainer.datasets.PickleDataset

    """
    return PickleDatasetWriter(open(path, 'wb'), protocol=protocol)