From b1cb12d726078589bf99b31ab66f553a1ab717a9 Mon Sep 17 00:00:00 2001 From: Richard Calland Date: Wed, 28 Jun 2017 15:30:17 +0900 Subject: [PATCH 1/8] Fetch the SVHN dataset SVHN is very similar to MNIST. I added code that will download and create datasets for train and test samples the same way `get_mnist()` does. --- chainer/datasets/svhn.py | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 chainer/datasets/svhn.py diff --git a/chainer/datasets/svhn.py b/chainer/datasets/svhn.py new file mode 100644 index 000000000000..d9c1d56dc1fa --- /dev/null +++ b/chainer/datasets/svhn.py @@ -0,0 +1,73 @@ +import os + +import numpy +from scipy import io + + +from chainer.dataset import download +from chainer.datasets import tuple_dataset + +def get_svhn(withlabel=True, scale=1., dtype=numpy.float32, label_dtype=numpy.int32): + """Gets the SVHN dataset. + + `SVHN <http://ufldl.stanford.edu/housenumbers/>`_ is a dataset similar to MNIST but + composed of cropped images of house numbers. The functionality is identical to the MNIST dataset, + with the exception that there is no ``ndim`` argument. + + Args: + withlabel (bool): If ``True``, it returns datasets with labels. In this + case, each example is a tuple of an image and a label. Otherwise, + the datasets only contain images. + scale (float): Pixel value scale. If it is 1 (default), pixels are + scaled to the interval ``[0, 1]``. + dtype: Data type of resulting image arrays. + label_dtype: Data type of the labels. + + Returns: + A tuple of two datasets. If ``withlabel`` is ``True``, both datasets + are :class:`~chainer.datasets.TupleDataset` instances. Otherwise, both + datasets are arrays of images. 
+ + """ + train_raw = _retrieve_svhn_training() + train = _preprocess_svhn(train_raw, withlabel, scale, dtype, + label_dtype) + test_raw = _retrieve_svhn_test() + test = _preprocess_svhn(test_raw, withlabel, scale, dtype, + label_dtype) + return train, test + +def _preprocess_svhn(raw, withlabel, scale, image_dtype, label_dtype): + images = raw["x"].transpose(3,2,0,1) + images = images.astype(image_dtype) + images *= scale / 255. + + labels = raw["y"].astype(label_dtype).flatten() + # labels go from 1-10, but we want zero-ed 0-9 + labels -= 1 + + if withlabel: + return tuple_dataset.TupleDataset(images, labels) + else: + return images + +def _retrieve_svhn_training(): + return _retrieve_svhn("train.npz", "http://ufldl.stanford.edu/housenumbers/train_32x32.mat") + +def _retrieve_svhn_test(): + return _retrieve_svhn("test.npz", "http://ufldl.stanford.edu/housenumbers/test_32x32.mat") + +def _retrieve_svhn(name, url): + root = download.get_dataset_directory('pfnet/chainer/svhn') + path = os.path.join(root, name) + return download.cache_or_load_file( + path, lambda path: _make_npz(path, url), numpy.load) + +def _make_npz(path, url): + _path = download.cached_download(url) + raw = io.loadmat(_path) + images = raw["X"].astype(numpy.uint8) + labels = raw["y"].astype(numpy.uint8) + + numpy.savez_compressed(path, x=images, y=labels) + return {'x': images, 'y': labels} From 71d804a312ebdbd2844a96b5f963c0fc357a2de3 Mon Sep 17 00:00:00 2001 From: Richard Calland Date: Tue, 4 Jul 2017 15:59:13 +0900 Subject: [PATCH 2/8] Fix label ordering Made label ordering intuitive --- chainer/datasets/svhn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chainer/datasets/svhn.py b/chainer/datasets/svhn.py index d9c1d56dc1fa..d87c99eaa32d 100644 --- a/chainer/datasets/svhn.py +++ b/chainer/datasets/svhn.py @@ -3,7 +3,6 @@ import numpy from scipy import io - from chainer.dataset import download from chainer.datasets import tuple_dataset @@ -43,8 +42,9 @@ def 
_preprocess_svhn(raw, withlabel, scale, image_dtype, label_dtype): images *= scale / 255. labels = raw["y"].astype(label_dtype).flatten() - # labels go from 1-10, but we want zero-ed 0-9 - labels -= 1 + # labels go from 1-10, with the digit "0" having label 10. + # Set "0" to be label 0 to restore expected ordering + labels[labels==10] = 0 if withlabel: return tuple_dataset.TupleDataset(images, labels) From 8738dabe194bd9ec81d91e368abbb3f9ee64b630 Mon Sep 17 00:00:00 2001 From: Richard Calland Date: Tue, 4 Jul 2017 16:00:21 +0900 Subject: [PATCH 3/8] Update __init__.py for svhn dataset --- chainer/datasets/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/chainer/datasets/__init__.py b/chainer/datasets/__init__.py index b9e2f431f6a1..a43d90c376ae 100644 --- a/chainer/datasets/__init__.py +++ b/chainer/datasets/__init__.py @@ -2,6 +2,7 @@ from chainer.datasets import dict_dataset # NOQA from chainer.datasets import image_dataset # NOQA from chainer.datasets import mnist # NOQA +from chainer.datasets import svhn # NOQA from chainer.datasets import ptb # NOQA from chainer.datasets import sub_dataset # NOQA from chainer.datasets import transform_dataset # NOQA From 02ae96e691c3d545059092d720e5dfb77d0ae6f1 Mon Sep 17 00:00:00 2001 From: rcalland Date: Mon, 10 Jul 2017 17:41:48 +0900 Subject: [PATCH 4/8] fixed flake8 errors --- chainer/datasets/__init__.py | 2 +- chainer/datasets/svhn.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/chainer/datasets/__init__.py b/chainer/datasets/__init__.py index a43d90c376ae..f7b5b13e981d 100644 --- a/chainer/datasets/__init__.py +++ b/chainer/datasets/__init__.py @@ -2,7 +2,7 @@ from chainer.datasets import dict_dataset # NOQA from chainer.datasets import image_dataset # NOQA from chainer.datasets import mnist # NOQA -from chainer.datasets import svhn # NOQA +from chainer.datasets import svhn # NOQA from chainer.datasets import ptb # NOQA from chainer.datasets import 
sub_dataset # NOQA from chainer.datasets import transform_dataset # NOQA diff --git a/chainer/datasets/svhn.py b/chainer/datasets/svhn.py index d87c99eaa32d..bd1024acc0fd 100644 --- a/chainer/datasets/svhn.py +++ b/chainer/datasets/svhn.py @@ -6,11 +6,14 @@ from chainer.dataset import download from chainer.datasets import tuple_dataset -def get_svhn(withlabel=True, scale=1., dtype=numpy.float32, label_dtype=numpy.int32): + +def get_svhn(withlabel=True, scale=1., dtype=numpy.float32, + label_dtype=numpy.int32): """Gets the SVHN dataset. - `SVHN <http://ufldl.stanford.edu/housenumbers/>`_ is a dataset similar to MNIST but - composed of cropped images of house numbers. The functionality is identical to the MNIST dataset, + `SVHN <http://ufldl.stanford.edu/housenumbers/>`_ is a dataset + similar to MNIST but composed of cropped images of house numbers. + The functionality is identical to the MNIST dataset, with the exception that there is no ``ndim`` argument. Args: @@ -30,32 +33,38 @@ def get_svhn(withlabel=True, scale=1., dtype=numpy.float32, label_dtype=numpy.in """ train_raw = _retrieve_svhn_training() train = _preprocess_svhn(train_raw, withlabel, scale, dtype, - label_dtype) + label_dtype) test_raw = _retrieve_svhn_test() test = _preprocess_svhn(test_raw, withlabel, scale, dtype, - label_dtype) + label_dtype) return train, test + def _preprocess_svhn(raw, withlabel, scale, image_dtype, label_dtype): - images = raw["x"].transpose(3,2,0,1) + images = raw["x"].transpose(3, 2, 0, 1) images = images.astype(image_dtype) images *= scale / 255. labels = raw["y"].astype(label_dtype).flatten() # labels go from 1-10, with the digit "0" having label 10. 
# Set "0" to be label 0 to restore expected ordering - labels[labels==10] = 0 + labels[labels == 10] = 0 if withlabel: return tuple_dataset.TupleDataset(images, labels) else: return images + def _retrieve_svhn_training(): - return _retrieve_svhn("train.npz", "http://ufldl.stanford.edu/housenumbers/train_32x32.mat") + url = "http://ufldl.stanford.edu/housenumbers/train_32x32.mat" + return _retrieve_svhn("train.npz", url) + def _retrieve_svhn_test(): - return _retrieve_svhn("test.npz", "http://ufldl.stanford.edu/housenumbers/test_32x32.mat") + url = "http://ufldl.stanford.edu/housenumbers/test_32x32.mat" + return _retrieve_svhn("test.npz", url) + def _retrieve_svhn(name, url): root = download.get_dataset_directory('pfnet/chainer/svhn') @@ -63,6 +72,7 @@ def _retrieve_svhn(name, url): return download.cache_or_load_file( path, lambda path: _make_npz(path, url), numpy.load) + def _make_npz(path, url): _path = download.cached_download(url) raw = io.loadmat(_path) From b8e0f3845871124fef438580b824fe99f52c8eea Mon Sep 17 00:00:00 2001 From: rcalland Date: Wed, 12 Jul 2017 11:24:35 +0900 Subject: [PATCH 5/8] added get_svhn to __init__.py --- chainer/datasets/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/chainer/datasets/__init__.py b/chainer/datasets/__init__.py index f7b5b13e981d..a75b376dcecf 100644 --- a/chainer/datasets/__init__.py +++ b/chainer/datasets/__init__.py @@ -17,6 +17,7 @@ from chainer.datasets.image_dataset import ImageDataset # NOQA from chainer.datasets.image_dataset import LabeledImageDataset # NOQA from chainer.datasets.mnist import get_mnist # NOQA +from chainer.datasets.svhn import get_svhn # NOQA from chainer.datasets.ptb import get_ptb_words # NOQA from chainer.datasets.ptb import get_ptb_words_vocabulary # NOQA from chainer.datasets.sub_dataset import get_cross_validation_datasets # NOQA From 25410270f2659526e8e5ac72da53867d48a74c0c Mon Sep 17 00:00:00 2001 From: rcalland Date: Thu, 13 Jul 2017 17:43:30 +0900 Subject: [PATCH 6/8] fix 
alphabetical order of imports --- chainer/datasets/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chainer/datasets/__init__.py b/chainer/datasets/__init__.py index a75b376dcecf..292f991386a1 100644 --- a/chainer/datasets/__init__.py +++ b/chainer/datasets/__init__.py @@ -2,9 +2,9 @@ from chainer.datasets import dict_dataset # NOQA from chainer.datasets import image_dataset # NOQA from chainer.datasets import mnist # NOQA -from chainer.datasets import svhn # NOQA from chainer.datasets import ptb # NOQA from chainer.datasets import sub_dataset # NOQA +from chainer.datasets import svhn # NOQA from chainer.datasets import transform_dataset # NOQA from chainer.datasets import tuple_dataset # NOQA @@ -17,7 +17,6 @@ from chainer.datasets.image_dataset import ImageDataset # NOQA from chainer.datasets.image_dataset import LabeledImageDataset # NOQA from chainer.datasets.mnist import get_mnist # NOQA -from chainer.datasets.svhn import get_svhn # NOQA from chainer.datasets.ptb import get_ptb_words # NOQA from chainer.datasets.ptb import get_ptb_words_vocabulary # NOQA from chainer.datasets.sub_dataset import get_cross_validation_datasets # NOQA @@ -27,5 +26,6 @@ from chainer.datasets.sub_dataset import split_dataset_n_random # NOQA from chainer.datasets.sub_dataset import split_dataset_random # NOQA from chainer.datasets.sub_dataset import SubDataset # NOQA +from chainer.datasets.svhn import get_svhn # NOQA from chainer.datasets.transform_dataset import TransformDataset # NOQA from chainer.datasets.tuple_dataset import TupleDataset # NOQA From 31fc9de93010e3e9048056896df4262b2cb124a1 Mon Sep 17 00:00:00 2001 From: rcalland Date: Mon, 31 Jul 2017 13:04:14 +0900 Subject: [PATCH 7/8] add check for scipy availability --- chainer/datasets/svhn.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/chainer/datasets/svhn.py b/chainer/datasets/svhn.py index bd1024acc0fd..55c73f22043b 100644 --- a/chainer/datasets/svhn.py +++ 
b/chainer/datasets/svhn.py @@ -1,7 +1,11 @@ import os import numpy -from scipy import io +try: + from scipy import io + _scipy_available = True +except ImportError: + _scipy_available = False from chainer.dataset import download from chainer.datasets import tuple_dataset @@ -31,6 +35,9 @@ def get_svhn(withlabel=True, scale=1., dtype=numpy.float32, datasets are arrays of images. """ + if not _scipy_available: + raise RuntimeError('scipy is not available') + train_raw = _retrieve_svhn_training() train = _preprocess_svhn(train_raw, withlabel, scale, dtype, label_dtype) From f46bacf20a2216af2a97b4619b206e91506ab1d8 Mon Sep 17 00:00:00 2001 From: rcalland Date: Mon, 31 Jul 2017 13:46:23 +0900 Subject: [PATCH 8/8] fix alignment issue --- chainer/datasets/svhn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chainer/datasets/svhn.py b/chainer/datasets/svhn.py index 55c73f22043b..d27c34e009e8 100644 --- a/chainer/datasets/svhn.py +++ b/chainer/datasets/svhn.py @@ -36,7 +36,7 @@ def get_svhn(withlabel=True, scale=1., dtype=numpy.float32, """ if not _scipy_available: - raise RuntimeError('scipy is not available') + raise RuntimeError('scipy is not available') train_raw = _retrieve_svhn_training() train = _preprocess_svhn(train_raw, withlabel, scale, dtype,