# Dataset module introduction

In [None]:
# Initial setup following http://docs.chainer.org/en/stable/tutorial/basic.html
import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
import chainer.dataset
import chainer.datasets

## Built-in dataset modules

Several dataset formats are already implemented in `chainer.datasets`.

### TupleDataset



In [5]:
from chainer.datasets import TupleDataset

# Inputs are the integers 0..9; each target is the square of its input.
x = np.arange(10)
t = x * x

# Bundle the two arrays so that data[i] yields the pair (x[i], t[i]).
data = TupleDataset(x, t)

print('data type: {}, len: {}'.format(type(data), len(data)))

data type: <class 'chainer.datasets.tuple_dataset.TupleDataset'>, len: 10


In [6]:
# Unlike a numpy array, a TupleDataset has no `shape` attribute.
# The AttributeError is the point of this cell, so it is caught and printed;
# an uncaught exception would stop "Restart & Run All" at this cell.
try:
    data.shape
except AttributeError as err:
    print('AttributeError: {}'.format(err))

AttributeError: 'TupleDataset' object has no attribute 'shape'

The `i`-th example can be accessed by `data[i]`,

which is a tuple of the form ($x_i$, $t_i$, ...).

In [10]:
# Get the fourth example (index 3) -> x=3, t=9
data[3]

(3, 9)

Slice accessing

When a TupleDataset is accessed by slice indexing, e.g. `data[i:j]`, the returned value is a __list of tuples__
$[(x_i, t_i), ..., (x_{j-1}, t_{j-1})]$

In [19]:
# Get the first four examples (indices 0 to 3) at the same time.
examples = data[0:4]

print(examples)
print('examples type: {}, len: {}'
      .format(type(examples), len(examples)))

[(0, 0), (1, 1), (2, 4), (3, 9)]
examples type: <class 'list'>, len: 4


To convert examples into minibatch format, you can use the `concat_examples` function in `chainer.dataset`.

Its return value has the format `([x_array], [t_array], ...)`.

In [24]:
from chainer.dataset import concat_examples

# Stack the list of (x_i, t_i) tuples into one array per field.
data_minibatch = concat_examples(examples)

# Uncomment to inspect the raw return value before it is unpacked:
#print(data_minibatch)
#print('data_minibatch type: {}, len: {}'
#      .format(type(data_minibatch), len(data_minibatch)))

x_minibatch, t_minibatch = data_minibatch
# Each element is now an array, so it has a `shape` attribute.
print('x_minibatch = {}, type: {}, shape: {}'.format(x_minibatch, type(x_minibatch), x_minibatch.shape))
print('t_minibatch = {}, type: {}, shape: {}'.format(t_minibatch, type(t_minibatch), t_minibatch.shape))

x_minibatch = [0 1 2 3], type: <class 'numpy.ndarray'>, shape: (4,)
t_minibatch = [0 1 4 9], type: <class 'numpy.ndarray'>, shape: (4,)


### DictDataset

TBD

The important point is that the `get_example` function is called every time the data is accessed by `[]` indexing.

Thus you may put random value generation for data augmentation in `get_example`.

### ImageDataset

TBD

Let's see a concrete example that creates a new dataset from an original tuple dataset by adding a small amount of noise.

### LabeledImageDataset

TBD

To convert examples into minibatch format, you can use the `concat_examples` function in `chainer.dataset` in the same way as explained for TupleDataset.

### SubDataset

TBD

It can be used for cross validation.

In [4]:
# split_dataset_n_random needs the dataset to split and the number of splits;
# calling it with no arguments raises a TypeError.
# Randomly split the 10-example `data` into 5 equally sized sub-datasets,
# which can serve as folds for cross validation.
folds = datasets.split_dataset_n_random(data, 5)
print('number of folds: {}, fold size: {}'.format(len(folds), len(folds[0])))

## Implement your own custom dataset

You can define your own dataset by implementing a subclass of `DatasetMixin` in `chainer.dataset`.

## DatasetMixin

If you want to define a custom dataset, `DatasetMixin` provides the base functionality that makes it compatible with the other dataset formats.

Another important use of `DatasetMixin` is to __preprocess__ the input data, including __data augmentation__.

To implement a subclass of `DatasetMixin`, you usually need to implement these 3 functions.
 - Override `__init__(self, *args)` function: It is not compulsory, but it is the usual place to receive and store the underlying data.
 - Override `__len__(self)` function        : The iterator needs to know the length of the dataset to detect the end of an epoch.
 - Override `get_example(self, i)` function : Returns the `i`-th example; it is called when the dataset is accessed by `data[i]`.

In [8]:
from chainer.dataset import DatasetMixin


# Module-level switch: when true, get_example() prints the index it was asked for.
print_debug = True
class SimpleDataset(DatasetMixin):
    """Minimal custom dataset that serves the items of an indexable `values` object."""
    def __init__(self, values):
        # Keep a reference to the underlying values; no copy is made here.
        self.values = values
        
    def __len__(self):
        # The dataset length is the number of stored values.
        return len(self.values)

    def get_example(self, i):
        """Return `self.values[i]`, printing `i` first when `print_debug` is true."""
        if print_debug: 
            print('get_example, i = {}'.format(i))
        return self.values[i]

The important function in `DatasetMixin` is `get_example(self, i)`.
This function is called whenever the dataset is accessed by `data[i]`.


In [9]:
# Wrap a plain Python list of six values in the custom dataset.
simple_data = SimpleDataset([0, 1, 4, 9, 16, 25])

In [10]:
# Indexing with data[i] invokes get_example(self, i); while print_debug is true
# the call shows up in the printed output.
simple_data[3]

get_example, i = 3


9

In [11]:
# Slice indexing works as well: get_example is called once per index in the
# slice and the results are returned as a list.

simple_data[1:3]

get_example, i = 1
get_example, i = 2


[1, 4]

The important point is that the `get_example` function is called every time the data is accessed by `[]` indexing.

Thus you may put random value generation for data augmentation in `get_example`.

In [36]:
import numpy as np
from chainer.dataset import DatasetMixin

# Turn off the per-access debug print for the cells that follow.
# NOTE: this is the same module-level flag that SimpleDataset.get_example reads,
# so it silences that class as well.
print_debug = False


def calc(x):
    """Return the square of ``x`` (the noise-free target for input ``x``)."""
    squared = x * x
    return squared


class SquareNoiseDataset(DatasetMixin):
    """Dataset of ``(x, calc(x) + noise)`` pairs whose noise is redrawn on every access."""

    def __init__(self, values):
        self.values = values

    def __len__(self):
        return len(self.values)

    def get_example(self, i):
        """Return ``(x, noisy_target)`` for the ``i``-th stored value.

        The target is ``calc(x)`` plus a fresh sample from a normal
        distribution with mean 0 and standard deviation 0.1, so repeated
        accesses to the same index give different targets.
        """
        if print_debug:
            print('get_example, i = {}'.format(i))
        x = self.values[i]
        noisy_target = calc(x) + np.random.normal(0, 0.1)
        return x, noisy_target

In [37]:
# Inputs are the integers 0..9; the noisy targets are generated on access.
square_noise_data = SquareNoiseDataset(np.arange(10))

The `SquareNoiseDataset` above adds small Gaussian noise to the original value.
Every time a value is accessed, `get_example` is called and different noise is added, even if you access the data with the same index.

In [38]:
# Accessing the same index repeatedly gives a different value each time,
# because the noise is drawn again on every access.
print('Accessing square_noise_data[3]')
for label in ('1st: ', '2nd: ', '3rd: '):
    print(label, square_noise_data[3])

Accessing square_noise_data[3]
1st:  (3, 9.2819845958846461)
2nd:  (3, 8.8891164834607377)
3rd:  (3, 8.9558600783664595)


In [39]:
# The same holds for slice indexing: every element of the slice gets fresh
# noise each time the slice is taken.
print('Accessing square_noise_data[0:4]')
for label in ('1st: ', '2nd: ', '3rd: '):
    print(label, square_noise_data[0:4])

Accessing square_noise_data[0:4]
1st:  [(0, -0.11465820953532918), (1, 1.0652752646373151), (2, 3.8295759509448666), (3, 8.8347514838735872)]
2nd:  [(0, 0.094033330171766155), (1, 0.89304583140240168), (2, 3.9135077150785431), (3, 8.7598211647390194)]
3rd:  [(0, 0.094427684579741974), (1, 1.0638976835182377), (2, 3.86773676682447), (3, 9.1075182057573691)]


To convert examples into minibatch format, you can use the `concat_examples` function in `chainer.dataset` in the same way as explained for TupleDataset.

In [40]:
from chainer.dataset import concat_examples

# Draw four noisy examples, then stack them into one array per field.
examples = square_noise_data[0:4]
data_minibatch = concat_examples(examples)

x_minibatch, t_minibatch = data_minibatch
# Each element is now an array, so it has a `shape` attribute.
print('x_minibatch = {}, type: {}, shape: {}'.format(x_minibatch, type(x_minibatch), x_minibatch.shape))
print('t_minibatch = {}, type: {}, shape: {}'.format(t_minibatch, type(t_minibatch), t_minibatch.shape))

x_minibatch = [0 1 2 3], type: <class 'numpy.ndarray'>, shape: (4,)
t_minibatch = [ 0.03676176  1.0472368   4.04931546  8.98907645], type: <class 'numpy.ndarray'>, shape: (4,)


## TransformDataset

`TransformDataset` can be used to create a new dataset by modifying an existing dataset.
The new (modified) dataset is created by `TransformDataset(original_data, transform_function)`.



Let's see a concrete example that creates a new dataset from an original tuple dataset by adding a small amount of noise.

In [21]:
from chainer.datasets import TransformDataset

# Inputs are the integers 0..9; the noise-free target here is x*x - x.
x = np.arange(10)
t = x * x - x

# Noise-free (x, t) pairs that serve as the base dataset.
original_dataset = TupleDataset(x, t)

def transform_function(in_data):
    """Return the example with small Gaussian noise added to its target.

    Args:
        in_data: One example as a ``(x_i, t_i)`` pair.

    Returns:
        ``(x_i, t_i + noise)`` where ``noise`` is drawn from a normal
        distribution with mean 0 and standard deviation 0.1.
    """
    x_i, t_i = in_data
    noise = np.random.normal(0, 0.1)
    return x_i, t_i + noise

# Build the modified dataset by passing the original dataset and the transform
# to the constructor; assigning the bare class would only alias TransformDataset.
transformed_dataset = TransformDataset(original_dataset, transform_function)

ImportError: cannot import name 'TransformDataset'