Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standard scaler link #309

Merged
merged 5 commits into from
Jan 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions chainer_chemistry/links/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from chainer_chemistry.links.readout.nfp_readout import NFPReadout # NOQA
from chainer_chemistry.links.readout.schnet_readout import SchNetReadout # NOQA

from chainer_chemistry.links.scaler.standard_scaler import StandardScaler # NOQA

from chainer_chemistry.links.update.ggnn_update import GGNNUpdate # NOQA
from chainer_chemistry.links.update.nfp_update import NFPUpdate # NOQA
from chainer_chemistry.links.update.relgat_update import RelGATUpdate # NOQA
Expand Down
Empty file.
39 changes: 39 additions & 0 deletions chainer_chemistry/links/scaler/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import chainer


def to_array(x):
    """Return the underlying array of `x`.

    A `chainer.Variable` is unwrapped to its `data` attribute (a
    numpy.ndarray or cupy.ndarray); any other object is returned as is.
    """
    return x.data if isinstance(x, chainer.Variable) else x


class BaseScaler(chainer.Link):
    """Common interface for scaler links.

    Subclasses are expected to override `fit`, `transform` and
    `inverse_transform`. The input `x` may be an array or a Variable.
    """

    def fit(self, x, **kwargs):
        """Estimate the scaling parameters from the input `x`.

        Implementations should return `self` once the parameters are fitted.
        """
        raise NotImplementedError()

    def transform(self, x, **kwargs):
        """Scale the input `x` with the fitted parameters.

        Call this only after `fit` has been called.
        """
        raise NotImplementedError()

    def inverse_transform(self, x, **kwargs):
        """Undo `transform` on the input `x`.

        Call this only after `fit` has been called.
        """
        raise NotImplementedError()

    def fit_transform(self, x, **kwargs):
        """Fit on `x` (forwarding `kwargs` to `fit`), then transform `x`."""
        fitted = self.fit(x, **kwargs)
        return fitted.transform(x)
99 changes: 99 additions & 0 deletions chainer_chemistry/links/scaler/standard_scaler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from logging import getLogger

import numpy
from chainer import cuda

from chainer_chemistry.links.scaler.base import BaseScaler, to_array # NOQA


def format_x(x):
    """Format `x` as a 2-dim `(batchsize, feature)` input.

    Args:
        x: array or Variable whose `ndim` is 1 or 2.

    Returns:
        `x` unchanged when it is 2-dim; when it is 1-dim, `x[:, None]`,
        i.e. shape `(n, 1)`.

    Raises:
        ValueError: if `x.ndim` is neither 1 nor 2.
    """
    if x.ndim == 1:
        # Deal with as 1 feature with several samples.
        x = x[:, None]
    elif x.ndim != 2:
        # 1-dim input is handled above, so the message names both
        # supported ranks.
        raise ValueError(
            "Unexpected value x.shape={}, only x.ndim=1 or 2 is supported."
            .format(x.shape))
    return x


class StandardScaler(BaseScaler):
    """Scaler that subtracts the mean and divides by the standard deviation.

    `indices`, `mean` and `std` are registered as persistent values and
    remain `None` until `fit` is called.
    """

    def __init__(self):
        super(StandardScaler, self).__init__()
        # Same registration order as the attributes are listed here.
        for name in ('indices', 'mean', 'std'):
            setattr(self, name, None)
            self.register_persistent(name)

    def fit(self, x, indices=None):
        """Compute the per-feature mean and standard deviation of `x`.

        Args:
            x: array or Variable, 1-dim or 2-dim (see `format_x`).
            indices (list or tuple or None):
                indices for applying standard scaling. `None` uses every
                feature column.

        Returns:
            self (StandardScaler): this instance.

        Raises:
            NotImplementedError: if `self.xp` is not numpy and `x`
                contains nan.
        """
        x = format_x(to_array(x))

        if isinstance(indices, (list, tuple)):
            indices = numpy.asarray(indices)
        self.indices = indices
        if self.indices is not None:
            x = x[:, self.indices]

        xp = self.xp
        on_cpu = xp is numpy
        if not on_cpu and int(xp.sum(xp.isnan(x))) > 0:
            # cupy.nanmean, cupy.nanstd is not implemented yet.
            raise NotImplementedError(
                "StandardScaling with nan value on GPU is not supported.")
        if on_cpu:
            mean_fn, std_fn = xp.nanmean, xp.nanstd
        else:
            mean_fn, std_fn = xp.mean, xp.std
        self.mean = mean_fn(x, axis=0)
        self.std = std_fn(x, axis=0)

        # result consistency check
        if xp.sum(self.std == 0) > 0:
            ind = numpy.argwhere(cuda.to_cpu(self.std) == 0)[:, 0]
            getLogger(__name__).warning(
                'fit: std was 0 at indices {}'.format(ind))
        return self

    def _compute_mean_std_all(self, input_dim):
        """Build mean/std vectors covering all `input_dim` features.

        Entries whose fitted std is 0 keep std 1. When `indices` is set,
        features outside `indices` get mean 0 and std 1.
        """
        xp = self.xp
        nonzero = self.std != 0
        std_all = xp.ones(input_dim, dtype=xp.float32)
        if self.indices is None:
            std_all[nonzero] = self.std[nonzero]
            return self.mean, std_all

        mean_all = xp.zeros(input_dim, dtype=xp.float32)
        mean_all[self.indices] = self.mean
        std_all[self.indices[nonzero]] = self.std[nonzero]
        return mean_all, std_all

    def _format_with_stats(self, x):
        """Check `fit` was called; return formatted `x`, mean row, std row."""
        if self.mean is None:
            raise AttributeError('[Error] mean is None, call fit beforehand!')
        x = format_x(x)
        mean_all, std_all = self._compute_mean_std_all(x.shape[1])
        return x, mean_all[None, :], std_all[None, :]

    def transform(self, x):
        """Return `(x - mean) / std` using the fitted statistics.

        Raises:
            AttributeError: if `fit` has not been called yet.
        """
        x, mean_row, std_row = self._format_with_stats(x)
        return (x - mean_row) / std_row

    def inverse_transform(self, x):
        """Return `x * std + mean` using the fitted statistics.

        Raises:
            AttributeError: if `fit` has not been called yet.
        """
        x, mean_row, std_row = self._format_with_stats(x)
        return x * std_row + mean_row
133 changes: 133 additions & 0 deletions tests/links_tests/scaler_tests/test_standard_scaler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import os

import numpy
import pytest
from chainer import serializers, Variable, cuda

from chainer_chemistry.links.scaler.standard_scaler import StandardScaler


@pytest.fixture
def data():
    """Provide a float32 input matrix and the scaled values tests expect."""
    raw_rows = [
        [0.1, 10., 0.3],
        [0.2, 20., 0.1],
        [0.3, 30., 0.],
        [0.4, 40., 0.],
    ]
    scaled_rows = [
        [-1.3416407, -1.3416408, 1.6329931],
        [-0.44721353, -0.4472136, 0.],
        [0.44721368, 0.4472136, -0.8164965],
        [1.3416407, 1.3416408, -0.8164965],
    ]
    x = numpy.array(raw_rows, dtype=numpy.float32)
    expect_x_scaled = numpy.array(scaled_rows, dtype=numpy.float32)
    return x, expect_x_scaled


@pytest.mark.parametrize('indices', [None, [0], [1, 2]])
def test_standard_scaler_transform(data, indices):
    """Columns in `indices` are scaled; all other columns are unchanged."""
    x, expect_x_scaled = data
    scaler = StandardScaler()
    scaler.fit(x, indices=indices)
    x_scaled = scaler.transform(x)

    n_features = x.shape[1]
    # `indices=None` means every column is scaled.
    scaled_columns = numpy.arange(n_features) if indices is None else indices
    for col in range(n_features):
        reference = expect_x_scaled if col in scaled_columns else x
        assert numpy.allclose(x_scaled[:, col], reference[:, col])


def test_standard_scaler_transform_variable(data):
    """A Variable input yields a Variable output with the expected values."""
    x, expect_x_scaled = data
    xvar = Variable(x)
    # `fit` returns the scaler itself, so the calls can be chained.
    x_scaled = StandardScaler().fit(xvar).transform(xvar)

    assert isinstance(x_scaled, Variable)
    assert numpy.allclose(x_scaled.array, expect_x_scaled)


@pytest.mark.gpu
def test_standard_scaler_transform_gpu(data):
    """Fitting and transforming a GPU array returns a cupy array."""
    x_cpu, expect_x_scaled = data
    scaler = StandardScaler()
    scaler.to_gpu()
    x_gpu = cuda.to_gpu(x_cpu)
    scaler.fit(x_gpu)
    x_scaled = scaler.transform(x_gpu)

    assert isinstance(x_scaled, cuda.cupy.ndarray)
    # Bring the result back to host memory before comparing with numpy.
    assert numpy.allclose(cuda.to_cpu(x_scaled), expect_x_scaled)


@pytest.mark.parametrize('indices', [None, [0], [1, 2]])
def test_standard_scaler_inverse_transform(data, indices):
    """Columns in `indices` are restored; other columns pass through."""
    x, expect_x_scaled = data
    scaler = StandardScaler()
    scaler.fit(x, indices=indices)
    x_inverse = scaler.inverse_transform(expect_x_scaled)

    n_features = x.shape[1]
    # `indices=None` means every column is scaled.
    scaled_columns = numpy.arange(n_features) if indices is None else indices
    for col in range(n_features):
        reference = x if col in scaled_columns else expect_x_scaled
        assert numpy.allclose(x_inverse[:, col], reference[:, col])


def test_standard_scaler_fit_transform(data):
    """`fit_transform` gives the expected scaled values in a single call."""
    x, expect_x_scaled = data
    assert numpy.allclose(StandardScaler().fit_transform(x), expect_x_scaled)


@pytest.mark.parametrize('indices', [None, [0]])
def test_standard_scaler_serialize(tmpdir, data, indices):
    """Fitted `mean`, `std` and `indices` survive an npz save/load cycle."""
    x, _ = data
    scaler = StandardScaler()
    scaler.fit(x, indices=indices)

    scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz')
    serializers.save_npz(scaler_filepath, scaler)

    scaler2 = StandardScaler()
    serializers.load_npz(scaler_filepath, scaler2)

    assert numpy.allclose(scaler.mean, scaler2.mean)
    assert numpy.allclose(scaler.std, scaler2.std)
    # `==` on arrays is element-wise and has no single truth value once
    # `indices` holds more than one element; array_equal always returns a
    # bool and also accepts the `indices=None` case.
    assert numpy.array_equal(scaler.indices, scaler2.indices)


def test_standard_scaler_assert_raises():
    """Calling either transform method before `fit` raises AttributeError."""
    x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]],
                    dtype=numpy.float32)
    scaler = StandardScaler()

    # Same order as before: transform first, then inverse_transform.
    for unfitted_call in (scaler.transform, scaler.inverse_transform):
        with pytest.raises(AttributeError):
            unfitted_call(x)


def test_standard_scaler_transform_zero_std():
    """Constant columns (std == 0) are transformed to all zeros."""
    x = numpy.array([[1, 2]] * 3, dtype=numpy.float32)
    expect_x_scaled = numpy.zeros((3, 2), dtype=numpy.float32)
    x_scaled = StandardScaler().fit(x).transform(x)
    assert numpy.allclose(x_scaled, expect_x_scaled)


if __name__ == '__main__':
    # Run this test module directly through pytest with the same flags.
    pytest_args = [__file__, '-v', '-s']
    pytest.main(pytest_args)