Skip to content

Commit

Permalink
Merge pull request #309 from corochann/standard_scaler_link
Browse files Browse the repository at this point in the history
Standard scaler link
  • Loading branch information
mottodora committed Jan 24, 2019
2 parents f44268e + 309fd55 commit e0fd17e
Show file tree
Hide file tree
Showing 5 changed files with 273 additions and 0 deletions.
2 changes: 2 additions & 0 deletions chainer_chemistry/links/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from chainer_chemistry.links.readout.nfp_readout import NFPReadout # NOQA
from chainer_chemistry.links.readout.schnet_readout import SchNetReadout # NOQA

from chainer_chemistry.links.scaler.standard_scaler import StandardScaler # NOQA

from chainer_chemistry.links.update.ggnn_update import GGNNUpdate # NOQA
from chainer_chemistry.links.update.nfp_update import NFPUpdate # NOQA
from chainer_chemistry.links.update.relgat_update import RelGATUpdate # NOQA
Expand Down
Empty file.
39 changes: 39 additions & 0 deletions chainer_chemistry/links/scaler/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import chainer


def to_array(x):
    """Return the raw array behind `x`.

    A `chainer.Variable` is unwrapped to its `.data` attribute; any other
    input is returned unchanged.
    """
    return x.data if isinstance(x, chainer.Variable) else x


class BaseScaler(chainer.Link):
    """Abstract interface shared by scaler links.

    Inputs `x` may be an array or a `chainer.Variable`. Subclasses are
    expected to override `fit`, `transform` and `inverse_transform`.
    """

    def fit(self, x, **kwargs):
        """Estimate the scaling parameters from `x`.

        Implementations should return `self` once the parameters are fitted,
        so that calls can be chained.
        """
        raise NotImplementedError

    def transform(self, x, **kwargs):
        """Scale `x` with the fitted parameters.

        `fit` should be called before this method.
        """
        raise NotImplementedError

    def inverse_transform(self, x, **kwargs):
        """Undo `transform` on `x`.

        `fit` should be called before this method.
        """
        raise NotImplementedError

    def fit_transform(self, x, **kwargs):
        """Fit on `x`, then return the transformed `x`.

        Keyword arguments are forwarded to `fit` only.
        """
        fitted = self.fit(x, **kwargs)
        return fitted.transform(x)
99 changes: 99 additions & 0 deletions chainer_chemistry/links/scaler/standard_scaler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from logging import getLogger

import numpy
from chainer import cuda

from chainer_chemistry.links.scaler.base import BaseScaler, to_array # NOQA


def format_x(x):
    """Return `x` in 2-dim (batchsize, feature) layout.

    `x` may be an array or a Variable. A 1-dim input is interpreted as a
    single feature observed over several samples and gets a trailing axis.

    Raises:
        ValueError: if `x` is neither 1-dim nor 2-dim.
    """
    ndim = x.ndim
    if ndim == 2:
        return x
    if ndim == 1:
        # One feature, several samples: (batchsize,) -> (batchsize, 1).
        return x[:, None]
    raise ValueError(
        "Unexpected value x.shape={}, only x.ndim=2 is supported."
        .format(x.shape))


class StandardScaler(BaseScaler):
    """Scaler link that standardizes features by their mean and std.

    `fit` stores the per-feature mean and standard deviation. `transform`
    computes `(x - mean) / std` and `inverse_transform` computes
    `x * std + mean`. Features whose fitted std is 0 are divided/multiplied
    by 1 instead, and features not listed in `indices` pass through
    unchanged (mean 0, std 1).

    Attributes:
        indices: feature indices that are scaled, or None for all features.
        mean: fitted mean of the selected features, or None before `fit`.
        std: fitted standard deviation of the selected features, or None
            before `fit`.
    """

    def __init__(self):
        super(StandardScaler, self).__init__()
        # Registered as persistent values of the link so that they are
        # handled together with the link (e.g. by serializers).
        self.indices = None
        self.register_persistent('indices')
        self.mean = None
        self.register_persistent('mean')
        self.std = None
        self.register_persistent('std')

    def fit(self, x, indices=None):
        """Fitting parameter.

        Args:
            x (array or Variable): data of shape (batchsize, feature), or
                (batchsize,) which is treated as a single feature.
            indices (list or tuple or None):
                indices for applying standard scaling. None means every
                feature is scaled.

        Returns:
            self (StandardScaler): this instance.

        Raises:
            NotImplementedError: if the link is on GPU and `x` contains nan.
        """
        x = to_array(x)
        x = format_x(x)

        xp = self.xp
        # Statistics are computed with the link's array module, so bring the
        # data onto the link's device first instead of failing on a
        # CPU/GPU mismatch.
        if xp is numpy:
            x = cuda.to_cpu(x)
        else:
            x = cuda.to_gpu(x)

        if isinstance(indices, (list, tuple, numpy.ndarray)):
            # Keep indices on the same device as mean/std: they are indexed
            # with a mask derived from `self.std` in
            # `_compute_mean_std_all`.
            indices = xp.asarray(indices)
        self.indices = indices
        if self.indices is not None:
            x = x[:, self.indices]

        if xp is numpy:
            self.mean = xp.nanmean(x, axis=0)
            self.std = xp.nanstd(x, axis=0)
        else:
            if int(xp.sum(xp.isnan(x))) > 0:
                raise NotImplementedError(
                    "StandardScaling with nan value on GPU is not supported.")
            # cupy.nanmean, cupy.nanstd is not implemented yet.
            self.mean = xp.mean(x, axis=0)
            self.std = xp.std(x, axis=0)

        # result consistency check
        if xp.sum(self.std == 0) > 0:
            logger = getLogger(__name__)
            ind = numpy.argwhere(cuda.to_cpu(self.std) == 0)[:, 0]
            logger.warning('fit: std was 0 at indices {}'.format(ind))
        return self

    def _check_fitted(self):
        """Raise AttributeError when `fit` has not been called yet."""
        if self.mean is None:
            raise AttributeError('[Error] mean is None, call fit beforehand!')

    def _compute_mean_std_all(self, input_dim):
        """Expand fitted statistics to vectors covering all `input_dim` features.

        Returns:
            tuple: `(mean_all, std_all)`. Entries with a fitted std of 0 get
            std 1; when `indices` is set, features outside `indices` get
            mean 0 and std 1.
        """
        non_zero = self.std != 0
        std_all = self.xp.ones(input_dim, dtype=self.xp.float32)
        if self.indices is None:
            std_all[non_zero] = self.std[non_zero]
            return self.mean, std_all

        mean_all = self.xp.zeros(input_dim, dtype=self.xp.float32)
        mean_all[self.indices] = self.mean
        std_all[self.indices[non_zero]] = self.std[non_zero]
        return mean_all, std_all

    def transform(self, x):
        """Standardize `x` with the fitted mean and std.

        Args:
            x (array or Variable): data of shape (batchsize, feature) or
                (batchsize,).

        Returns:
            Scaled data in 2-dim (batchsize, feature) layout.

        Raises:
            AttributeError: if `fit` has not been called.
        """
        self._check_fitted()
        x = format_x(x)
        mean_all, std_all = self._compute_mean_std_all(x.shape[1])
        return (x - mean_all[None, :]) / std_all[None, :]

    def inverse_transform(self, x):
        """Map standardized `x` back to the original scale.

        Args:
            x (array or Variable): scaled data of shape (batchsize, feature)
                or (batchsize,).

        Returns:
            Data in 2-dim (batchsize, feature) layout on the original scale.

        Raises:
            AttributeError: if `fit` has not been called.
        """
        self._check_fitted()
        x = format_x(x)
        mean_all, std_all = self._compute_mean_std_all(x.shape[1])
        return x * std_all[None, :] + mean_all[None, :]
133 changes: 133 additions & 0 deletions tests/links_tests/scaler_tests/test_standard_scaler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import os

import numpy
import pytest
from chainer import serializers, Variable, cuda

from chainer_chemistry.links.scaler.standard_scaler import StandardScaler


@pytest.fixture
def data():
    """Provide a (4, 3) float32 input and its expected standardized values."""
    raw = numpy.array(
        [[0.1, 10., 0.3],
         [0.2, 20., 0.1],
         [0.3, 30., 0.],
         [0.4, 40., 0.]],
        dtype=numpy.float32)
    standardized = numpy.array(
        [[-1.3416407, -1.3416408, 1.6329931],
         [-0.44721353, -0.4472136, 0.],
         [0.44721368, 0.4472136, -0.8164965],
         [1.3416407, 1.3416408, -0.8164965]],
        dtype=numpy.float32)
    return raw, standardized


@pytest.mark.parametrize('indices', [None, [0], [1, 2]])
def test_standard_scaler_transform(data, indices):
    """Selected columns are standardized; the others pass through unchanged."""
    x, expect_x_scaled = data
    scaler = StandardScaler()
    scaler.fit(x, indices=indices)
    x_scaled = scaler.transform(x)

    n_features = x.shape[1]
    # indices=None means every column is scaled.
    scaled_columns = range(n_features) if indices is None else indices
    for col in range(n_features):
        reference = expect_x_scaled if col in scaled_columns else x
        assert numpy.allclose(x_scaled[:, col], reference[:, col])


def test_standard_scaler_transform_variable(data):
    """Fitting and transforming a Variable yields a Variable result."""
    raw, expected = data
    variable_input = Variable(raw)
    scaler = StandardScaler()
    scaler.fit(variable_input)
    result = scaler.transform(variable_input)

    assert isinstance(result, Variable)
    assert numpy.allclose(result.array, expected)


@pytest.mark.gpu
def test_standard_scaler_transform_gpu(data):
    """A GPU link fitted on a GPU array returns a cupy array."""
    raw, expected = data
    scaler = StandardScaler()
    scaler.to_gpu()
    raw_gpu = cuda.to_gpu(raw)
    scaler.fit(raw_gpu)
    result = scaler.transform(raw_gpu)

    assert isinstance(result, cuda.cupy.ndarray)
    assert numpy.allclose(cuda.to_cpu(result), expected)


@pytest.mark.parametrize('indices', [None, [0], [1, 2]])
def test_standard_scaler_inverse_transform(data, indices):
    """Selected columns are restored; the others pass through unchanged."""
    x, expect_x_scaled = data
    scaler = StandardScaler()
    scaler.fit(x, indices=indices)
    x_inverse = scaler.inverse_transform(expect_x_scaled)

    n_features = x.shape[1]
    # indices=None means every column is scaled.
    scaled_columns = range(n_features) if indices is None else indices
    for col in range(n_features):
        reference = x if col in scaled_columns else expect_x_scaled
        assert numpy.allclose(x_inverse[:, col], reference[:, col])


def test_standard_scaler_fit_transform(data):
    """`fit_transform` gives the same result as fitting then transforming."""
    raw, expected = data
    result = StandardScaler().fit_transform(raw)
    assert numpy.allclose(result, expected)


@pytest.mark.parametrize('indices', [None, [0]])
def test_standard_scaler_serialize(tmpdir, data, indices):
    """Fitted parameters survive a save_npz / load_npz round trip."""
    x, _ = data
    original = StandardScaler()
    original.fit(x, indices=indices)

    scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz')
    serializers.save_npz(scaler_filepath, original)

    restored = StandardScaler()
    serializers.load_npz(scaler_filepath, restored)

    for attr in ('mean', 'std'):
        assert numpy.allclose(getattr(original, attr),
                              getattr(restored, attr))
    # NOTE: a plain `==` is only usable here because the parametrized
    # indices are either None or hold a single element.
    assert original.indices == restored.indices


def test_standard_scaler_assert_raises():
    """Using the scaler before `fit` raises AttributeError."""
    x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]],
                    dtype=numpy.float32)
    scaler = StandardScaler()

    # Both directions require fitted parameters.
    for method in (scaler.transform, scaler.inverse_transform):
        with pytest.raises(AttributeError):
            method(x)


def test_standard_scaler_transform_zero_std():
    """Constant features (std == 0) are scaled to zero."""
    constant = numpy.array([[1, 2]] * 3, dtype=numpy.float32)
    expected = numpy.zeros((3, 2), dtype=numpy.float32)
    scaler = StandardScaler()
    scaler.fit(constant)
    result = scaler.transform(constant)
    assert numpy.allclose(result, expected)


if __name__ == '__main__':
    # Run this test module directly: verbose output, no output capturing.
    pytest_args = [__file__, '-v', '-s']
    pytest.main(pytest_args)

0 comments on commit e0fd17e

Please sign in to comment.