Merge pull request #309 from corochann/standard_scaler_link

Standard scaler link
chainer · Jan 24, 2019 · e0fd17e · e0fd17e
2 parents f44268e + 309fd55
commit e0fd17e
Show file tree

Hide file tree

Showing 5 changed files with 273 additions and 0 deletions.
diff --git a/chainer_chemistry/links/__init__.py b/chainer_chemistry/links/__init__.py
@@ -8,6 +8,8 @@
 from chainer_chemistry.links.readout.nfp_readout import NFPReadout  # NOQA
 from chainer_chemistry.links.readout.schnet_readout import SchNetReadout  # NOQA
 
+from chainer_chemistry.links.scaler.standard_scaler import StandardScaler  # NOQA
+
 from chainer_chemistry.links.update.ggnn_update import GGNNUpdate  # NOQA
 from chainer_chemistry.links.update.nfp_update import NFPUpdate  # NOQA
 from chainer_chemistry.links.update.relgat_update import RelGATUpdate  # NOQA

diff --git a/chainer_chemistry/links/scaler/__init__.py b/chainer_chemistry/links/scaler/__init__.py
diff --git a/chainer_chemistry/links/scaler/base.py b/chainer_chemistry/links/scaler/base.py
@@ -0,0 +1,39 @@
+import chainer
+
+
+def to_array(x):
+    """Unwrap `x` to its underlying array.
+
+    A `chainer.Variable` is replaced by its `.data` attribute; any other
+    object is returned unchanged.
+    """
+    return x.data if isinstance(x, chainer.Variable) else x
+
+
+class BaseScaler(chainer.Link):
+    """Common interface for scaler links.
+
+    The input `x` of every method may be an array or a Variable.
+    Subclasses are expected to override `fit`, `transform` and
+    `inverse_transform`; the versions defined here only raise
+    `NotImplementedError`.
+    """
+
+    def fit(self, x, **kwargs):
+        """Estimate the scaling parameters from `x`.
+
+        Implementations should return `self` once fitting is done, because
+        `fit_transform` chains a call on the returned object.
+        """
+        raise NotImplementedError
+
+    def transform(self, x, **kwargs):
+        """Scale `x` with the fitted parameters.
+
+        Call this only after `fit` has been called.
+        """
+        raise NotImplementedError
+
+    def inverse_transform(self, x, **kwargs):
+        """Undo `transform` on `x`.
+
+        Call this only after `fit` has been called.
+        """
+        raise NotImplementedError
+
+    def fit_transform(self, x, **kwargs):
+        """Fit on `x`, then return `x` transformed by the fitted scaler.
+
+        Keyword arguments are forwarded to `fit` only; `transform` is
+        called with `x` alone.
+        """
+        fitted = self.fit(x, **kwargs)
+        return fitted.transform(x)
diff --git a/chainer_chemistry/links/scaler/standard_scaler.py b/chainer_chemistry/links/scaler/standard_scaler.py
@@ -0,0 +1,99 @@
+from logging import getLogger
+
+import numpy
+from chainer import cuda
+
+from chainer_chemistry.links.scaler.base import BaseScaler, to_array  # NOQA
+
+
+def format_x(x):
+    """Return `x` as a 2-dim (batchsize, feature) object.
+
+    `x` may be an array or a Variable. A 1-dim input is treated as several
+    samples of a single feature and gets a trailing axis of length 1; a
+    2-dim input is returned as is.
+
+    Raises:
+        ValueError: if `x` is neither 1-dim nor 2-dim.
+    """
+    if x.ndim == 1:
+        # One feature, several samples: (n,) -> (n, 1).
+        return x[:, None]
+    if x.ndim == 2:
+        return x
+    raise ValueError(
+        "Unexpected value x.shape={}, only x.ndim=2 is supported."
+        .format(x.shape))
+
+
+class StandardScaler(BaseScaler):
+    """Scaler that subtracts the per-feature mean and divides by the std.
+
+    `fit` computes `mean` and `std` along axis 0 of a (batchsize, feature)
+    input, optionally restricted to the feature columns listed in
+    `indices`. `indices`, `mean` and `std` are registered as persistent
+    values of the link.
+    """
+
+    def __init__(self):
+        super(StandardScaler, self).__init__()
+        # Every statistic starts unset (None) and is registered as a
+        # persistent value right after it is created.
+        for name in ('indices', 'mean', 'std'):
+            setattr(self, name, None)
+            self.register_persistent(name)
+
+    def fit(self, x, indices=None):
+        """Compute `mean` and `std` from `x`.
+
+        Args:
+            x: array or Variable, 1-dim or 2-dim (batchsize, feature).
+            indices (list or tuple or None):
+                feature columns to which standard scaling is applied.
+                A list or tuple is converted to `numpy.ndarray`; any other
+                value is stored unchanged. `None` selects every column.
+
+        Returns:
+            self (StandardScaler): this instance.
+
+        Raises:
+            NotImplementedError: if `self.xp` is not numpy and the
+                selected data contains nan.
+        """
+        values = format_x(to_array(x))
+
+        if isinstance(indices, (list, tuple)):
+            indices = numpy.asarray(indices)
+        self.indices = indices
+        if indices is not None:
+            values = values[:, indices]
+
+        xp = self.xp
+        if xp is numpy:
+            # nan entries are ignored when computing the statistics.
+            mean_func, std_func = xp.nanmean, xp.nanstd
+        else:
+            if int(xp.sum(xp.isnan(values))) > 0:
+                raise NotImplementedError(
+                    "StandardScaling with nan value on GPU is not supported.")
+            # cupy.nanmean, cupy.nanstd is not implemented yet.
+            mean_func, std_func = xp.mean, xp.std
+        self.mean = mean_func(values, axis=0)
+        self.std = std_func(values, axis=0)
+
+        # Result consistency check: warn about constant features.
+        zero_std = self.std == 0
+        if xp.sum(zero_std) > 0:
+            logger = getLogger(__name__)
+            ind = numpy.argwhere(cuda.to_cpu(zero_std))[:, 0]
+            logger.warning('fit: std was 0 at indices {}'.format(ind))
+        return self
+
+    def _check_fitted(self):
+        """Raise AttributeError when `fit` has not stored a mean yet."""
+        if self.mean is None:
+            raise AttributeError('[Error] mean is None, call fit beforehand!')
+
+    def _compute_mean_std_all(self, input_dim):
+        """Build mean and std vectors covering all `input_dim` features.
+
+        Features that were not selected by `indices` get mean 0 and std 1,
+        and a fitted std of exactly 0 is replaced by 1, so scaling leaves
+        those columns undivided.
+
+        Returns:
+            tuple: `(mean_all, std_all)`. When `indices` is None the
+            fitted `self.mean` itself is returned as `mean_all`.
+        """
+        xp = self.xp
+        nonzero = self.std != 0
+        std_all = xp.ones(input_dim, dtype=xp.float32)
+        if self.indices is None:
+            std_all[nonzero] = self.std[nonzero]
+            return self.mean, std_all
+
+        mean_all = xp.zeros(input_dim, dtype=xp.float32)
+        mean_all[self.indices] = self.mean
+        std_all[self.indices[nonzero]] = self.std[nonzero]
+        return mean_all, std_all
+
+    def transform(self, x):
+        """Return `(x - mean) / std` computed feature-wise.
+
+        Raises:
+            AttributeError: if `fit` has not been called.
+        """
+        self._check_fitted()
+        x = format_x(x)
+        mean_all, std_all = self._compute_mean_std_all(x.shape[1])
+        centered = x - mean_all[None, :]
+        return centered / std_all[None, :]
+
+    def inverse_transform(self, x):
+        """Return `x * std + mean` computed feature-wise.
+
+        Raises:
+            AttributeError: if `fit` has not been called.
+        """
+        self._check_fitted()
+        x = format_x(x)
+        mean_all, std_all = self._compute_mean_std_all(x.shape[1])
+        rescaled = x * std_all[None, :]
+        return rescaled + mean_all[None, :]
diff --git a/tests/links_tests/scaler_tests/test_standard_scaler.py b/tests/links_tests/scaler_tests/test_standard_scaler.py
@@ -0,0 +1,133 @@
+import os
+
+import numpy
+import pytest
+from chainer import serializers, Variable, cuda
+
+from chainer_chemistry.links.scaler.standard_scaler import StandardScaler
+
+
+@pytest.fixture
+def data():
+    """Return a (4, 3) float32 sample and its expected scaled values.
+
+    The second array is what the tests expect `transform` to produce when
+    every column is scaled.
+    """
+    raw_rows = [
+        [0.1, 10., 0.3],
+        [0.2, 20., 0.1],
+        [0.3, 30., 0.],
+        [0.4, 40., 0.],
+    ]
+    scaled_rows = [
+        [-1.3416407, -1.3416408, 1.6329931],
+        [-0.44721353, -0.4472136, 0.],
+        [0.44721368, 0.4472136, -0.8164965],
+        [1.3416407, 1.3416408, -0.8164965],
+    ]
+    x = numpy.asarray(raw_rows, dtype=numpy.float32)
+    expect_x_scaled = numpy.asarray(scaled_rows, dtype=numpy.float32)
+    return x, expect_x_scaled
+
+
+@pytest.mark.parametrize('indices', [None, [0], [1, 2]])
+def test_standard_scaler_transform(data, indices):
+    """Selected columns are scaled; the remaining columns are unchanged."""
+    x, expect_x_scaled = data
+    x_scaled = StandardScaler().fit(x, indices=indices).transform(x)
+
+    n_features = x.shape[1]
+    # With indices=None every column is expected to be scaled.
+    scaled_columns = range(n_features) if indices is None else indices
+    for column in range(n_features):
+        reference = expect_x_scaled if column in scaled_columns else x
+        assert numpy.allclose(x_scaled[:, column], reference[:, column])
+
+
+def test_standard_scaler_transform_variable(data):
+    """A Variable input is accepted and a Variable is returned."""
+    x, expect_x_scaled = data
+    xvar = Variable(x)
+    scaler = StandardScaler().fit(xvar)
+    result = scaler.transform(xvar)
+
+    assert isinstance(result, Variable)
+    assert numpy.allclose(result.array, expect_x_scaled)
+
+
+@pytest.mark.gpu
+def test_standard_scaler_transform_gpu(data):
+    """Fitting and transforming a GPU array returns a cupy array."""
+    x_cpu, expect_x_scaled = data
+    scaler = StandardScaler()
+    scaler.to_gpu()
+    x_gpu = cuda.to_gpu(x_cpu)
+    scaler.fit(x_gpu)
+    result = scaler.transform(x_gpu)
+
+    assert isinstance(result, cuda.cupy.ndarray)
+    assert numpy.allclose(cuda.to_cpu(result), expect_x_scaled)
+
+
+@pytest.mark.parametrize('indices', [None, [0], [1, 2]])
+def test_standard_scaler_inverse_transform(data, indices):
+    """Selected columns are restored; the remaining columns are unchanged."""
+    x, expect_x_scaled = data
+    scaler = StandardScaler().fit(x, indices=indices)
+    x_inverse = scaler.inverse_transform(expect_x_scaled)
+
+    n_features = x.shape[1]
+    # With indices=None every column is expected to be restored.
+    restored_columns = range(n_features) if indices is None else indices
+    for column in range(n_features):
+        reference = x if column in restored_columns else expect_x_scaled
+        assert numpy.allclose(x_inverse[:, column], reference[:, column])
+
+
+def test_standard_scaler_fit_transform(data):
+    """`fit_transform` yields the expected scaled values in one call."""
+    x, expect_x_scaled = data
+    result = StandardScaler().fit_transform(x)
+    assert numpy.allclose(result, expect_x_scaled)
+
+
+@pytest.mark.parametrize('indices', [None, [0]])
+def test_standard_scaler_serialize(tmpdir, data, indices):
+    """`mean`, `std` and `indices` survive a save_npz / load_npz round trip."""
+    x, expect_x_scaled = data
+    scaler = StandardScaler()
+    scaler.fit(x, indices=indices)
+
+    scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz')
+    serializers.save_npz(scaler_filepath, scaler)
+
+    scaler2 = StandardScaler()
+    serializers.load_npz(scaler_filepath, scaler2)
+
+    assert numpy.allclose(scaler.mean, scaler2.mean)
+    assert numpy.allclose(scaler.std, scaler2.std)
+    # `indices` is None or an index array. `==` on arrays is elementwise
+    # and its truth value is ambiguous for more than one element, so
+    # compare with array_equal, which also accepts the None/None case.
+    assert numpy.array_equal(scaler.indices, scaler2.indices)
+
+
+def test_standard_scaler_assert_raises():
+    """Scaling before `fit` raises AttributeError in both directions."""
+    x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]],
+                    dtype=numpy.float32)
+    scaler = StandardScaler()
+
+    # Neither method may be used on an unfitted scaler.
+    for method in (scaler.transform, scaler.inverse_transform):
+        with pytest.raises(AttributeError):
+            method(x)
+
+
+def test_standard_scaler_transform_zero_std():
+    """Constant columns (std == 0) are scaled to zeros."""
+    row = numpy.array([1, 2], dtype=numpy.float32)
+    x = numpy.tile(row, (3, 1))
+    expect_x_scaled = numpy.zeros((3, 2), dtype=numpy.float32)
+
+    x_scaled = StandardScaler().fit(x).transform(x)
+    assert numpy.allclose(x_scaled, expect_x_scaled)
+
+
+# When executed as a script, run this file's tests through pytest with the
+# '-v' and '-s' options.
+if __name__ == '__main__':
+    pytest.main([__file__, '-v', '-s'])