From c0d97447d81221f031e81515600663dd6eb89579 Mon Sep 17 00:00:00 2001 From: karita Date: Tue, 19 Dec 2017 01:21:57 +0900 Subject: [PATCH 01/17] add travis test --- .travis.yml | 41 ++++++++++++++++++++++++++++++ test/test_e2e_model.py | 55 ++++++++++++++++++++++++++++++++++++++++ test/test_io_voxforge.py | 54 +++++++++++++++++++++++++-------------- test/test_loss.py | 37 +++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 19 deletions(-) create mode 100644 .travis.yml create mode 100644 test/test_e2e_model.py create mode 100644 test/test_loss.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000000..10847df475f --- /dev/null +++ b/.travis.yml @@ -0,0 +1,41 @@ +language: python + +cache: + - pip + - ccache + +matrix: + include: + - os: linux + python: "2.7" + - os: linux + python: "3.6" + + +install: + - pip install -U pip wheel + - python setup.py sdist + - pip install dist/*.tar.gz + - pip install pytest hacking mock + - pip install autopep8 + - pip install -r tools/requirements.txt + - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl; fi + - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp27mu-linux_x86_64.whl; fi + - cd tools && make warp-ctc && cd - + + +script: + # - flake8 + # - autopep8 -r . --global-config .pep8 --diff | tee check_autopep8 + # - test ! -s check_autopep8 + - cd tests + - pytest test + +sudo: false + +addons: + apt: + packages: + - cmake + - python-dev + - python3-dev diff --git a/test/test_e2e_model.py b/test/test_e2e_model.py new file mode 100644 index 00000000000..1041eefde4d --- /dev/null +++ b/test/test_e2e_model.py @@ -0,0 +1,55 @@ +# coding: utf-8 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + +import sys +import argparse +sys.path.append("./src/nets") + + +import numpy + +import e2e_asr_attctc_th +import e2e_asr_attctc + +args = argparse.Namespace( + elayers = 4, + subsample = "1_2_2_1_1", + etype = "vggblstmp", + eunits = 100, + eprojs = 100, + dlayers=1, + dunits=300, + atype="location", + aconv_chans=10, + aconv_filts=100, + mtlalpha=0.5, + adim=320, + dropout_rate=0.0, + beam_size=3, + penalty=0.5, + maxlenratio=1.0, + minlenratio=0.0, + verbose = True, + char_list = [u"あ", u"い", u"う", u"え", u"お"], + outdir = None +) + + + +def test_model_trainable_and_decodable(): + for m in [e2e_asr_attctc, e2e_asr_attctc_th]: + model = m.Loss(m.E2E(40, 5, args), 0.5) + out_data = "1 2 3 4" + data = [ + ("aaa", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid=out_data)), + ("bbb", dict(feat=numpy.random.randn(200, 40).astype(numpy.float32), tokenid=out_data)) + ] + attn_loss = model(data) + attn_loss.backward() # trainable + + in_data = data[0][1]["feat"] + y = model.predictor.recognize(in_data, args, args.char_list) # decodable + diff --git a/test/test_io_voxforge.py b/test/test_io_voxforge.py index 52c8b164d2e..cae00b35c6e 100644 --- a/test/test_io_voxforge.py +++ b/test/test_io_voxforge.py @@ -1,22 +1,38 @@ # coding: utf-8 +import sys +sys.path.append("./src/utils") + +import os import numpy -import kaldi_io -import kaldi_io_py -import lazy_io - -train_scp = "scp:egs/voxforge/asr1/data/tr_it/feats.scp" - -r1 = kaldi_io_py.read_mat_scp(train_scp) -r2 = kaldi_io.RandomAccessBaseFloatMatrixReader(train_scp) -r3 = lazy_io.read_dict_scp(train_scp) - -for 
k, v1 in r1: - k = str(k) - print(k) - v2 = r2[k] - v3 = r3[k] - assert v1.shape == v2.shape - assert v1.shape == v3.shape - numpy.testing.assert_allclose(v1, v2, atol=1e-5) - numpy.testing.assert_allclose(v1, v3, atol=0) + + +# TODO: use much smaller corpus like AN4 and download if it does not exists +def test_voxforge_feats(): + import kaldi_io_py + import lazy_io + try: + import kaldi_io + except: + print("skip test_voxforge_feats because kaldi_io (kaldi-python) is not installed") + return + + + train_scp = "scp:egs/voxforge/asr1/data/tr_it/feats.scp" + if not os.path.exists(train_scp): + print("skip test_voxforge_feats because voxforge scp has not been created") + return + + r1 = kaldi_io_py.read_mat_scp(train_scp) + r2 = kaldi_io.RandomAccessBaseFloatMatrixReader(train_scp) + r3 = lazy_io.read_dict_scp(train_scp) + + for k, v1 in r1: + k = str(k) + print(k) + v2 = r2[k] + v3 = r3[k] + assert v1.shape == v2.shape + assert v1.shape == v3.shape + numpy.testing.assert_allclose(v1, v2, atol=1e-5) + numpy.testing.assert_allclose(v1, v3, atol=0) diff --git a/test/test_loss.py b/test/test_loss.py new file mode 100644 index 00000000000..f65f7137518 --- /dev/null +++ b/test/test_loss.py @@ -0,0 +1,37 @@ +# coding: utf-8 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + +import sys +sys.path.append("./src/nets") + +import numpy +import chainer +import chainer.functions as F +import torch +from warpctc_pytorch import CTCLoss +from e2e_asr_attctc_th import pad_list + + +def test_loss(): + n_out = 7 + n_batch = 3 + input_length = numpy.array([11, 17, 15], dtype=numpy.int32) + label_length = numpy.array([4, 2, 3], dtype=numpy.int32) + np_pred = [numpy.random.rand(il, n_out).astype(numpy.float32) for il in input_length] + np_target = [numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32) for ol in label_length] + + # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr_attctc.py + ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2) + ch_target = F.pad_sequence(np_target, padding=-1) + ch_loss = F.connectionist_temporal_classification(ch_pred, ch_target, 0, input_length, label_length).data + + th_pred = pad_list([torch.autograd.Variable(torch.from_numpy(x)) for x in np_pred]).transpose(0, 1) + th_target = torch.autograd.Variable(torch.from_numpy(numpy.concatenate(np_target))) + th_ilen = torch.autograd.Variable(torch.from_numpy(input_length)) + th_olen = torch.autograd.Variable(torch.from_numpy(label_length)) + # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size while chainer's default setting does + th_loss = (CTCLoss()(th_pred, th_target, th_ilen, th_olen) / n_batch).data.numpy()[0] + numpy.testing.assert_allclose(th_loss, ch_loss, 0.05) From 23f5696fe79bbe5f0d9db752b7a2c37434d5a2ae Mon Sep 17 00:00:00 2001 From: karita Date: Tue, 19 Dec 2017 01:24:41 +0900 Subject: [PATCH 02/17] fix .travis.yml --- .travis.yml | 15 ++++++----- src/nets/e2e_asr_attctc.py | 47 ++++++++++++++++++++++++++++++++ src/nets/e2e_asr_attctc_th.py | 50 ++--------------------------------- test/test_e2e_model.py | 16 ++++++----- test/test_io_voxforge.py | 10 +++---- test/test_loss.py | 19 ++++++++----- tools/Makefile | 14 ++++++++++ 7 files changed, 98 insertions(+), 73 deletions(-) diff --git a/.travis.yml b/.travis.yml index 10847df475f..8c693adb209 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,21 +14,22 @@ matrix: install: - pip install -U pip wheel - - python setup.py sdist - - pip install dist/*.tar.gz 
- pip install pytest hacking mock - pip install autopep8 - - pip install -r tools/requirements.txt - - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl; fi - - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp27mu-linux_x86_64.whl; fi - - cd tools && make warp-ctc && cd - + # unable to install pytorch as https://github.com/pytorch/pytorch/issues/4178 + # - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl; fi + # - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl; fi + # - cd tools && make warp-ctc && cd - + - grep -v cupy tools/requirements.txt | pip install -r /dev/stdin + - cd tools && git clone https://github.com/vesis84/kaldi-io-for-python.git && cd - script: + # TODO test coding style? # - flake8 # - autopep8 -r . --global-config .pep8 --diff | tee check_autopep8 # - test ! -s check_autopep8 - - cd tests + - export PYTHONPATH=`pwd`/src/nets:`pwd`/src/utils - pytest test sudo: false diff --git a/src/nets/e2e_asr_attctc.py b/src/nets/e2e_asr_attctc.py index c7715ee232b..542b0a73d6a 100644 --- a/src/nets/e2e_asr_attctc.py +++ b/src/nets/e2e_asr_attctc.py @@ -793,3 +793,50 @@ def __call__(self, xs, ilens): xs = [xs[i, :ilens[i], :] for i in range(len(ilens))] return xs, ilens + + +if __name__ == '__main__': + import numpy + # from typing import NamedTuple + from argparse import Namespace + args = Namespace( + elayers = 4, + subsample = "1_2_2_1_1", + etype = "vggblstmp", + eunits = 100, + eprojs = 100, + dlayers=1, + dunits=300, + # attention related + atype="location", + aconv_chans=10, + aconv_filts=100, + mtlalpha=0.5, + # defaults + adim=320, + dropout_rate=0.0, + beam_size=3, + penalty=0.5, + + maxlenratio=1.0, + minlenratio=0.0, + + verbose = True, + char_list = ["a", "b", "c", "d", "e"], + outdir = None + ) + + model = Loss(E2E(40, 5, args), 0.5) + out_data = "1 2 3 4" + data = [ + ("aaa", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid=out_data)), + ("bbb", dict(feat=numpy.random.randn(200, 40).astype(numpy.float32), tokenid=out_data)) + ] + attn_loss = model(data) + print(attn_loss) + attn_loss.backward() + + in_data = data[0][1]["feat"] + y = model.predictor.recognize(in_data, args, args.char_list) + print(y) + print("OK") diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index 76897ea2074..46b4832dacb 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -84,7 +84,7 @@ def report(self, loss_ctc, loss_att, acc, mtl_loss): # TODO merge Loss and E2E: there is no need to make these separately class Loss(torch.nn.Module): - def __init__(self, predictor, mtlalpha=0.0): + def __init__(self, predictor, mtlalpha): super(Loss, self).__init__() self.mtlalpha = mtlalpha self.loss = None @@ -104,7 +104,7 @@ def forward(self, x): self.loss = alpha * loss_ctc + (1 - alpha) * loss_att if self.loss.data[0] < CTC_LOSS_THRESHOLD and not math.isnan(self.loss.data[0]): - self.reporter.report(loss_ctc, loss_att.data[0], acc, self.loss.data[0]) + self.reporter.report(loss_ctc.data[0], loss_att.data[0], acc, self.loss.data[0]) else: logging.warning('loss (=%f) is not correct', self.loss.data) @@ -847,49 +847,3 @@ def forward(self, xs, ilens): xs = [xs[i, 
:ilens[i]] for i in range(len(ilens))] xs = pad_list(xs, 0.0) return xs, ilens - - -if __name__ == '__main__': - import numpy - from typing import NamedTuple - class args(NamedTuple): - elayers = 4 - subsample = "1_2_2_1_1" - etype = "vggblstmp" - eunits = 100 - eprojs = 100 - dlayers=1 - dunits=300 - # attention related - atype="location" - aconv_chans=10 - aconv_filts=100 - mtlalpha=0.5 - # defaults - adim=320 - dropout_rate=0.0 - beam_size=3 - penalty=0.5 - - maxlenratio=1.0 - minlenratio=0.0 - - verbose = True - char_list = ["a", "b", "c", "d", "e"] - outdir = None - - model = Loss(E2E(40, 5, args)) - model.cuda() - out_data = "1 2 3 4" - data = [ - ("aaa", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid=out_data)), - ("bbb", dict(feat=numpy.random.randn(200, 40).astype(numpy.float32), tokenid=out_data)) - ] - attn_loss = model(data) - print(attn_loss.data[0]) - attn_loss.backward() - - in_data = data[0][1]["feat"] - y = model.predictor.recognize(in_data, args, args.char_list) - print(y) - print("OK") diff --git a/test/test_e2e_model.py b/test/test_e2e_model.py index 1041eefde4d..2ced58c04b4 100644 --- a/test/test_e2e_model.py +++ b/test/test_e2e_model.py @@ -4,15 +4,12 @@ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -import sys +import importlib import argparse -sys.path.append("./src/nets") - +import pytest import numpy -import e2e_asr_attctc_th -import e2e_asr_attctc args = argparse.Namespace( elayers = 4, @@ -40,7 +37,14 @@ def test_model_trainable_and_decodable(): - for m in [e2e_asr_attctc, e2e_asr_attctc_th]: + for m_str in ["e2e_asr_attctc", "e2e_asr_attctc_th"]: + try: + import torch + except: + if m_str[-3:] == "_th": + pytest.skip("pytorch is not installed") + + m = importlib.import_module(m_str) model = m.Loss(m.E2E(40, 5, args), 0.5) out_data = "1 2 3 4" data = [ diff --git a/test/test_io_voxforge.py b/test/test_io_voxforge.py index cae00b35c6e..44d12d5481b 100644 --- a/test/test_io_voxforge.py +++ b/test/test_io_voxforge.py @@ -1,8 +1,8 @@ # coding: utf-8 +import os import sys -sys.path.append("./src/utils") -import os +import pytest import numpy @@ -13,14 +13,12 @@ def test_voxforge_feats(): try: import kaldi_io except: - print("skip test_voxforge_feats because kaldi_io (kaldi-python) is not installed") - return + pytest.skip("kaldi_io (kaldi-python) is not installed") train_scp = "scp:egs/voxforge/asr1/data/tr_it/feats.scp" if not os.path.exists(train_scp): - print("skip test_voxforge_feats because voxforge scp has not been created") - return + pytest.skip("voxforge scp has not been created") r1 = kaldi_io_py.read_mat_scp(train_scp) r2 = kaldi_io.RandomAccessBaseFloatMatrixReader(train_scp) diff --git a/test/test_loss.py b/test/test_loss.py index f65f7137518..a4fea589515 100644 --- a/test/test_loss.py +++ b/test/test_loss.py @@ -4,18 +4,25 @@ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -import sys -sys.path.append("./src/nets") - +import pytest import numpy import chainer import chainer.functions as F -import torch -from warpctc_pytorch import CTCLoss -from e2e_asr_attctc_th import pad_list def test_loss(): + try: + import torch + except: + pytest.skip("pytorch is not installed") + try: + from warpctc_pytorch import CTCLoss + except: + pytest.skip("warpctc_pytorch is not installed") + + from e2e_asr_attctc_th import pad_list + + n_out = 7 n_batch = 3 input_length = numpy.array([11, 17, 15], dtype=numpy.int32) diff --git a/tools/Makefile b/tools/Makefile index 28ad00fba2f..4842c2b38e5 100644 --- a/tools/Makefile +++ 
b/tools/Makefile @@ -1,3 +1,6 @@ +.PHONY: all clean + + all: kaldi venv venv/bin/activate nkf kaldi-io-for-python.git kaldi-io-for-python.git: @@ -27,3 +30,14 @@ nkf: clean: rm -fr kaldi_github kaldi kaldi_python venv nkf kaldi-io-for-python ../src/utils/kaldi_io_py.py find -iname "*.pyc" -delete + + +# optional deps for pytorch backends +pytorch: + pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl + +warp-ctc: + git clone https://github.com/SeanNaren/warp-ctc.git + cd warp-ctc && mkdir build && cd build && cmake .. && make -j4 + pip install cffi + cd warp-ctc/pytorch_binding && python setup.py install # maybe need to: apt-get install python-dev From a7a43f370b55858284ca66e3ba1ef10d32fe1097 Mon Sep 17 00:00:00 2001 From: karita Date: Tue, 19 Dec 2017 14:17:50 +0900 Subject: [PATCH 03/17] make pytorch initialization consistent to chainer and test partialy --- src/nets/e2e_asr_attctc_th.py | 27 ++++++++++++++- test/test_initialization.py | 62 +++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 test/test_initialization.py diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index 46b4832dacb..b8e65904895 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -40,6 +40,29 @@ def _ilens_to_index(ilens): return x[1:] +def lecun_normal_init_parameters(module): + for p in module.parameters(): + data = p.data + if data.dim() == 1: + # bias + data.zero_() + elif data.dim() == 2: + # linear weight + n = data.size(1) + stdv = 1. / math.sqrt(n) + data.normal_(0, stdv) + elif data.dim() == 4: + # conv weight + n = data.size(1) + for k in data.size()[2:]: + n *= k + stdv = 1. / math.sqrt(n) + data.normal_(0, stdv) + else: + raise NotImplementedError + + + # get output dim for latter BLSTM def _get_vgg2l_odim(idim, in_channel=3, out_channel=128): idim = idim / in_channel @@ -161,6 +184,8 @@ def __init__(self, idim, odim, args): # decoder self.dec = Decoder(args.eprojs, odim, args.dlayers, args.dunits, self.sos, self.eos, self.att, self.verbose, self.char_list) + # maybe consistent to chainer + lecun_normal_init_parameters(self) # x[i]: ('utt_id', {'ilen':'xxx',...}}) def forward(self, data): @@ -272,7 +297,7 @@ def forward(self, hpad, ilens, ys): # expected shape of seqLength x batchSize x alphabet_size y_hat = y_hat.transpose(0, 1) - self.loss = to_cuda(self, self.loss_fn(y_hat, y_true, ilens, olens)) + self.loss = to_cuda(self, self.loss_fn(y_hat, y_true, ilens, olens)) / len(ys) logging.info('ctc loss:' + str(self.loss.data[0])) return self.loss diff --git a/test/test_initialization.py b/test/test_initialization.py new file mode 100644 index 00000000000..79df38932f3 --- /dev/null +++ b/test/test_initialization.py @@ -0,0 +1,62 @@ +# coding: utf-8 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + +import importlib +import argparse + +import pytest +import numpy + + +args = argparse.Namespace( + elayers = 4, + subsample = "1_2_2_1_1", + etype = "vggblstmp", + eunits = 100, + eprojs = 100, + dlayers=1, + dunits=300, + atype="location", + aconv_chans=10, + aconv_filts=100, + mtlalpha=0.5, + adim=320, + dropout_rate=0.0, + beam_size=3, + penalty=0.5, + maxlenratio=1.0, + minlenratio=0.0, + verbose = True, + char_list = [u"あ", u"い", u"う", u"え", u"お"], + outdir = None +) + + + +def test_lecun_init_torch(): + try: + import torch + except ImportError: + pytest.skip("pytorch is not installed") 
+ + import e2e_asr_attctc_th as m + model = m.Loss(m.E2E(40, 5, args), 0.5) + b = model.predictor.ctc.ctc_lo.bias.data.numpy() + assert numpy.all(b == 0.0) + w = model.predictor.ctc.ctc_lo.weight.data.numpy() + numpy.testing.assert_allclose(w.mean(), 0.0, 1e-2, 1e-2) + numpy.testing.assert_allclose(w.var(), 1.0 / w.shape[1], 1e-2, 1e-2) + + +def test_lecun_init_chainer(): + import e2e_asr_attctc as m + model = m.Loss(m.E2E(40, 5, args), 0.5) + b = model.predictor.ctc.ctc_lo.b.data + assert numpy.all(b == 0.0) + w = model.predictor.ctc.ctc_lo.W.data + numpy.testing.assert_allclose(w.mean(), 0.0, 1e-2, 1e-2) + numpy.testing.assert_allclose(w.var(), 1.0 / w.shape[1], 1e-2, 1e-2) + From 2462ce3a827e88cc0ce8a897f4c2fcc900ca597a Mon Sep 17 00:00:00 2001 From: karita Date: Tue, 19 Dec 2017 14:24:15 +0900 Subject: [PATCH 04/17] fix encoding utf-8 --- src/nets/e2e_asr_attctc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nets/e2e_asr_attctc.py b/src/nets/e2e_asr_attctc.py index 542b0a73d6a..1eb70c14dff 100644 --- a/src/nets/e2e_asr_attctc.py +++ b/src/nets/e2e_asr_attctc.py @@ -466,8 +466,8 @@ def __call__(self, hs, ys): idx_true = y_true_[y_true_ != -1] seq_hat = [self.char_list[int(idx)] for idx in idx_hat] seq_true = [self.char_list[int(idx)] for idx in idx_true] - seq_hat = "".join(seq_hat).encode('utf-8').replace('', ' ') - seq_true = "".join(seq_true).encode('utf-8').replace('', ' ') + seq_hat = "".join(seq_hat).replace('', ' ') + seq_true = "".join(seq_true).replace('', ' ') logging.info("groundtruth[%d]: " + seq_true, i) logging.info("prediction [%d]: " + seq_hat, i) From d3f6bb102c02c85602bdd0c47d9f853ed4f9f293 Mon Sep 17 00:00:00 2001 From: karita Date: Wed, 20 Dec 2017 00:40:16 +0900 Subject: [PATCH 05/17] more compatible chainer/pytorch --- src/bin/asr_train_th.py | 2 +- src/nets/e2e_asr_attctc_th.py | 34 +++++++++++++++++++++++- src/utils/concatjson.py | 2 +- src/utils/filt.py | 2 +- src/utils/json2trn.py | 2 +- src/utils/mergejson.py | 2 +- src/utils/scp2json.py | 2 +- src/utils/text2token.py | 2 +- test/test_initialization.py | 34 ++++++++++++++++++++++-- test/test_loss.py | 49 ++++++++++++++++++++++++++++++++++- 10 files changed, 120 insertions(+), 11 deletions(-) diff --git a/src/bin/asr_train_th.py b/src/bin/asr_train_th.py index ee71322d3d9..2fd3b670c74 100755 --- a/src/bin/asr_train_th.py +++ b/src/bin/asr_train_th.py @@ -460,7 +460,7 @@ def main(): # Setup an optimizer if args.opt == 'adadelta': - optimizer = torch.optim.Adadelta(model.parameters(), eps=args.eps) + optimizer = torch.optim.Adadelta(model.parameters(), rho=0.95, eps=args.eps) elif args.opt == 'adam': optimizer = torch.optim.Adam(model.parameters()) diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index b8e65904895..5ba495a7382 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -144,6 +144,12 @@ def pad_list(xs, pad_value=float("nan")): return pad +def set_forget_bias_to_one(bias): + n = bias.size(0) + start, end = n//4, n//2 + bias.data[start:end].fill_(1.) + + class E2E(torch.nn.Module): def __init__(self, idim, odim, args): super(E2E, self).__init__() @@ -184,9 +190,35 @@ def __init__(self, idim, odim, args): # decoder self.dec = Decoder(args.eprojs, odim, args.dlayers, args.dunits, self.sos, self.eos, self.att, self.verbose, self.char_list) - # maybe consistent to chainer + + # weight initialization + self.init_like_chainer() + # additional forget-bias init in encoder ? 
+ # for m in self.modules(): + # if isinstance(m, torch.nn.LSTM): + # for name, p in m.named_parameters(): + # if "bias_ih" in name: + # set_forget_bias_to_one(p) + + def init_like_chainer(self): + """ + chainer basically uses LeCun way: W ~ Normal(0, fan_in ** -0.5), b = 0 + pytorch basically uses W, b ~ Uniform(-fan_in**-0.5, fan_in**-0.5) + + however, there are two exceptions as far as I know. + - EmbedID.W ~ Normal(0, 1) + - LSTM.upward.b[forget_gate_range] = 1 (but not used in NStepLSTM) + """ lecun_normal_init_parameters(self) + # exceptions + # embed weight ~ Normal(0, 1) + self.dec.embed.weight.data.normal_(0, 1) + # forget-bias = 1.0 + # https://discuss.pytorch.org/t/set-forget-gate-bias-of-lstm/1745 + set_forget_bias_to_one(self.dec.decoder.bias_ih) + + # x[i]: ('utt_id', {'ilen':'xxx',...}}) def forward(self, data): ''' diff --git a/src/utils/concatjson.py b/src/utils/concatjson.py index f187195bc43..08e70d35b91 100755 --- a/src/utils/concatjson.py +++ b/src/utils/concatjson.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # encoding: utf-8 # Copyright 2017 Johns Hopkins University (Shinji Watanabe) diff --git a/src/utils/filt.py b/src/utils/filt.py index 7bb9e3535be..1a3fb46398f 100755 --- a/src/utils/filt.py +++ b/src/utils/filt.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # Apache 2.0 diff --git a/src/utils/json2trn.py b/src/utils/json2trn.py index 6b274c76cce..c0cea44a3ce 100755 --- a/src/utils/json2trn.py +++ b/src/utils/json2trn.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # encoding: utf-8 # Copyright 2017 Johns Hopkins University (Shinji Watanabe) diff --git a/src/utils/mergejson.py b/src/utils/mergejson.py index 9894b7438db..ef8500e37bb 100755 --- a/src/utils/mergejson.py +++ b/src/utils/mergejson.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # encoding: utf-8 # Copyright 2017 Johns Hopkins University (Shinji Watanabe) diff --git a/src/utils/scp2json.py b/src/utils/scp2json.py index 00b5eee0399..ad05534c62c 100755 --- a/src/utils/scp2json.py +++ b/src/utils/scp2json.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # encoding: utf-8 # Copyright 2017 Johns Hopkins University (Shinji Watanabe) diff --git a/src/utils/text2token.py b/src/utils/text2token.py index d72993db0af..a8dd1670db4 100755 --- a/src/utils/text2token.py +++ b/src/utils/text2token.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # Copyright 2017 Johns Hopkins University (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) diff --git a/test/test_initialization.py b/test/test_initialization.py index 79df38932f3..4618f5fdc87 100644 --- a/test/test_initialization.py +++ b/test/test_initialization.py @@ -15,8 +15,8 @@ elayers = 4, subsample = "1_2_2_1_1", etype = "vggblstmp", - eunits = 100, - eprojs = 100, + eunits = 320, + eprojs = 320, dlayers=1, dunits=300, atype="location", @@ -50,6 +50,22 @@ def test_lecun_init_torch(): numpy.testing.assert_allclose(w.mean(), 0.0, 1e-2, 1e-2) numpy.testing.assert_allclose(w.var(), 1.0 / w.shape[1], 1e-2, 1e-2) + for name, p in model.named_parameters(): + print(name) + data = p.data.numpy() + if "embed" in name: + numpy.testing.assert_allclose(data.mean(), 0.0, 5e-2, 5e-2) + numpy.testing.assert_allclose(data.var(), 1.0, 5e-2, 5e-2) + elif "predictor.dec.decoder.bias_ih" in name: + assert data.sum() == data.size // 4 + elif data.ndim == 1: + assert numpy.all(data == 0.0) + else: + numpy.testing.assert_allclose(data.mean(), 0.0, 5e-2, 
5e-2) + numpy.testing.assert_allclose(data.var(), 1.0 / numpy.prod(data.shape[1:]), 5e-2, 5e-2) + + + def test_lecun_init_chainer(): import e2e_asr_attctc as m @@ -59,4 +75,18 @@ def test_lecun_init_chainer(): w = model.predictor.ctc.ctc_lo.W.data numpy.testing.assert_allclose(w.mean(), 0.0, 1e-2, 1e-2) numpy.testing.assert_allclose(w.var(), 1.0 / w.shape[1], 1e-2, 1e-2) + + for name, p in model.namedparams(): + print(name) + data = p.data + if "decoder/upward/b" in name: + assert data.sum() == data.size // 4 + elif "embed" in name: + numpy.testing.assert_allclose(data.mean(), 0.0, 5e-2, 5e-2) + numpy.testing.assert_allclose(data.var(), 1.0, 5e-2, 5e-2) + elif data.ndim == 1: + assert numpy.all(data == 0.0) + else: + numpy.testing.assert_allclose(data.mean(), 0.0, 5e-2, 5e-2) + numpy.testing.assert_allclose(data.var(), 1.0 / numpy.prod(data.shape[1:]), 5e-2, 5e-2) diff --git a/test/test_loss.py b/test/test_loss.py index a4fea589515..3a396a7ee11 100644 --- a/test/test_loss.py +++ b/test/test_loss.py @@ -10,7 +10,7 @@ import chainer.functions as F -def test_loss(): +def test_ctc_loss(): try: import torch except: @@ -42,3 +42,50 @@ def test_loss(): # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size while chainer's default setting does th_loss = (CTCLoss()(th_pred, th_target, th_ilen, th_olen) / n_batch).data.numpy()[0] numpy.testing.assert_allclose(th_loss, ch_loss, 0.05) + + + +def test_attn_loss(): + try: + import torch + except: + pytest.skip("pytorch is not installed") + from e2e_asr_attctc_th import pad_list + + n_out = 7 + _sos = n_out - 1 + _eos = n_out - 1 + n_batch = 3 + label_length = numpy.array([4, 2, 3], dtype=numpy.int32) + np_pred = numpy.random.rand(n_batch, max(label_length) + 1, n_out).astype(numpy.float32) + # NOTE: 0 is only used for CTC, never appeared in attn target + np_target = [numpy.random.randint(1, n_out-1, size=ol, dtype=numpy.int32) for ol in label_length] + + eos = numpy.array([_eos], 'i') + sos = numpy.array([_sos], 'i') + ys_in = [F.concat([sos, y], axis=0) for y in np_target] + ys_out = [F.concat([y, eos], axis=0) for y in np_target] + + # padding for ys with -1 + # pys: utt x olen + pad_ys_in = F.pad_sequence(ys_in, padding=_eos) + pad_ys_out = F.pad_sequence(ys_out, padding=-1) # NOTE: -1 is default ignore index for chainer + + y_all = F.reshape(np_pred, (n_batch * (max(label_length) + 1), n_out)) + ch_loss = F.softmax_cross_entropy(y_all, F.concat(pad_ys_out, axis=0)) + + # NOTE: this index 0 is only for CTC not attn. 
so it can be ignored + # unfortunately, torch cross_entropy does not accept out-of-bound ids + th_ignore = 0 + th_pred = torch.autograd.Variable(torch.from_numpy(y_all.data)) + th_target = pad_list([torch.autograd.Variable(torch.from_numpy(t.data)).long() + for t in ys_out], th_ignore) + th_olen = torch.autograd.Variable(torch.from_numpy(label_length)) + th_loss = torch.nn.functional.cross_entropy(th_pred, th_target.view(-1), + ignore_index=th_ignore, size_average=True) + print(ch_loss) + print(th_loss) + + # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size while chainer's default setting does + + numpy.testing.assert_allclose(th_loss.data[0], ch_loss.data, 0.05) From fe60ae8fb0469761b8f8e3fe1a791e2655c57b24 Mon Sep 17 00:00:00 2001 From: karita Date: Wed, 20 Dec 2017 00:49:48 +0900 Subject: [PATCH 06/17] remove unused main --- src/nets/e2e_asr_attctc.py | 47 -------------------------------------- 1 file changed, 47 deletions(-) diff --git a/src/nets/e2e_asr_attctc.py b/src/nets/e2e_asr_attctc.py index 1eb70c14dff..943fdaa14d3 100644 --- a/src/nets/e2e_asr_attctc.py +++ b/src/nets/e2e_asr_attctc.py @@ -793,50 +793,3 @@ def __call__(self, xs, ilens): xs = [xs[i, :ilens[i], :] for i in range(len(ilens))] return xs, ilens - - -if __name__ == '__main__': - import numpy - # from typing import NamedTuple - from argparse import Namespace - args = Namespace( - elayers = 4, - subsample = "1_2_2_1_1", - etype = "vggblstmp", - eunits = 100, - eprojs = 100, - dlayers=1, - dunits=300, - # attention related - atype="location", - aconv_chans=10, - aconv_filts=100, - mtlalpha=0.5, - # defaults - adim=320, - dropout_rate=0.0, - beam_size=3, - penalty=0.5, - - maxlenratio=1.0, - minlenratio=0.0, - - verbose = True, - char_list = ["a", "b", "c", "d", "e"], - outdir = None - ) - - model = Loss(E2E(40, 5, args), 0.5) - out_data = "1 2 3 4" - data = [ - ("aaa", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid=out_data)), - ("bbb", dict(feat=numpy.random.randn(200, 40).astype(numpy.float32), tokenid=out_data)) - ] - attn_loss = model(data) - print(attn_loss) - attn_loss.backward() - - in_data = data[0][1]["feat"] - y = model.predictor.recognize(in_data, args, args.char_list) - print(y) - print("OK") From f08c91162a06243263b85a182c8518d3f8ef4dca Mon Sep 17 00:00:00 2001 From: karita Date: Wed, 20 Dec 2017 02:04:42 +0900 Subject: [PATCH 07/17] make accuracy consistent --- src/nets/e2e_asr_attctc_th.py | 13 ++++++----- test/test_loss.py | 42 +++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index 5ba495a7382..33ff3f0079b 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -474,6 +474,13 @@ def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0): return c, w +def th_accuracy(y_all, pad_target, ignore_label): + acc = 0 + pad_pred = y_all.data.view(pad_target.size(0), pad_target.size(1), y_all.size(1)).max(2)[1] + mask = pad_target.data != ignore_label + return torch.sum(pad_pred.masked_select(mask) == pad_target.data.masked_select(mask)) / torch.sum(mask) + + # ------------- Decoder Network ---------------------------------------------------------------------------------------- class Decoder(torch.nn.Module): def __init__(self, eprojs, odim, dlayers, dunits, sos, eos, att, verbose=0, char_list=None): @@ -551,11 +558,7 @@ def forward(self, hpad, hlen, ys): # -1: eos, which is removed in the loss computation self.loss *= 
(np.mean([len(x) for x in ys_in]) - 1) # acc = F.accuracy(y_all, F.concat(pad_ys_out, axis=0), ignore_label=-1) - acc = 0 - pred_pad = y_all.data.view(len(ys), olength, y_all.size(1)).max(2)[1] - for i in range(len(ys)): - acc += torch.sum(pred_pad[i, :ys[i].size(0)] == ys[i].data) - acc /= sum(map(len, ys)) + acc = th_accuracy(y_all, pad_ys_out, ignore_label=ignore_id) logging.info('att loss:' + str(self.loss.data)) # show predicted character sequence for debug diff --git a/test/test_loss.py b/test/test_loss.py index 3a396a7ee11..0c758d20e23 100644 --- a/test/test_loss.py +++ b/test/test_loss.py @@ -89,3 +89,45 @@ def test_attn_loss(): # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size while chainer's default setting does numpy.testing.assert_allclose(th_loss.data[0], ch_loss.data, 0.05) + + + +def test_train_acc(): + try: + import torch + except: + pytest.skip("pytorch is not installed") + from e2e_asr_attctc_th import pad_list, th_accuracy + + n_out = 7 + _sos = n_out - 1 + _eos = n_out - 1 + n_batch = 3 + label_length = numpy.array([4, 2, 3], dtype=numpy.int32) + np_pred = numpy.random.rand(n_batch, max(label_length) + 1, n_out).astype(numpy.float32) + # NOTE: 0 is only used for CTC, never appeared in attn target + np_target = [numpy.random.randint(1, n_out-1, size=ol, dtype=numpy.int32) for ol in label_length] + + eos = numpy.array([_eos], 'i') + sos = numpy.array([_sos], 'i') + ys_in = [F.concat([sos, y], axis=0) for y in np_target] + ys_out = [F.concat([y, eos], axis=0) for y in np_target] + + # padding for ys with -1 + # pys: utt x olen + pad_ys_in = F.pad_sequence(ys_in, padding=_eos) + pad_ys_out = F.pad_sequence(ys_out, padding=-1) # NOTE: -1 is default ignore index for chainer + y_all = F.reshape(np_pred, (n_batch * (max(label_length) + 1), n_out)) + ch_acc = F.accuracy(y_all, F.concat(pad_ys_out, axis=0), ignore_label=-1) + + # NOTE: this index 0 is only for CTC not attn. 
so it can be ignored + # unfortunately, torch cross_entropy does not accept out-of-bound ids + th_ignore = 0 + th_pred = torch.autograd.Variable(torch.from_numpy(y_all.data)) + th_ys = [torch.autograd.Variable(torch.from_numpy(numpy.append(t, eos))).long() + for t in np_target] + th_target = pad_list(th_ys, th_ignore) + th_olen = torch.autograd.Variable(torch.from_numpy(label_length)) + th_acc = th_accuracy(th_pred, th_target, th_ignore) + + numpy.testing.assert_allclose(ch_acc.data, th_acc) From 4d48754f781641e2dd51bfa98b89d676836f2d06 Mon Sep 17 00:00:00 2001 From: karita Date: Wed, 20 Dec 2017 18:26:51 +0900 Subject: [PATCH 08/17] add more test --- src/bin/asr_train.py | 2 +- src/bin/asr_train_th.py | 19 +------- src/nets/e2e_asr_attctc.py | 11 +++-- src/nets/e2e_asr_attctc_th.py | 86 +++++++++++++++++++---------------- test/test_torch.py | 35 ++++++++++++++ 5 files changed, 91 insertions(+), 62 deletions(-) create mode 100644 test/test_torch.py diff --git a/src/bin/asr_train.py b/src/bin/asr_train.py index 6b0dd36d142..14688c4456f 100755 --- a/src/bin/asr_train.py +++ b/src/bin/asr_train.py @@ -281,7 +281,7 @@ def main(): # network archtecture # encoder parser.add_argument('--etype', default='blstmp', type=str, - choices=['blstmp', 'vggblstmp', 'vggblstm'], + choices=['blstm', 'blstmp', 'vggblstmp', 'vggblstm'], help='Type of encoder network architecture') parser.add_argument('--elayers', default=4, type=int, help='Number of encoder layers') diff --git a/src/bin/asr_train_th.py b/src/bin/asr_train_th.py index 2fd3b670c74..88d358eb8ec 100755 --- a/src/bin/asr_train_th.py +++ b/src/bin/asr_train_th.py @@ -3,23 +3,6 @@ # Copyright 2017 Johns Hopkins University (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -options (batch_size is only changed because of my poor GPU at home): --gpu -1 --outdir exp/train_si284_vggblstmp_e4_subsample1_2_2_1_1_unit320_proj320_d1_unit300_location_aconvc10_aconvf100_mtlalpha0.5_adadelta_bs30_mli800_mlo150/results --debugmode 1 --dict data/lang_1char/train_si284_units.txt --debugdir exp/train_si284_vggblstmp_e4_subsample1_2_2_1_1_unit320_proj320_d1_unit300_location_aconvc10_aconvf100_mtlalpha0.5_adadelta_bs30_mli800_mlo150 --minibatches 0 --verbose 0 --train-feat scp:dump/train_si284/deltafalse/feats.scp --valid-feat scp:dump/test_dev93/deltafalse/feats.scp --train-label dump/train_si284/deltafalse/data.json --valid-label dump/test_dev93/deltafalse/data.json --etype blstmp --elayers 4 --eunits 320 --eprojs 320 --subsample 1_2_2_1_1 --dlayers 1 --dunits 300 --atype location --aconv-chans 10 --aconv-filts 100 --mtlalpha 0.5 --batch-size 5 --maxlen-in 800 --maxlen-out 150 --opt adadelta --epochs 15 --gpu 0 - - -chainer result -this epoch [#.................................................] 3.13% - 400 iter, 0 epoch / 15 epochs - 0.67657 iters/sec. Estimated time to finish: 3 days, 6:31:44.616061. - - -pytorch result -this epoch [#.................................................] 2.35% - 300 iter, 0 epoch / 15 epochs - 1.4973 iters/sec. Estimated time to finish: 1 day, 11:30:13.571661. 
- -""" - import os import copy @@ -305,7 +288,7 @@ def main(): # network archtecture # encoder parser.add_argument('--etype', default='blstmp', type=str, - choices=['blstmp', 'vggblstmp', 'vggblstm'], + choices=['blstm', 'blstmp', 'vggblstmp', 'vggblstm'], help='Type of encoder network architecture') parser.add_argument('--elayers', default=4, type=int, help='Number of encoder layers') diff --git a/src/nets/e2e_asr_attctc.py b/src/nets/e2e_asr_attctc.py index 943fdaa14d3..9a47df81bd8 100644 --- a/src/nets/e2e_asr_attctc.py +++ b/src/nets/e2e_asr_attctc.py @@ -271,7 +271,7 @@ def reset(self): self.enc_h = None self.pre_compute_enc_h = None - def __call__(self, enc_hs, dec_z, scaling=2.0): + def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0): ''' :param enc_hs: @@ -631,7 +631,10 @@ class Encoder(chainer.Chain): def __init__(self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1): super(Encoder, self).__init__() with self.init_scope(): - if etype == 'blstmp': + if etype == 'blstm': + self.enc1 = BLSTM(idim, elayers, eunits, eprojs, dropout) + logging.info('BLSTM without projection for encoder') + elif etype == 'blstmp': self.enc1 = BLSTMP(idim, elayers, eunits, eprojs, subsample, dropout) logging.info('BLSTM with every-layer projection for encoder') elif etype == 'vggblstmp': @@ -656,7 +659,9 @@ def __call__(self, xs, ilens): :param ilens: :return: ''' - if self.etype == 'blstmp': + if self.etype == 'blstm': + xs, ilens = self.enc1(xs, ilens) + elif self.etype == 'blstmp': xs, ilens = self.enc1(xs, ilens) elif self.etype == 'vggblstmp': xs, ilens = self.enc1(xs, ilens) diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index 33ff3f0079b..2fd44057e66 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -335,15 +335,21 @@ def forward(self, hpad, ilens, ys): return self.loss +def mask_by_length(xs, length, fill=0): + assert xs.size(0) == len(length) + ret = Variable(xs.data.new(*xs.size()).fill_(fill)) + for i, l in enumerate(length): + ret[i, :l] = xs[i, :l] + return ret + + # ------------- Attention Network -------------------------------------------------------------------------------------- # dot product based attention class AttDot(torch.nn.Module): def __init__(self, eprojs, dunits, att_dim): - raise NotImplementedError super(AttDot, self).__init__() - with self.init_scope(): - self.mlp_enc = L.Linear(eprojs, att_dim) - self.mlp_dec = L.Linear(dunits, att_dim) + self.mlp_enc = torch.nn.Linear(eprojs, att_dim) + self.mlp_dec = torch.nn.Linear(dunits, att_dim) self.dunits = dunits self.eprojs = eprojs @@ -361,7 +367,7 @@ def reset(self): self.enc_h = None self.pre_compute_enc_h = None - def forward(self, enc_hs, dec_z, scaling=2.0): + def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0): ''' :param enc_hs: @@ -369,27 +375,28 @@ def forward(self, enc_hs, dec_z, scaling=2.0): :param scaling: :return: ''' - batch = len(enc_hs) + batch = enc_hs_pad.size(0) # pre-compute all h outside the decoder loop if self.pre_compute_enc_h is None: - self.enc_h = F.pad_sequence(enc_hs) # utt x frame x hdim + self.enc_h = mask_by_length(enc_hs_pad, enc_hs_len) # utt x frame x hdim self.h_length = self.enc_h.shape[1] # utt x frame x att_dim - self.pre_compute_enc_h = F.tanh(linear_tensor(self.mlp_enc, self.enc_h)) + self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h) if dec_z is None: - dec_z = chainer.Variable(self.xp.zeros((batch, self.dunits), dtype=np.float32)) + dec_z = 
Variable(enc_hs_pad.data.new(batch, self.dunits).zero_()) else: - dec_z = F.reshape(dec_z, (batch, self.dunits)) + dec_z = dec_z.view(batch, self.dunits) + + e = torch.sum(self.pre_compute_enc_h * + torch.tanh(self.mlp_dec(dec_z)).view(batch, 1, self.att_dim), + dim=2) # utt x frame + w = torch.nn.functional.softmax(scaling * e, dim=1) - # for all t - e = F.sum(self.pre_compute_enc_h * F.tile(F.reshape(F.tanh(self.mlp_dec(dec_z)), (batch, 1, self.att_dim)), - (1, self.h_length, 1)), axis=2) # utt x frame - w = F.softmax(scaling * e) # weighted sum over flames # utt x hdim - c = F.sum(self.enc_h * F.tile(F.reshape(w, (batch, self.h_length, 1)), (1, 1, self.eprojs)), axis=1) - + # NOTE use bmm instead of sum(*) + c = torch.sum(self.enc_h * w.view(batch, self.h_length, 1), dim=1) return c, w @@ -489,6 +496,7 @@ def __init__(self, eprojs, odim, dlayers, dunits, sos, eos, att, verbose=0, char self.embed = torch.nn.Embedding(odim, dunits) # TODO use multiple layers with dlayers option self.decoder = torch.nn.LSTMCell(dunits + eprojs, dunits) # 310s per 100 ite -> 240s from NStepLSTM + self.ignore_id = 0 # NOTE: 0 for CTC? self.output = torch.nn.Linear(dunits, odim) self.loss = None @@ -520,8 +528,7 @@ def forward(self, hpad, hlen, ys): # pys: utt x olen pad_ys_in = pad_list(ys_in, self.eos) - ignore_id = 0 # NOTE: 0 for CTC? - pad_ys_out = pad_list(ys_out, ignore_id) + pad_ys_out = pad_list(ys_out, self.ignore_id) # get dim, length info batch = pad_ys_out.shape[0] @@ -548,17 +555,17 @@ def forward(self, hpad, hlen, ys): z_all.append(z) att_weight_all.append(att_w.data) # for debugging - z_all = torch.stack(z_all, dim=1).view(batch * olength, self.dunits) # NOTE: maybe cat? + z_all = torch.stack(z_all, dim=1).view(batch * olength, self.dunits) # compute loss y_all = self.output(z_all) # NOTE: use size_average=True? self.loss = torch.nn.functional.cross_entropy(y_all, pad_ys_out.view(-1), - ignore_index=ignore_id, size_average=True) + ignore_index=self.ignore_id, size_average=True) # NOTE: is this length-scaling required? 
# -1: eos, which is removed in the loss computation self.loss *= (np.mean([len(x) for x in ys_in]) - 1) # acc = F.accuracy(y_all, F.concat(pad_ys_out, axis=0), ignore_label=-1) - acc = th_accuracy(y_all, pad_ys_out, ignore_label=ignore_id) + acc = th_accuracy(y_all, pad_ys_out, ignore_label=self.ignore_id) logging.info('att loss:' + str(self.loss.data)) # show predicted character sequence for debug @@ -745,7 +752,10 @@ class Encoder(torch.nn.Module): def __init__(self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1): super(Encoder, self).__init__() - if etype == 'blstmp': + if etype == 'blstm': + self.enc1 = BLSTM(idim, elayers, eunits, eprojs, dropout) + logging.info('BLSTM without projection for encoder') + elif etype == 'blstmp': self.enc1 = BLSTMP(idim, elayers, eunits, eprojs, subsample, dropout) logging.info('BLSTM with every-layer projection for encoder') elif etype == 'vggblstmp': @@ -774,7 +784,9 @@ def forward(self, xs, ilens): :param ilens: :return: ''' - if self.etype == 'blstmp': + if self.etype == 'blstm': + xs, ilens = self.enc1(xs, ilens) + elif self.etype == 'blstmp': xs, ilens = self.enc1(xs, ilens) elif self.etype == 'vggblstmp': xs, ilens = self.enc1(xs, ilens) @@ -832,13 +844,12 @@ def forward(self, xpad, ilens): class BLSTM(torch.nn.Module): def __init__(self, idim, elayers, cdim, hdim, dropout): - raise NotImplementedError super(BLSTM, self).__init__() - with self.init_scope(): - self.nblstm = L.NStepBiLSTM(elayers, idim, cdim, dropout) - self.l_last = L.Linear(cdim * 2, hdim) + self.nblstm = torch.nn.LSTM(idim, cdim, elayers, batch_first=True, + dropout=dropout, bidirectional=True) + self.l_last = torch.nn.Linear(cdim * 2, hdim) - def forward(self, xs, ilens): + def forward(self, xpad, ilens): ''' :param xs: @@ -846,19 +857,14 @@ def forward(self, xs, ilens): :return: ''' logging.info(self.__class__.__name__ + ' input lengths: ' + str(ilens)) - hy, cy, ys = self.nblstm(None, None, xs) - ys = self.l_last(F.vstack(ys)) # (sum _utt frame_utt) x dim - xs = F.split_axis(ys, _ilens_to_index(ilens), axis=0) + xpack = pack_padded_sequence(xpad, ilens, batch_first=True) + ys, (hy, cy) = self.nblstm(xpack) del hy, cy - - # final tanh operation - xs = F.split_axis(F.tanh(F.vstack(xs)), _ilens_to_index(ilens), axis=0) - - # 1 utterance case, it becomes an array, so need to make a utt tuple - if not isinstance(xs, tuple): - xs = [xs] - - return xs, ilens # x: utt list of frame x dim + # ys: utt list of frame x cdim x 2 (2: means bidirectional) + ypad, ilens = pad_packed_sequence(ys, batch_first=True) + projected = torch.tanh(self.l_last(ypad.contiguous().view(-1, ypad.size(2)))) # (sum _utt frame_utt) x dim + xpad = projected.view(ypad.size(0), ypad.size(1), -1) + return xpad, ilens # x: utt list of frame x dim class VGG2L(torch.nn.Module): diff --git a/test/test_torch.py b/test/test_torch.py new file mode 100644 index 00000000000..1f41e62aa63 --- /dev/null +++ b/test/test_torch.py @@ -0,0 +1,35 @@ +import pytest + +try: + import torch + from torch.autograd import Variable +except: + pytest.skip("pytorch is not installed") + +from e2e_asr_attctc_th import pad_list, mask_by_length + + +def test_pad_list(): + xs = [[1, 2, 3], + [1, 2], + [1, 2, 3, 4]] + xs = list(map(lambda x: Variable(torch.LongTensor(x)), xs)) + xpad = pad_list(xs, -1) + + es = [[1, 2, 3, -1], + [1, 2, -1, -1], + [1, 2, 3, 4]] + assert xpad.data.tolist() == es + + +def test_mask_by_length(): + xs = [[1, 2, 3, -1], + [1, 2, -1, -1], + [1, 2, 3, 4]] + xs = Variable(torch.LongTensor(xs)) + 
xlen = [3, 2, 4] + ys = mask_by_length(xs, xlen, fill=0) + es = [[1, 2, 3, 0], + [1, 2, 0, 0], + [1, 2, 3, 4]] + assert ys.data.tolist() == es From 62a1cb0b3862d58ab0f5d29bf2eed71b71f93aeb Mon Sep 17 00:00:00 2001 From: karita Date: Thu, 21 Dec 2017 10:12:18 +0900 Subject: [PATCH 09/17] add forgotten tanh in encoder, chainer-like ctc and test them --- src/nets/e2e_asr_attctc_th.py | 42 +++++++++++++++----- test/test_e2e_model.py | 72 ++++++++++++++++++++++++++++++++++- test/test_initialization.py | 4 +- 3 files changed, 105 insertions(+), 13 deletions(-) diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index 2fd44057e66..b9412756962 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -295,14 +295,40 @@ def recognize(self, x, recog_args, char_list): # ------------- CTC Network -------------------------------------------------------------------------------------------- + +from warpctc_pytorch import CTCLoss, _CTC + +class _ChainerLikeCTC(_CTC): + def forward(self, acts, labels, act_lens, label_lens): + return super(_ChainerLikeCTC, self).forward(acts, labels, act_lens, label_lens) / acts.size(1) + + def backward(self, grad_output): + return self.grads / self.grads.size(1), None, None, None + + +def chainer_like_ctc_loss(acts, labels, act_lens, label_lens): + """ + acts: Tensor of (seqLength x batch x outputDim) containing output from network + labels: 1 dimensional Tensor containing all the targets of the batch in one sequence + act_lens: Tensor of size (batch) containing size of each output sequence from the network + act_lens: Tensor of (batch) containing label length of each example + """ + assert len(labels.size()) == 1 # labels must be 1 dimensional + from torch.nn.modules.loss import _assert_no_grad + _assert_no_grad(labels) + _assert_no_grad(act_lens) + _assert_no_grad(label_lens) + return _ChainerLikeCTC()(acts, labels, act_lens, label_lens) + + + class CTC(torch.nn.Module): def __init__(self, odim, eprojs, dropout_rate): super(CTC, self).__init__() self.dropout_rate = dropout_rate self.loss = None self.ctc_lo = torch.nn.Linear(eprojs, odim) - from warpctc_pytorch import CTCLoss - self.loss_fn = CTCLoss() + self.loss_fn = chainer_like_ctc_loss # CTCLoss() def forward(self, hpad, ilens, ys): ''' @@ -328,8 +354,7 @@ def forward(self, hpad, ilens, ys): # get ctc loss # expected shape of seqLength x batchSize x alphabet_size y_hat = y_hat.transpose(0, 1) - - self.loss = to_cuda(self, self.loss_fn(y_hat, y_true, ilens, olens)) / len(ys) + self.loss = to_cuda(self, self.loss_fn(y_hat, y_true, ilens, olens)) logging.info('ctc loss:' + str(self.loss.data[0])) return self.loss @@ -341,7 +366,7 @@ def mask_by_length(xs, length, fill=0): for i, l in enumerate(length): ret[i, :l] = xs[i, :l] return ret - + # ------------- Attention Network -------------------------------------------------------------------------------------- # dot product based attention @@ -378,7 +403,7 @@ def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0): batch = enc_hs_pad.size(0) # pre-compute all h outside the decoder loop if self.pre_compute_enc_h is None: - self.enc_h = mask_by_length(enc_hs_pad, enc_hs_len) # utt x frame x hdim + self.enc_h = enc_hs_pad # utt x frame x hdim self.h_length = self.enc_h.shape[1] # utt x frame x att_dim self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h) @@ -517,6 +542,7 @@ def forward(self, hpad, hlen, ys): :param ys: :return: ''' + hpad = mask_by_length(hpad, hlen, 0) self.loss = None # prepare input 
and output word sequences with sos/eos IDs eos = Variable(ys[0].data.new([self.eos])) @@ -558,10 +584,8 @@ def forward(self, hpad, hlen, ys): z_all = torch.stack(z_all, dim=1).view(batch * olength, self.dunits) # compute loss y_all = self.output(z_all) - # NOTE: use size_average=True? self.loss = torch.nn.functional.cross_entropy(y_all, pad_ys_out.view(-1), ignore_index=self.ignore_id, size_average=True) - # NOTE: is this length-scaling required? # -1: eos, which is removed in the loss computation self.loss *= (np.mean([len(x) for x in ys_in]) - 1) # acc = F.accuracy(y_all, F.concat(pad_ys_out, axis=0), ignore_label=-1) @@ -836,7 +860,7 @@ def forward(self, xpad, ilens): ypad = ypad[:, ::sub] ilens = [(i + 1) // sub for i in ilens] projected = getattr(self, 'bt' + str(layer))(ypad.contiguous().view(-1, ypad.size(2))) # (sum _utt frame_utt) x dim - xpad = projected.view(ypad.size(0), ypad.size(1), -1) + xpad = torch.tanh(projected.view(ypad.size(0), ypad.size(1), -1)) del hy, cy return xpad, ilens # x: utt list of frame x dim diff --git a/test/test_e2e_model.py b/test/test_e2e_model.py index 2ced58c04b4..de0af671e0d 100644 --- a/test/test_e2e_model.py +++ b/test/test_e2e_model.py @@ -9,6 +9,7 @@ import pytest import numpy +import chainer args = argparse.Namespace( @@ -29,7 +30,7 @@ penalty=0.5, maxlenratio=1.0, minlenratio=0.0, - verbose = True, + verbose = 2, char_list = [u"あ", u"い", u"う", u"え", u"お"], outdir = None ) @@ -57,3 +58,72 @@ def test_model_trainable_and_decodable(): in_data = data[0][1]["feat"] y = model.predictor.recognize(in_data, args, args.char_list) # decodable + + +def init_torch_weight_const(m, val): + for p in m.parameters(): + if p.dim() > 1: + p.data.fill_(val) + + +def init_chainer_weight_const(m, val): + for p in m.params(): + if p.data.ndim > 1: + p.data[:] = val + + +class Model(chainer.Chain): + def __init__(self, n_in, n_out): + super(Model, self).__init__() + with self.init_scope(): + self.a = chainer.links.Linear(n_in, n_out) + + def __call__(self, x): + return self.a(x) + +# def test_encoder_mask_equal(): +if __name__ == "__main__": + import logging + logging.basicConfig(level=logging.DEBUG, format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s') + try: + import torch + except: + pytest.skip("pytorch is not installed") + import e2e_asr_attctc as ch + import e2e_asr_attctc_th as th + ch_model = ch.E2E(40, 5, args) + ch_model.cleargrads() + th_model = th.E2E(40, 5, args) + + const = 1e-4 + init_torch_weight_const(th_model, const) + init_chainer_weight_const(ch_model, const) + + out_data = "1 2 3 4" + data = [ + ("aaa", dict(feat=numpy.random.randn(200, 40).astype(numpy.float32), tokenid=out_data)), + ("bbb", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid=out_data)), + ("cc", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid=out_data)) + ] + + ch_ctc, ch_att, ch_acc = ch_model(data) + th_ctc, th_att, th_acc = th_model(data) + + # test masking + ch_ench = ch_model.att.pre_compute_enc_h.data + th_ench = th_model.att.pre_compute_enc_h.data.numpy() + numpy.testing.assert_equal(ch_ench == 0.0, th_ench == 0.0) + + # test loss with constant weights (1.0) and bias (0.0) except for foget-bias (1.0) + numpy.testing.assert_allclose(ch_ctc.data, th_ctc.data.numpy()) + numpy.testing.assert_allclose(ch_att.data, th_att.data.numpy()) + + # test grads + ch_ctc.backward() + th_ctc.backward() + numpy.testing.assert_allclose(ch_model.ctc.ctc_lo.W.grad, + th_model.ctc.ctc_lo.weight.grad.data.numpy(), 1e-7, 1e-8) + + 
numpy.testing.assert_allclose(ch_model.ctc.ctc_lo.W.grad, + th_model.ctc.ctc_lo.weight.grad.data.numpy(), 1e-7, 1e-8) + diff --git a/test/test_initialization.py b/test/test_initialization.py index 4618f5fdc87..a94c2acf457 100644 --- a/test/test_initialization.py +++ b/test/test_initialization.py @@ -64,8 +64,6 @@ def test_lecun_init_torch(): numpy.testing.assert_allclose(data.mean(), 0.0, 5e-2, 5e-2) numpy.testing.assert_allclose(data.var(), 1.0 / numpy.prod(data.shape[1:]), 5e-2, 5e-2) - - def test_lecun_init_chainer(): import e2e_asr_attctc as m @@ -89,4 +87,4 @@ def test_lecun_init_chainer(): else: numpy.testing.assert_allclose(data.mean(), 0.0, 5e-2, 5e-2) numpy.testing.assert_allclose(data.var(), 1.0 / numpy.prod(data.shape[1:]), 5e-2, 5e-2) - + From 517c95e8da893d8b6fec10307d5af45fdbbfcfeb Mon Sep 17 00:00:00 2001 From: ShigekiKarita Date: Thu, 21 Dec 2017 15:43:00 +0900 Subject: [PATCH 10/17] add more tests on optim / grad --- .gitignore | 3 ++ .travis.yml | 2 +- pytest.ini | 5 ++++ test/test_e2e_model.py | 16 +++++++---- test/test_optimizer.py | 62 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 pytest.ini create mode 100644 test/test_optimizer.py diff --git a/.gitignore b/.gitignore index 2fc17e926f1..333251ac0eb 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ egs/*/*/exp egs/*/*/fbank egs/*/*/stft *DS_Store + + +src/utils/kaldi_io_py.py \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 8c693adb209..5ff2c1e236b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ matrix: install: - pip install -U pip wheel - - pip install pytest hacking mock + - pip install pytest pytest-pythonpath hacking mock - pip install autopep8 # unable to install pytorch as https://github.com/pytorch/pytorch/issues/4178 # - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp27-cp27mu-linux_x86_64.whl; fi diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000000..50e4f411920 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +addopts = --verbose +testpaths = test +python_paths = src/nets src/utils src/bin + diff --git a/test/test_e2e_model.py b/test/test_e2e_model.py index de0af671e0d..b6f51c18e84 100644 --- a/test/test_e2e_model.py +++ b/test/test_e2e_model.py @@ -12,7 +12,8 @@ import chainer -args = argparse.Namespace( +def make_arg(etype): + return argparse.Namespace( elayers = 4, subsample = "1_2_2_1_1", etype = "vggblstmp", @@ -33,11 +34,12 @@ verbose = 2, char_list = [u"あ", u"い", u"う", u"え", u"お"], outdir = None -) + ) - -def test_model_trainable_and_decodable(): +@pytest.mark.parametrize("etype", ["blstmp", "vggblstmp"]) +def test_model_trainable_and_decodable(etype): + args = make_arg(etype) for m_str in ["e2e_asr_attctc", "e2e_asr_attctc_th"]: try: import torch @@ -81,8 +83,10 @@ def __init__(self, n_in, n_out): def __call__(self, x): return self.a(x) -# def test_encoder_mask_equal(): -if __name__ == "__main__": + +@pytest.mark.parametrize("etype", ["blstmp", "vggblstmp"]) +def test_loss_and_ctc_grad(etype): + args = make_arg(etype) import logging logging.basicConfig(level=logging.DEBUG, format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s') try: diff --git a/test/test_optimizer.py b/test/test_optimizer.py new file mode 100644 index 00000000000..5a97020125e --- /dev/null +++ b/test/test_optimizer.py @@ -0,0 +1,62 @@ +import pytest +import chainer +import numpy +try: + import torch +except 
ImportError: + pytest.skip("torch not installed") + + +class ChModel(chainer.Chain): + def __init__(self): + super(ChModel, self).__init__() + with self.init_scope(): + self.a = chainer.links.Linear(3, 1) + + def __call__(self, x): + return chainer.functions.sum(self.a(x)) + + +class ThModel(torch.nn.Module): + def __init__(self): + super(ThModel, self).__init__() + self.a = torch.nn.Linear(3, 1) + + def forward(self, x): + return self.a(x).sum() + + +@pytest.mark.parametrize("ch_opt_t,th_opt_t", [ + (chainer.optimizers.SGD, lambda ps: torch.optim.SGD(ps, lr=0.01)), + (chainer.optimizers.Adam, torch.optim.Adam), + (chainer.optimizers.AdaDelta, lambda ps: torch.optim.Adadelta(ps, rho=0.95)) +]) +def test_optimizer(ch_opt_t, th_opt_t): + # model construction + ch_model = ChModel() + th_model = ThModel() + + # copy params + th_model.a.weight.data = torch.from_numpy(numpy.copy(ch_model.a.W.data)) + th_model.a.bias.data = torch.from_numpy(numpy.copy(ch_model.a.b.data)) + + # optimizer setup + ch_opt = ch_opt_t() + ch_opt.setup(ch_model) + th_opt = th_opt_t(th_model.parameters()) + + # forward + ch_model.cleargrads() + data = numpy.random.randn(2, 3).astype(numpy.float32) + v = chainer.Variable(data) + ch_loss = ch_model(data) + th_loss = th_model(torch.autograd.Variable(torch.from_numpy(data))) + numpy.testing.assert_allclose(ch_loss.data, th_loss.data.numpy()) + + chainer.functions.sum(ch_loss).backward() + th_loss.backward() + ch_opt.update() + th_opt.step() + numpy.testing.assert_allclose(ch_model.a.W.data, th_model.a.weight.data.numpy()) + numpy.testing.assert_allclose(ch_model.a.b.data, th_model.a.bias.data.numpy()) + From 6b0b4a81779efe59352b3b4d3fb1fd6e390b1e4e Mon Sep 17 00:00:00 2001 From: ShigekiKarita Date: Thu, 21 Dec 2017 15:55:33 +0900 Subject: [PATCH 11/17] add more tests on grad --- test/test_e2e_model.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/test/test_e2e_model.py b/test/test_e2e_model.py index b6f51c18e84..58628926ec6 100644 --- a/test/test_e2e_model.py +++ b/test/test_e2e_model.py @@ -122,12 +122,26 @@ def test_loss_and_ctc_grad(etype): numpy.testing.assert_allclose(ch_ctc.data, th_ctc.data.numpy()) numpy.testing.assert_allclose(ch_att.data, th_att.data.numpy()) - # test grads + # test ctc grads ch_ctc.backward() th_ctc.backward() numpy.testing.assert_allclose(ch_model.ctc.ctc_lo.W.grad, th_model.ctc.ctc_lo.weight.grad.data.numpy(), 1e-7, 1e-8) + numpy.testing.assert_allclose(ch_model.ctc.ctc_lo.b.grad, + th_model.ctc.ctc_lo.bias.grad.data.numpy(), 1e-5, 1e-6) + - numpy.testing.assert_allclose(ch_model.ctc.ctc_lo.W.grad, - th_model.ctc.ctc_lo.weight.grad.data.numpy(), 1e-7, 1e-8) + # test cross-entropy grads + ch_model.cleargrads() + th_model.zero_grad() + ch_ctc, ch_att, ch_acc = ch_model(data) + th_ctc, th_att, th_acc = th_model(data) + ch_att.backward() + th_att.backward() + numpy.testing.assert_allclose(ch_model.dec.output.W.grad, + th_model.dec.output.weight.grad.data.numpy(), 1e-7, 1e-8) + numpy.testing.assert_allclose(ch_model.dec.output.b.grad, + th_model.dec.output.bias.grad.data.numpy(), 1e-5, 1e-6) + + From 68f2c88283e7639fd4033f8e4a1f421190ba8248 Mon Sep 17 00:00:00 2001 From: ShigekiKarita Date: Thu, 21 Dec 2017 17:40:37 +0900 Subject: [PATCH 12/17] workaround for zero-length --- src/nets/e2e_asr_attctc_th.py | 8 +++--- test/test_e2e_model.py | 46 +++++++++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index 
b9412756962..bbaae9ed9ba 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -227,11 +227,13 @@ def forward(self, data): :return: ''' # utt list of frame x dim - xs = [i[1]['feat'] for i in data] - sorted_index = sorted(range(len(xs)), key=lambda i: -len(xs[i])) + xs = [d[1]['feat'] for d in data] + tids = [d[1]['tokenid'].split() for d in data] + filtered_index = filter(lambda i: len(tids[i]) > 0, range(len(xs))) + sorted_index = sorted(filtered_index, key=lambda i: -len(xs[i])) xs = [xs[i] for i in sorted_index] # utt list of olen - ys = [np.fromiter(map(int, data[i][1]['tokenid'].split()), dtype=np.int64) for i in sorted_index] + ys = [np.fromiter(map(int, tids[i]), dtype=np.int64) for i in sorted_index] ys = [to_cuda(self, Variable(torch.from_numpy(y))) for y in ys] # subsample frame diff --git a/test/test_e2e_model.py b/test/test_e2e_model.py index 58628926ec6..14271eac335 100644 --- a/test/test_e2e_model.py +++ b/test/test_e2e_model.py @@ -74,16 +74,6 @@ def init_chainer_weight_const(m, val): p.data[:] = val -class Model(chainer.Chain): - def __init__(self, n_in, n_out): - super(Model, self).__init__() - with self.init_scope(): - self.a = chainer.links.Linear(n_in, n_out) - - def __call__(self, x): - return self.a(x) - - @pytest.mark.parametrize("etype", ["blstmp", "vggblstmp"]) def test_loss_and_ctc_grad(etype): args = make_arg(etype) @@ -145,3 +135,39 @@ def test_loss_and_ctc_grad(etype): th_model.dec.output.bias.grad.data.numpy(), 1e-5, 1e-6) + + +@pytest.mark.parametrize("etype", ["blstmp", "vggblstmp"]) +def test_zero_length_target(etype): + args = make_arg(etype) + import logging + logging.basicConfig(level=logging.DEBUG, format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s') + try: + import torch + except: + pytest.skip("pytorch is not installed") + import e2e_asr_attctc as ch + import e2e_asr_attctc_th as th + ch_model = ch.E2E(40, 5, args) + ch_model.cleargrads() + th_model = th.E2E(40, 5, args) + + out_data = "" + data = [ + ("aaa", dict(feat=numpy.random.randn(200, 40).astype(numpy.float32), tokenid="1")), + ("bbb", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid="")), + ("cc", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid="1 2")) + ] + + ch_ctc, ch_att, ch_acc = ch_model(data) + th_ctc, th_att, th_acc = th_model(data) + + # NOTE: We ignore all zero length case because chainer also fails. Have a nice data-prep! 
+ # out_data = "" + # data = [ + # ("aaa", dict(feat=numpy.random.randn(200, 40).astype(numpy.float32), tokenid="")), + # ("bbb", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid="")), + # ("cc", dict(feat=numpy.random.randn(100, 40).astype(numpy.float32), tokenid="")) + # ] + # ch_ctc, ch_att, ch_acc = ch_model(data) + # th_ctc, th_att, th_acc = th_model(data) From d2fdf0a5ffa6ca1e7ed1cbba9f66435fa1530ee8 Mon Sep 17 00:00:00 2001 From: ShigekiKarita Date: Thu, 21 Dec 2017 19:08:06 +0900 Subject: [PATCH 13/17] fix pytest to import torch --- test/test_optimizer.py | 6 ++---- test/test_torch.py | 9 ++------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/test/test_optimizer.py b/test/test_optimizer.py index 5a97020125e..f79df743961 100644 --- a/test/test_optimizer.py +++ b/test/test_optimizer.py @@ -1,10 +1,8 @@ import pytest import chainer import numpy -try: - import torch -except ImportError: - pytest.skip("torch not installed") +pytest.importorskip('torch') +import torch class ChModel(chainer.Chain): diff --git a/test/test_torch.py b/test/test_torch.py index 1f41e62aa63..60c51e70655 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1,11 +1,6 @@ import pytest - -try: - import torch - from torch.autograd import Variable -except: - pytest.skip("pytorch is not installed") - +pytest.importorskip('torch') +import torch from e2e_asr_attctc_th import pad_list, mask_by_length From 11c14d4fd84df9fc0d786f39234b9bd6c154b417 Mon Sep 17 00:00:00 2001 From: ShigekiKarita Date: Thu, 21 Dec 2017 19:13:12 +0900 Subject: [PATCH 14/17] fix .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5ff2c1e236b..164f6f13266 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ install: # - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl; fi # - cd tools && make warp-ctc && cd - - grep -v cupy tools/requirements.txt | pip install -r /dev/stdin - - cd tools && git clone https://github.com/vesis84/kaldi-io-for-python.git && cd - + - cd tools && make kaldi-io-for-python.git && cd - script: From 696a3cce8ae745e0a32b5418299ce42b43d5c4c3 Mon Sep 17 00:00:00 2001 From: ShigekiKarita Date: Fri, 22 Dec 2017 11:18:16 +0900 Subject: [PATCH 15/17] reflect reviews from kan-bayashi --- src/nets/e2e_asr_attctc_th.py | 13 +++++++------ test/test_torch.py | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index bbaae9ed9ba..36ea2142271 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -408,7 +408,7 @@ def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0): self.enc_h = enc_hs_pad # utt x frame x hdim self.h_length = self.enc_h.shape[1] # utt x frame x att_dim - self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h) + self.pre_compute_enc_h = torch.tanh(linear_tensor(self.mlp_enc, self.enc_h)) if dec_z is None: dec_z = Variable(enc_hs_pad.data.new(batch, self.dunits).zero_()) @@ -469,7 +469,7 @@ def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0): self.enc_h = enc_hs_pad # utt x frame x hdim self.h_length = self.enc_h.shape[1] # utt x frame x att_dim - self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h) + self.pre_compute_enc_h = torch.tanh(linear_tensor(self.mlp_enc, self.enc_h)) if dec_z is None: dec_z = Variable(enc_hs_pad.data.new(batch, 
self.dunits).zero_()) @@ -509,10 +509,11 @@ def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0): def th_accuracy(y_all, pad_target, ignore_label): - acc = 0 pad_pred = y_all.data.view(pad_target.size(0), pad_target.size(1), y_all.size(1)).max(2)[1] mask = pad_target.data != ignore_label - return torch.sum(pad_pred.masked_select(mask) == pad_target.data.masked_select(mask)) / torch.sum(mask) + numerator = torch.sum(pad_pred.masked_select(mask) == pad_target.data.masked_select(mask)) + denominator = torch.sum(mask) + return float(numerator) / float(denominator) # ------------- Decoder Network ---------------------------------------------------------------------------------------- @@ -601,8 +602,8 @@ def forward(self, hpad, hlen, ys): for (i, y_hat_), y_true_ in zip(enumerate(y_hat.data.cpu().numpy()), y_true.data.cpu().numpy()): if i == MAX_DECODER_OUTPUT: break - idx_hat = np.argmax(y_hat_[y_true_ != -1], axis=1) - idx_true = y_true_[y_true_ != -1] + idx_hat = np.argmax(y_hat_[y_true_ != self.ignore_id], axis=1) + idx_true = y_true_[y_true_ != self.ignore_id] seq_hat = [self.char_list[int(idx)] for idx in idx_hat] seq_true = [self.char_list[int(idx)] for idx in idx_true] seq_hat = "".join(seq_hat) diff --git a/test/test_torch.py b/test/test_torch.py index 60c51e70655..98a2f37e7e1 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1,6 +1,7 @@ import pytest pytest.importorskip('torch') import torch +from torch.autograd import Variable from e2e_asr_attctc_th import pad_list, mask_by_length From b4129da00f288c571e5a937e334a0a6549f4c08c Mon Sep 17 00:00:00 2001 From: ShigekiKarita Date: Fri, 22 Dec 2017 13:40:22 +0900 Subject: [PATCH 16/17] revert attloc tanh --- src/nets/e2e_asr_attctc_th.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nets/e2e_asr_attctc_th.py b/src/nets/e2e_asr_attctc_th.py index 36ea2142271..c00136663db 100644 --- a/src/nets/e2e_asr_attctc_th.py +++ b/src/nets/e2e_asr_attctc_th.py @@ -469,7 +469,7 @@ def forward(self, enc_hs_pad, enc_hs_len, dec_z, att_prev, scaling=2.0): self.enc_h = enc_hs_pad # utt x frame x hdim self.h_length = self.enc_h.shape[1] # utt x frame x att_dim - self.pre_compute_enc_h = torch.tanh(linear_tensor(self.mlp_enc, self.enc_h)) + self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h) if dec_z is None: dec_z = Variable(enc_hs_pad.data.new(batch, self.dunits).zero_()) From e3a19ab57ce5bfeb90da413f36965f6fc540f6e1 Mon Sep 17 00:00:00 2001 From: ShigekiKarita Date: Fri, 22 Dec 2017 15:03:41 +0900 Subject: [PATCH 17/17] update Makefile --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index c518e234737..ee81ac49cf6 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,7 +1,7 @@ .PHONY: all clean -all: kaldi venv venv/bin/activate nkf kaldi-io-for-python.git +all: kaldi venv venv/bin/activate nkf kaldi-io-for-python.git pytorch warp-ctc kaldi-io-for-python.git: git clone https://github.com/vesis84/kaldi-io-for-python.git
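
For reference, the "workaround for zero-length" change in PATCH 12 is easier to follow outside the hunk: utterances whose target token string is empty are dropped from the minibatch before the batch is sorted by descending input length (as padded/packed RNN inputs require), and token-id strings are converted to int64 arrays only for the surviving utterances. The following is a minimal, framework-free sketch of that logic; the helper name filter_and_sort_batch is illustrative and not part of the repository.

    # Sketch of the zero-length-target filtering introduced in PATCH 12 (illustrative names).
    import numpy as np

    def filter_and_sort_batch(data):
        """Drop utterances with empty targets, then sort by descending input length."""
        xs = [d[1]['feat'] for d in data]               # utt list of frame x dim
        tids = [d[1]['tokenid'].split() for d in data]  # utt list of token-id strings
        # keep only utterances that still have at least one target token
        filtered_index = [i for i in range(len(xs)) if len(tids[i]) > 0]
        # longest input first, as required for packed/padded RNN batches
        sorted_index = sorted(filtered_index, key=lambda i: -len(xs[i]))
        xs = [xs[i] for i in sorted_index]
        ys = [np.fromiter(map(int, tids[i]), dtype=np.int64) for i in sorted_index]
        return xs, ys

    # toy batch mirroring test_zero_length_target: "bbb" has an empty target and is dropped
    data = [
        ("aaa", dict(feat=np.random.randn(200, 40).astype(np.float32), tokenid="1")),
        ("bbb", dict(feat=np.random.randn(100, 40).astype(np.float32), tokenid="")),
        ("cc", dict(feat=np.random.randn(100, 40).astype(np.float32), tokenid="1 2")),
    ]
    xs, ys = filter_and_sort_batch(data)
    assert len(xs) == 2 and [len(y) for y in ys] == [1, 2]

As the NOTE in test_zero_length_target indicates, a batch whose targets are all empty is still left to data preparation; only partially empty batches are handled by this filtering.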