How2 speech translation recipe #1102

Merged (19 commits, Aug 20, 2019)
Changes from 10 commits
1 change: 1 addition & 0 deletions egs/README.md
@@ -15,6 +15,7 @@
| fisher_callhome_spanish | Fisher and CALLHOME Spanish--English Speech Translation | ASR/Machine Translation/Speech Translation | ES->EN | https://catalog.ldc.upenn.edu/LDC2014T23 | |
| fisher_swbd | Fisher English Training Speech, Switchboard-1 Release 2 | ASR | EN | https://catalog.ldc.upenn.edu/LDC2004S13, https://catalog.ldc.upenn.edu/LDC2005S13, https://catalog.ldc.upenn.edu/LDC97S62 | |
| hkust | HKUST Mandarin Telephone Speech | ASR | ZH | https://catalog.ldc.upenn.edu/LDC2005S15, https://catalog.ldc.upenn.edu/LDC2005T32 | |
| how2 | How2: A Large-scale Dataset for Multimodal Language Understanding | ASR/Machine Translation/Speech Translation | EN->PT | https://github.com/srvk/how2-dataset | |
| hub4_spanish | 1997 Spanish Broadcast News Speech (HUB4-NE) | ASR | ES | https://catalog.ldc.upenn.edu/LDC98S74, https://catalog.ldc.upenn.edu/LDC98T29 | |
| iwslt18 | International Workshop on Spoken Language Translation 2018 | ASR/Machine Translation/Speech Translation | EN->DE | https://sites.google.com/site/iwsltevaluation2018/Lectures-task | |
| jnas | ASJ Japanese Newspaper Article Sentences Read Speech Corpus (JNAS) | ASR | JP | http://research.nii.ac.jp/src/JNAS.html | |
89 changes: 89 additions & 0 deletions egs/how2/asr1/cmd.sh
@@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The string to the left of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and the log file name,
# e.g. "echo JOB" becomes "echo 3" for the 3rd job and "echo 8" for the 8th job, respectively.
# Note that the index must start from a positive number, so you can't use "JOB=0:10", for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, as configured by
# "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration may not match your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'

# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then

# Used for CPU jobs such as data preparation and feature extraction
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"

# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" to the "queue" appropriate for your environment.
# To see the "queue" names, type "qhost -q".
# Note that to use "--gpu *", you have to set up a "complex_value" for the system scheduler.

export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" to the "partition" names for your environment.
# To see the "partition" names, type "sinfo".
# You can use "--gpu *" by default for Slurm; it is interpreted as "--gres gpu:*".
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the host to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# This assumes you can log in to them without a password, i.e., you have to set up SSH keys.

export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"

# This is an example of specifying options specific to the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environment.
elif [ "${cmd_backend}" = jhu ]; then

export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"

else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
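
As a usage sketch (the log paths, data paths, and the feature-extraction call below are illustrative assumptions, not part of this recipe), a recipe script would invoke the exported commands like this:

    # Array job: JOB expands to 1..10 in both the log name and the command.
    ${train_cmd} JOB=1:10 exp/make_fbank/make_fbank.JOB.log \
        compute-fbank-feats scp:data/train/wav.JOB.scp ark:fbank.JOB.ark

    # GPU job: the unified --gpu option is translated per backend.
    ${cuda_cmd} --gpu 1 exp/train/train.log \
        asr_train.py --config conf/train.yaml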
1 change: 1 addition & 0 deletions egs/how2/asr1/conf/decode.yaml
10 changes: 10 additions & 0 deletions egs/how2/asr1/conf/gpu.conf
@@ -0,0 +1,10 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q
1 change: 1 addition & 0 deletions egs/how2/asr1/conf/lm.yaml
10 changes: 10 additions & 0 deletions egs/how2/asr1/conf/queue.conf
@@ -0,0 +1,10 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q
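
To sketch how this table is expanded (my reading of the standard Kaldi config format, which conf/gpu.conf above also uses), a call such as

    queue.pl --mem 4G --gpu 1 JOB=1:4 exp/decode/decode.JOB.log ...

matches the "mem=*" and "gpu=*" option lines, substitutes the given values for $0, and submits approximately

    qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -l mem_free=4G,ram_free=4G -l gpu=1 -q g.q ...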
12 changes: 12 additions & 0 deletions egs/how2/asr1/conf/slurm.conf
@@ -0,0 +1,12 @@
# Default configuration
command sbatch --export=PATH --ntasks-per-node=1
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
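
Under the same reading, slurm.pl with "--gpu 1 --num-threads 4" would match the "gpu=*" and "num_threads=*" lines and submit approximately

    sbatch --export=PATH --ntasks-per-node=1 --cpus-per-task 4 --ntasks-per-node=1 -p gpu --gres=gpu:1 ...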
1 change: 1 addition & 0 deletions egs/how2/asr1/conf/train.yaml
7 changes: 7 additions & 0 deletions egs/how2/asr1/conf/tuning/decode_pytorch_transformer.yaml
@@ -0,0 +1,7 @@
batchsize: 0
Contributor: It seems that the decoding configs of RNN and transformer are very different. If you don't test either of them, you can remove that config.

Collaborator (Author): I want to keep this as it is.

beam-size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.5
lm-weight: 0.7
6 changes: 6 additions & 0 deletions egs/how2/asr1/conf/tuning/decode_rnn.yaml
@@ -0,0 +1,6 @@
lm-weight: 0.3
beam-size: 20
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.3
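
For context on how ctc-weight and lm-weight enter decoding: in hybrid CTC/attention beam search the hypothesis score is a log-linear combination (the standard formulation from the hybrid CTC/attention literature; this summary is mine, not part of the diff):

    score(Y) = (1 - ctc_weight) * log p_att(Y|X) + ctc_weight * log p_ctc(Y|X) + lm_weight * log p_lm(Y)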
8 changes: 8 additions & 0 deletions egs/how2/asr1/conf/tuning/lm.yaml
@@ -0,0 +1,8 @@
layer: 2
unit: 1024
opt: adam # or sgd
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
batchsize: 256 # batch size in LM training
epoch: 60 # if the data size is large, we can reduce this
patience: 3
maxlen: 150 # if sentence length > lm_maxlen, lm_batchsize is automatically reduced
40 changes: 40 additions & 0 deletions egs/how2/asr1/conf/tuning/train_pytorch_transformer.yaml
@@ -0,0 +1,40 @@
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 100
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E"
transformer-input-layer: conv2d # encoder architecture type
transformer-lr: 10.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch
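
For context, the noam optimizer above anneals the learning rate with a warmup phase; assuming the standard schedule from "Attention Is All You Need", transformer-lr, adim, and transformer-warmup-steps enter as

    lr(step) = transformer_lr * adim^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))

so the rate rises linearly for the first 25000 steps and then decays as the inverse square root of the step number.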
39 changes: 39 additions & 0 deletions egs/how2/asr1/conf/tuning/train_rnn.yaml
@@ -0,0 +1,39 @@
# network architecture
# encoder related
etype: vggblstm # encoder architecture type
elayers: 5
eunits: 1024
eprojs: 1024
subsample: "1_2_2_1_1" # skip every n frames from the input to the nth layer
# decoder related
dlayers: 2
dunits: 1024
context-residual: true
# attention related
atype: location
adim: 1024
aconv-chans: 10
aconv-filts: 100

# hybrid CTC/attention
mtlalpha: 0.5

# minibatch related
batch-size: 20
maxlen-in: 800 # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# other config
dropout-rate: 0.3
dropout-rate-decoder: 0.0
lsm-weight: 0.1
weight-decay: 0

# optimization related
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
opt: adadelta
epochs: 15
patience: 3

# scheduled sampling option
sampling-probability: 0.0
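
As a note on mtlalpha (0.5 here, 0.3 in the transformer config above): in hybrid CTC/attention training it weights the multitask loss as

    L = mtlalpha * L_ctc + (1 - mtlalpha) * L_att

which is the standard formulation; this summary is mine, not part of the diff.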
1 change: 1 addition & 0 deletions egs/how2/asr1/local
30 changes: 30 additions & 0 deletions egs/how2/asr1/path.sh
@@ -0,0 +1,30 @@
MAIN_ROOT=$PWD/../../..
KALDI_ROOT=$MAIN_ROOT/tools/kaldi

[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C

export PATH=$MAIN_ROOT/tools/moses/scripts/tokenizer/:$MAIN_ROOT/tools/moses/scripts/generic/:$PATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build
if [ -e $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh ]; then
source $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate
else
source $MAIN_ROOT/tools/venv/bin/activate
fi
export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH

export OMP_NUM_THREADS=1

# check extra module installation
if ! which tokenizer.perl > /dev/null; then
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
return 1
fi

# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
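
As a usage note (standard ESPnet recipe convention rather than part of this diff), run.sh sources both helper scripts at the top, so the backend selection and environment checks above take effect before any stage runs:

    . ./path.sh   # set PATH, activate the virtualenv, check moses
    . ./cmd.sh    # select the job scheduler backend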