Add pretrained models: JSUT and LibriTTS #1260

Merged · 2 commits · Oct 11, 2019
16 changes: 10 additions & 6 deletions README.md
@@ -284,7 +284,7 @@ Instead of giving arguments directly, we recommend you to modify the yaml file a
We also provide a utility to generate a yaml file from the input yaml file:

```bash
# e.g. You can give any parameters as '-a key=value' and '-a' is repeatable.
# This generates new file at 'conf/train_batch-size24_epochs10.yaml'
./run.sh --train-config $(change_yaml.py conf/train.yaml -a batch-size=24 -a epochs=10)
# e.g. The '-o' option specifies the output file name instead of an auto-generated one.
```
@@ -294,14 +294,14 @@ We also provide a utility to generate a yaml file from the input yaml file:
### How to set the minibatch size

Since espnet v0.4.0, we have three options in `--batch-count` to specify the minibatch size (see `espnet.utils.batchfy` for the implementation):
1. `--batch-count seq --batch-seqs 32 --batch-seq-maxlen-in 800 --batch-seq-maxlen-out 150`.

    This option is compatible with the old setting used before v0.4.0. It counts the minibatch size as the number of sequences and reduces that size when the maximum length of the input or output sequences exceeds 800 or 150, respectively.
1. `--batch-count bin --batch-bins 100000`.

    This creates the minibatch that has the maximum number of bins under 100,000 in the padded input/output minibatch tensor (i.e., `max(ilen) * idim + max(olen) * odim`).
    Basically, this option makes the training iterations faster than `--batch-count seq`. If you already have the best `--batch-seqs x` config, try `--batch-bins $((x * (mean(ilen) * idim + mean(olen) * odim)))` (see the sketch after this list).
1. `--batch-count frame --batch-frames-in 800 --batch-frames-out 100 --batch-frames-inout 900`.

    This creates the minibatch that keeps the number of input, output, and input+output frames under 800, 100, and 900, respectively. You can also set only a subset of the `--batch-frames-xxx` options. Like `--batch-bins`, this option makes the training iterations faster than `--batch-count seq`. If you already have the best `--batch-seqs x` config, try `--batch-frames-in $((x * mean(ilen) * idim)) --batch-frames-out $((x * mean(olen) * odim))`.
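
As a worked example of the `--batch-bins` conversion above, here is a minimal bash sketch; the values for `x`, the mean lengths, and the feature dimensions are made-up placeholders, so substitute the statistics of your own data:

```bash
# Hypothetical statistics: 32 sequences per batch, inputs averaging 500 frames
# of 83-dim features, outputs averaging 100 tokens of 52-dim targets.
x=32; mean_ilen=500; idim=83; mean_olen=100; odim=52

# Equivalent bin budget for "--batch-count bin"
batch_bins=$((x * (mean_ilen * idim + mean_olen * odim)))
echo "--batch-count bin --batch-bins ${batch_bins}"   # prints: --batch-count bin --batch-bins 1494400
```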

@@ -464,8 +464,10 @@ Available pretrained models are listed as follows:
| [ljspeech.fastspeech.v1](https://drive.google.com/open?id=17RUNFLP4SSTbGA01xWRJo7RkR876xM0i) | Feed-forward Transformer with position-wise FFN |
| [ljspeech.fastspeech.v2](https://drive.google.com/open?id=1zD-2GMrWM3thaDpS3h3rkTU4jIC0wc5B) | Feed-forward Transformer with CNN instead of position-wise FFN |
| [libritts.transformer.v1 (New!)](https://drive.google.com/open?id=1Xj73mDPuuPH8GsyNO8GnOC3mn0_OK4g3) | Multi-speaker Transformer with reduction factor = 2 |
| [jsut.tacotron2.v1 (New!)](https://drive.google.com/open?id=1kp5M4VvmagDmYckFJa78WGqh1drb_P9t) | Tacotron 2 with reduction factor = 2, phoneme input |
| [jsut.transformer.v1 (New!)](https://drive.google.com/open?id=1mEnZfBKqA4eT6Bn0eRZuP6lNzL-IL3VD) | Transformer with reduction factor = 3, phoneme input |
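
For example, the new JSUT Transformer can be tried with `synth_wav.sh` roughly as follows; the `--models` and `--trans_type` flag names assume the script's variables are exposed via `utils/parse_options.sh`, and the input file is a placeholder:

```bash
# Hypothetical invocation of the new phoneme-input JSUT model
echo "これはテストです。" > example_jp.txt
../../../utils/synth_wav.sh --models jsut.transformer.v1 --trans_type phn example_jp.txt
```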

Waveform synthesis is performed with the Griffin-Lim algorithm by default, but we also support a pretrained WaveNet vocoder.
You can try it by extending the `stop_stage` as follows:
```
../../../utils/synth_wav.sh --stop_stage 4 example.txt
```
@@ -481,6 +483,8 @@ Available pretrained vocoder models are listed as follows:
|:------|:------|
| [ljspeech.wavenet.softmax.ns.v1](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | 8 bit Softmax WaveNet w/ noise shaping trained by [kan-bayashi/PytorchWaveNetVocoder](https://github.com/kan-bayashi/PytorchWaveNetVocoder) |
| [ljspeech.wavenet.mol.v1](https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t) | 16 bit MoL WaveNet trained by [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder) |
| [jsut.wavenet.mol.v1](https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK) | 16 bit MoL WaveNet trained by [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder) |
| [libritts.wavenet.mol.v1](https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h) | 16 bit MoL WaveNet trained by [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder) |
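
Selecting one of these instead of Griffin-Lim should look roughly like the following, assuming the script's `vocoder_models` variable is likewise exposed as a flag (see `download_vocoder_models` in `utils/synth_wav.sh` below):

```bash
# Hypothetical: run through stage 4 with the 16-bit MoL WaveNet vocoder
../../../utils/synth_wav.sh --stop_stage 4 --vocoder_models ljspeech.wavenet.mol.v1 example.txt
```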

If you want to build your own WaveNet vocoder, please check [kan-bayashi/PytorchWaveNetVocoder](https://github.com/kan-bayashi/PytorchWaveNetVocoder) or [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder).

9 changes: 7 additions & 2 deletions utils/synth_wav.sh
@@ -31,6 +31,7 @@ cmvn=

# dictionary related
dict=
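# token type passed to data2json.sh ("char" or "phn"); phoneme-input models such as the JSUT ones presumably need "phn"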
trans_type="char"

# embedding related
input_wav=
@@ -105,6 +106,8 @@ function download_models () {
"ljspeech.fastspeech.v1") share_url="https://drive.google.com/open?id=17RUNFLP4SSTbGA01xWRJo7RkR876xM0i" ;;
"ljspeech.fastspeech.v2") share_url="https://drive.google.com/open?id=1zD-2GMrWM3thaDpS3h3rkTU4jIC0wc5B";;
"libritts.transformer.v1") share_url="https://drive.google.com/open?id=1Xj73mDPuuPH8GsyNO8GnOC3mn0_OK4g3";;
"jsut.transformer.v1") share_url="https://drive.google.com/open?id=1mEnZfBKqA4eT6Bn0eRZuP6lNzL-IL3VD" ;;
"jsut.tacotron2.v1") share_url="https://drive.google.com/open?id=1kp5M4VvmagDmYckFJa78WGqh1drb_P9t" ;;
*) echo "No such models: ${models}"; exit 1 ;;
esac

@@ -120,6 +123,8 @@ function download_vocoder_models () {
case "${vocoder_models}" in
"ljspeech.wavenet.softmax.ns.v1") share_url="https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L";;
"ljspeech.wavenet.mol.v1") share_url="https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t";;
"jsut.wavenet.mol.v1") share_url="https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK";;
"libritts.wavenet.mol.v1") share_url="https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h";;
*) echo "No such models: ${vocoder_models}"; exit 1 ;;
esac

@@ -202,7 +207,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cat $txt >> ${decode_dir}/data/text

mkdir -p ${decode_dir}/dump
-data2json.sh ${decode_dir}/data ${dict} > ${decode_dir}/dump/data.json
+data2json.sh --trans_type ${trans_type} ${decode_dir}/data ${dict} > ${decode_dir}/dump/data.json
fi
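
# Hypothetical end-to-end call for the new JSUT models (assuming parse_options.sh
# maps the variables above to command-line flags):
#   ../../../utils/synth_wav.sh --models jsut.transformer.v1 --trans_type phn example.txt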

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ] && ${use_input_wav}; then
@@ -292,7 +297,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
dst_dir=${decode_dir}/wav_wnv

# This is hardcoded for now.
-if [ ${vocoder_models} == "ljspeech.wavenet.mol.v1" ]; then
+if [[ ${vocoder_models} == *".mol."* ]]; then
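# match every MoL vocoder (ljspeech/jsut/libritts *.wavenet.mol.v1), not only the LJSpeech one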
# Needs to use https://github.com/r9y9/wavenet_vocoder
# that supports mixture of logistics/gaussians
MDN_WAVENET_VOC_DIR=./local/r9y9_wavenet_vocoder