Add LibriTTS-R recipe (#5379)

* Add LibriTTS-R recipe * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tomoki Hayashi <hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp>
espnet · Jul 29, 2023 · 216b664 · 216b664
1 parent 0cd51bb
commit 216b664
Show file tree

Hide file tree

Showing 36 changed files with 1,353 additions and 1 deletion.
diff --git a/egs2/README.md b/egs2/README.md
@@ -86,6 +86,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | librispeech             | LibriSpeech ASR corpus                                                                                                           | ASR                     | ENG                  | http://www.openslr.org/12                                                                                    |              |
 | librispeech_100         | LibriSpeech ASR corpus 100h subset                                                                                               | ASR                     | ENG                  | http://www.openslr.org/12                                                                                    |              |
 | libritts                | LibriTTS corpus                                                                                                                  | TTS                     | ENG                  | http://www.openslr.org/60                                                                                    |              |
+| libritts_r              | LibriTTS-R corpus                                                                                                                | TTS                     | ENG                  | http://www.openslr.org/141                                                                                   |              |
 | ljspeech                | The LJ Speech Dataset                                                                                                            | TTS                     | ENG                  | https://keithito.com/LJ-Speech-Dataset/                                                                      |              |
 | lrs2                    | The Oxford-BBC Lip Reading Sentences 2 (LRS2) Dataset                                                                            | Lipreading/ASR          | ENG                  | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html                                                  |              |
 | lrs3                    | The Oxford-BBC Lip Reading Sentences 3 (LRS3) Dataset                                                                            | ASR                     | ENG                  | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html                                                  |              |

diff --git a/egs2/TEMPLATE/asr1/db.sh b/egs2/TEMPLATE/asr1/db.sh
@@ -66,7 +66,8 @@ MISP2021=
 MLSUPERB=
 L3DAS22=
 LIBRIMIX=downloads
-LIBRITTS=
+LIBRITTS=downloads
+LIBRITTS_R=downloads
 LJSPEECH=downloads
 MUSAN=
 MUSDB18=downloads

diff --git a/egs2/TEMPLATE/asr1/utils/filter_scps.pl b/egs2/TEMPLATE/asr1/utils/filter_scps.pl
diff --git a/egs2/libritts_r/tts1/README.md b/egs2/libritts_r/tts1/README.md
@@ -0,0 +1,22 @@
+# LIBRITTS-R RECIPE
+
+This is the recipe of the English multi-speaker TTS model with [LibriTTS-R](http://www.openslr.org/141) corpus.
+
+See the following pages for the usage:
+- [How to run the recipe](../../TEMPLATE/tts1/README.md#how-to-run)
+- [How to train FastSpeech](../../TEMPLATE/tts1/README.md#fastspeech-training)
+- [How to train FastSpeech2](../../TEMPLATE/tts1/README.md#fastspeech2-training)
+- [How to train with X-vector](../../TEMPLATE/tts1/README.md#multi-speaker-model-with-x-vector-training)
+- [How to train with speaker ID](../../TEMPLATE/tts1/README.md#multi-speaker-model-with-speaker-id-embedding-training)
+- [How to train VITS](../../TEMPLATE/tts1/README.md#vits-training)
+- [How to train joint text2wav](../../TEMPLATE/tts1/README.md#joint-text2wav-training)
+
+See the following pages before asking a question:
+- [ESPnet2 Tutorial](https://espnet.github.io/espnet/espnet2_tutorial.html)
+- [ESPnet2 TTS FAQ](../../TEMPLATE/tts1/README.md#faq)
+
+# FIRST RESULTS
+
+## Pretrained Models
+
+TBA
diff --git a/egs2/libritts_r/tts1/cmd.sh b/egs2/libritts_r/tts1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
+# These options are mapped to backend-specific options, and
+# the mapping is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "stdout", "sge", "pbs", "slurm", "ssh", or "jhu"
+cmd_backend='local'
+
+# Local machine, without any job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+    # Used for everything other than the two cases below
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+
+# Local machine logging to stdout and log file, without any job scheduling system
+elif [ "${cmd_backend}" = stdout ]; then
+
+    # Used for everything other than the two cases below
+    export train_cmd="stdout.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="stdout.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="stdout.pl"
+
+
+# "qsub" (Sun Grid Engine, or a derivative of it)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" to the "queue" for your environment.
+    # To list the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.
+
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+
+
+# "qsub" (Torque/PBS.)
+elif [ "${cmd_backend}" = pbs ]; then
+    # The default setting is written in conf/pbs.conf.
+
+    export train_cmd="pbs.pl"
+    export cuda_cmd="pbs.pl"
+    export decode_cmd="pbs.pl"
+
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" to the "partition" for your environment.
+    # To list the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the hosts that execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # This assumes you can log in to them without a password, i.e., you have to set up ssh keys.
+
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
+    export decode_cmd="queue.pl --mem 4G"
+
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1
+fi
diff --git a/egs2/libritts_r/tts1/conf/decode.yaml b/egs2/libritts_r/tts1/conf/decode.yaml
@@ -0,0 +1 @@
+tuning/decode_tacotron2.yaml
diff --git a/egs2/libritts_r/tts1/conf/mfcc.conf b/egs2/libritts_r/tts1/conf/mfcc.conf
@@ -0,0 +1,7 @@
+--sample-frequency=16000
+--frame-length=25 # the default is 25
+--low-freq=20 # the default.
+--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case).
+--num-mel-bins=30
+--num-ceps=30
+--snip-edges=false
diff --git a/egs2/libritts_r/tts1/conf/pbs.conf b/egs2/libritts_r/tts1/conf/pbs.conf
@@ -0,0 +1,11 @@
+# Default configuration
+command qsub -V -v PATH -S /bin/bash
+option name=* -N $0
+option mem=* -l mem=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -l ncpus=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option num_nodes=* -l nodes=$0:ppn=1
+default gpu=0
+option gpu=0
+option gpu=* -l ngpus=$0
diff --git a/egs2/libritts_r/tts1/conf/queue.conf b/egs2/libritts_r/tts1/conf/queue.conf
@@ -0,0 +1,12 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option name=* -N $0
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+option num_nodes=* -pe mpi $0  # You must set this PE as allocation_rule=1
+default gpu=0
+option gpu=0
+option gpu=* -l gpu=$0 -q g.q
diff --git a/egs2/libritts_r/tts1/conf/slurm.conf b/egs2/libritts_r/tts1/conf/slurm.conf
@@ -0,0 +1,14 @@
+# Default configuration
+command sbatch --export=PATH
+option name=* --job-name $0
+option time=* --time $0
+option mem=* --mem-per-cpu $0
+option mem=0
+option num_threads=* --cpus-per-task $0
+option num_threads=1 --cpus-per-task 1
+option num_nodes=* --nodes $0
+default gpu=0
+option gpu=0 -p cpu
+option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating more CPU than, or equal to the number of GPU
+# note: the --max-jobs-run option is supported as a special case
+# by slurm.pl and you don't have to handle it in the config file.
diff --git a/egs2/libritts_r/tts1/conf/train.yaml b/egs2/libritts_r/tts1/conf/train.yaml
@@ -0,0 +1 @@
+tuning/train_gst+xvector_transformer.yaml
diff --git a/egs2/libritts_r/tts1/conf/tuning/decode_fastspeech.yaml b/egs2/libritts_r/tts1/conf/tuning/decode_fastspeech.yaml
@@ -0,0 +1,10 @@
+# This configuration is the decoding setting for FastSpeech or FastSpeech2.
+
+##########################################################
+#                    DECODING SETTING                    #
+##########################################################
+speed_control_alpha: 1     # alpha to control the speed of generated speech
+                           # 1 < alpha makes slower and 1 > alpha makes faster
+use_teacher_forcing: false # whether to use teacher forcing
+                           # if true, we use groundtruth of durations
+                           # (+ pitch & energy for FastSpeech2)
diff --git a/egs2/libritts_r/tts1/conf/tuning/decode_tacotron2.yaml b/egs2/libritts_r/tts1/conf/tuning/decode_tacotron2.yaml
@@ -0,0 +1,16 @@
+# This configuration is the basic decoding setting for Tacotron 2.
+# It can also be applied to Transformer. If you encounter problems
+# such as deletions or repetitions, it is worthwhile to try
+# `use_att_constraint: true` to make the generation more stable.
+# Note that attention constraint is not supported in Transformer.
+
+##########################################################
+#                    DECODING SETTING                    #
+##########################################################
+threshold: 0.5             # threshold to stop the generation
+maxlenratio: 10.0          # maximum length of generated samples = input length * maxlenratio
+minlenratio: 0.0           # minimum length of generated samples = input length * minlenratio
+use_att_constraint: false  # whether to use attention constraint, which is introduced in deep voice 3
+backward_window: 1         # backward window size in the attention constraint
+forward_window: 3          # forward window size in the attention constraint
+use_teacher_forcing: false # whether to use teacher forcing
diff --git a/egs2/libritts_r/tts1/conf/tuning/decode_vits.yaml b/egs2/libritts_r/tts1/conf/tuning/decode_vits.yaml
@@ -0,0 +1,10 @@
+# This configuration is the decoding setting for VITS.
+
+##########################################################
+#                    DECODING SETTING                    #
+##########################################################
+noise_scale: 0.667         # noise scale parameter for the flow in VITS.
+noise_scale_dur: 0.8       # noise scale parameter for the stochastic duration predictor in VITS.
+speed_control_alpha: 1     # alpha to control the speed of generated speech.
+                           # 1 < alpha makes slower and 1 > alpha makes faster.
+use_teacher_forcing: false # whether to use teacher forcing.
diff --git a/egs2/libritts_r/tts1/conf/tuning/train_gst+xvector_conformer_fastspeech2.yaml b/egs2/libritts_r/tts1/conf/tuning/train_gst+xvector_conformer_fastspeech2.yaml
@@ -0,0 +1,113 @@
+# This configuration is for ESPnet2 to train Conformer-based
+# FastSpeech2 with GST + X-vector. It requires 4 GPU with 32 GB
+# memory and it takes ~3 days to finish the training on V100.
+
+# Compared to the original FastSpeech2 paper, we use token-averaged
+# pitch and energy, in the same way as FastPitch,
+# and we do not use quantized pitch and energy.
+
+# For FastSpeech2, we need to extract pitch and energy.
+# Therefore, we assume that feats_type=raw in using this
+# configuration. Please be careful.
+
+##########################################################
+#                  TTS MODEL SETTING                     #
+##########################################################
+tts: fastspeech2      # model architecture
+tts_conf:             # keyword arguments for the selected model
+    adim: 384         # attention dimension
+    aheads: 2         # number of attention heads
+    elayers: 4        # number of encoder layers
+    eunits: 1536      # number of encoder ff units
+    dlayers: 4        # number of decoder layers
+    dunits: 1536      # number of decoder ff units
+    positionwise_layer_type: conv1d   # type of position-wise layer
+    positionwise_conv_kernel_size: 3  # kernel size of position wise conv layer
+    duration_predictor_layers: 2      # number of layers of duration predictor
+    duration_predictor_chans: 256     # number of channels of duration predictor
+    duration_predictor_kernel_size: 3 # filter size of duration predictor
+    postnet_layers: 5                 # number of layers of postnet
+    postnet_filts: 5                  # filter size of conv layers in postnet
+    postnet_chans: 256                # number of channels of conv layers in postnet
+    use_masking: True                 # whether to apply masking for padded part in loss calculation
+    encoder_normalize_before: True    # whether to perform layer normalization before the input
+    decoder_normalize_before: True    # whether to perform layer normalization before the input
+    reduction_factor: 1               # reduction factor
+    encoder_type: conformer           # encoder type
+    decoder_type: conformer           # decoder type
+    conformer_pos_enc_layer_type: rel_pos        # conformer positional encoding type
+    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
+    conformer_activation_type: swish             # conformer activation type
+    use_macaron_style_in_conformer: true         # whether to use macaron style in conformer
+    use_cnn_in_conformer: true                   # whether to use CNN in conformer
+    conformer_enc_kernel_size: 7                 # kernel size in CNN module of conformer-based encoder
+    conformer_dec_kernel_size: 31                # kernel size in CNN module of conformer-based decoder
+    init_type: xavier_uniform                    # initialization type
+    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
+    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
+    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
+    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
+    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
+    pitch_predictor_layers: 5                  # number of conv layers in pitch predictor
+    pitch_predictor_chans: 256                 # number of channels of conv layers in pitch predictor
+    pitch_predictor_kernel_size: 5             # kernel size of conv layers in pitch predictor
+    pitch_predictor_dropout: 0.5               # dropout rate in pitch predictor
+    pitch_embed_kernel_size: 1                 # kernel size of conv embedding layer for pitch
+    pitch_embed_dropout: 0.0                   # dropout rate after conv embedding layer for pitch
+    stop_gradient_from_pitch_predictor: true   # whether to stop the gradient from pitch predictor to encoder
+    energy_predictor_layers: 2                 # number of conv layers in energy predictor
+    energy_predictor_chans: 256                # number of channels of conv layers in energy predictor
+    energy_predictor_kernel_size: 3            # kernel size of conv layers in energy predictor
+    energy_predictor_dropout: 0.5              # dropout rate in energy predictor
+    energy_embed_kernel_size: 1                # kernel size of conv embedding layer for energy
+    energy_embed_dropout: 0.0                  # dropout rate after conv embedding layer for energy
+    stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+    spk_embed_dim: 512                         # dimension of speaker embedding
+    spk_embed_integration_type: add            # how to integrate speaker embedding
+    use_gst: true                              # whether to use GST embedding
+    gst_heads: 4                               # number of heads in GST multi-head attention
+    gst_tokens: 16                             # number of global style tokens
+# extra module for additional inputs
+pitch_extract: dio           # pitch extractor type
+pitch_normalize: global_mvn  # normalizer for the pitch feature
+energy_extract: energy       # energy extractor type
+energy_normalize: global_mvn # normalizer for the energy feature
+
+##########################################################
+#            OPTIMIZER & SCHEDULER SETTING               #
+##########################################################
+optim: adam            # optimizer type
+optim_conf:            # keyword arguments for selected optimizer
+    lr: 1.0            # learning rate
+scheduler: noamlr      # scheduler type
+scheduler_conf:        # keyword arguments for selected scheduler
+    model_size: 384    # model size, a.k.a., attention dimension
+    warmup_steps: 4000 # the number of warmup steps
+
+##########################################################
+#                OTHER TRAINING SETTING                  #
+##########################################################
+num_iters_per_epoch: 500  # number of iterations per epoch
+max_epoch: 500            # number of epochs
+grad_clip: 1.0            # gradient clipping norm
+grad_noise: false         # whether to use gradient noise injection
+accum_grad: 1             # gradient accumulation
+batch_bins: 18000000      # batch bins (feats_type=raw)
+batch_type: numel         # how to make batch
+sort_in_batch: descending # how to sort data in making batch
+sort_batch: descending    # how to sort created batches
+num_workers: 1            # number of workers of data loader
+train_dtype: float32      # dtype in training
+log_interval: null        # log interval in iterations
+keep_nbest_models: 5      # number of models to keep
+num_att_plot: 3           # number of attention figures to be saved in every check
+seed: 0                   # random seed number
+best_model_criterion:     # criterion to save the best models
+-   - valid
+    - loss
+    - min
+-   - train
+    - loss
+    - min