Commit

Merge pull request #5436 from wyh2000/master
Add BibleTTS recipe
ftshijt committed Sep 29, 2023
2 parents 688d87c + 2e9c519 commit 8457fe2
Showing 23 changed files with 657 additions and 0 deletions.
1 change: 1 addition & 0 deletions egs2/README.md
@@ -22,6 +22,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| an4 | CMU AN4 database | ASR/TTS | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
| aphasiabank | AphasiaBank database (English) | ASR | ENG | https://aphasia.talkbank.org/ | |
| babel | IARPA Babel corpus | ASR | ~20 languages | https://www.iarpa.gov/index.php/research-programs/babel | |
| bibletts | BibleTTS corpus | TTS | 6 Sub-Saharan African languages | https://masakhane-io.github.io/bibleTTS/ | |
| bn_openslr53 | Large bengali ASR training dataset | ASR | BEN | https://openslr.org/53/ | |
| bur_openslr80 | Burmese ASR training dataset | ASR | BUR | https://openslr.org/80/ | |
| catslu | CATSLU-MAPS | SLU | CMN | https://sites.google.com/view/catslu/home | |
1 change: 1 addition & 0 deletions egs2/TEMPLATE/asr1/db.sh
@@ -14,6 +14,7 @@ AN4=downloads
ASVTutorial=espnet_tutorial_asvspoof
APHASIABANK=
AUDIOSET=
BIBLETTS=downloads
DIRHA_ENGLISH_PHDEV=
DIRHA_WSJ=
DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path
113 changes: 113 additions & 0 deletions egs2/bibletts/tts1/README.md
@@ -0,0 +1,113 @@
# BIBLE-TTS RECIPE

This is the recipe of a single-speaker TTS model using the [BibleTTS](https://masakhane-io.github.io/bibleTTS/) corpus.

Our goal is to build a character-based TTS system using the high-quality BibleTTS dataset. BibleTTS is a large, high-quality, open text-to-speech dataset with up to 80 hours of single-speaker speech per language. It releases aligned speech and text for six languages spoken in Sub-Saharan Africa. There are two options:
1) Train VITS models from scratch for each language.

2) Fine-tune from a pretrained TTS model to accelerate training.

## Recipe flow

### 1. Data preparation

Download the dataset from [BibleTTS](https://masakhane-io.github.io/bibleTTS/), and then run:

```sh
# Run only the data preparation stage (stage 1)
$ ./run.sh --stage 1 --stop-stage 1
```
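
After stage 1 finishes, ESPnet's usual Kaldi-style data directories should appear under `data/${lang}`. A minimal sketch of what to expect (the exact subsets and files are produced by `local/data.sh` and may differ slightly):

```sh
# Hedged example: typical Kaldi-style contents of a prepared subset.
$ ls data/Yoruba/tr_no_dev
# text      # <utt-id> <transcription>
# wav.scp   # <utt-id> <path-to-audio>
# utt2spk   # <utt-id> <speaker-id>
# spk2utt   # <speaker-id> <utt-id> ...
```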

### 2. VITS training
If you want to train from scratch, run:
```sh
# Specify the language name for training (e.g. Ewe, Hausa, Lingala, Yoruba, Asante-Twi, Akuapem-Twi)
$ lang=Yoruba
$ cd egs2/bibletts/tts1
$ ./run.sh \
--lang ${lang} \
--train_set ${lang}/tr_no_dev \
--valid_set ${lang}/dev1 \
--test_sets "${lang}/dev1 ${lang}/eval1" \
--srctexts "data/${lang}/tr_no_dev/text" \
--dumpdir dump/${lang} \
--stage 2 \
--ngpu 4 \
--g2p none \
--token_type char \
--cleaner none \
--tts_task gan_tts \
--feats_extract linear_spectrogram \
--feats_normalize none \
--train_config ./conf/tuning/train_vits.yaml \
--inference_model train.total_count.ave.pth \
--tag vits_bibletts_${lang}

```
If you want to fine-tune from a pretrained model, first download a [pretrained model](https://zenodo.org/record/5555690), and then run:

```sh
# Exclude the text embedding layer since we are fine-tuning on a different language

$ lang=Yoruba
$ cd egs2/bibletts/tts1
$ ./run.sh \
--train_set ${lang}/tr_no_dev \
--valid_set ${lang}/dev1 \
--test_sets "${lang}/dev1 ${lang}/eval1" \
--srctexts "data/${lang}/tr_no_dev/text" \
--dumpdir dump/${lang} \
--stage 2 \
--ngpu 4 \
--g2p none \
--token_type char \
--cleaner none \
--tts_task gan_tts \
--feats_extract linear_spectrogram \
--feats_normalize none \
--train_config ./conf/tuning/train_vits.yaml \
--train_args "--init_param <pretrain-model-path>:::tts.generator.text_encoder.emb" \
--inference_model train.total_count.ave.pth \
--tag vits_lj_train_phn_1Minter_char_ft_${lang}

```
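
The value passed to `--train_args` above follows ESPnet2's `--init_param` convention, `<ckpt_path>:<src_key>:<dst_key>:<exclude_keys>`: empty source/destination keys load the whole state dict, and the last field lists parameter prefixes to skip. A minimal sketch, assuming a hypothetical download location for the pretrained checkpoint:

```sh
# <ckpt_path>:<src_key>:<dst_key>:<exclude_keys>; empty keys mean "load everything".
# tts.generator.text_encoder.emb is excluded because the character inventory of
# the new language differs from the pretrained model's.
pretrained=downloads/pretrained_vits/train.total_count.ave.pth   # hypothetical path
train_args="--init_param ${pretrained}:::tts.generator.text_encoder.emb"
# Pass it to the recipe as: ./run.sh ... --train_args "${train_args}"
```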
### 3. Inference

This is the TTS model decoding stage. You can change the decoding settings via `--inference_config` and `--inference_args` (see the override example after the command below).
```sh
$ lang=Yoruba
$ ./run.sh \
--train_set ${lang}/tr_no_dev \
--valid_set ${lang}/dev1 \
--test_sets "${lang}/dev1 ${lang}/eval1" \
--srctexts "data/${lang}/tr_no_dev/text" \
--dumpdir dump/${lang} \
--ngpu 4 \
--stage 7 \
--min_wav_duration 0.38 \
--g2p none \
--token_type char \
--cleaner none \
--tts_task gan_tts \
--feats_extract linear_spectrogram \
--feats_normalize none \
--train_config conf/tuning/train_vits.yaml \
--inference_model <checkpoint-**.pth> \
--tag vits_bibletts_${lang}
```
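
As mentioned above, the decoding settings can also be overridden without editing the config file. A minimal sketch, assuming the flags of `tts_inference.py` match the keys in `conf/tuning/decode_vits.yaml`:

```sh
# Hedged example: override decoding settings at the command line
# (add the same dataset/feature options as in the command above).
$ ./run.sh --stage 7 --stop-stage 7 \
    --inference_config ./conf/tuning/decode_vits.yaml \
    --inference_args "--noise_scale 0.5 --speed_control_alpha 1.1" \
    --tag vits_bibletts_${lang}
```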

### 4. Objective Evaluation

```sh
# Evaluate MCD
./pyscripts/utils/evaluate_mcd.py \
exp/<model_dir_name>/<decode_dir_name>/eval1/wav/wav.scp \
dump/raw/eval1/wav.scp

# Evaluate log-F0 RMSE
./pyscripts/utils/evaluate_f0.py \
exp/<model_dir_name>/<decode_dir_name>/eval1/wav/wav.scp \
dump/raw/eval1/wav.scp
```
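
For reference, MCD is the mel-cepstral distortion between time-aligned mel-cepstral coefficient vectors of the reference speech ($c$) and the synthesized speech ($\hat{c}$). This is the standard definition; the exact convention used by `evaluate_mcd.py` (e.g. whether the 0th coefficient is included) may differ slightly:

$$
\mathrm{MCD} = \frac{10}{\ln 10}\sqrt{2\sum_{d=1}^{D}\bigl(c_d - \hat{c}_d\bigr)^2}
$$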
40 changes: 40 additions & 0 deletions egs2/bibletts/tts1/RESULTS.MD
@@ -0,0 +1,40 @@
# BIBLE-TTS Results
## Environments

- python version: `3.8.18`

- pytorch version: `pytorch 2.0.1+cu117`

## Details of the BibleTTS dataset

| Language | Number of Training Utterances | Hours (h) |
|:-----: | :-----: | :----: |
| Yoruba | 7,491 | 33.3 |
| Lingala | 11,093 | 71.6 |
| Asante Twi | 21,348 | 74.9 |
| Hausa | 40,215 | 86.6 |
| Ewe | 22,192 | 86.8 |
| Akuapem Twi | 24,892 | 67.1 |

## Results: objective evaluation (MCD, log-F0 RMSE)
Pretrained models can be found at [BibleTTS pretrained models](https://huggingface.co/espnet/vits_tts_bibletts_char/tree/main).

* Experiment results on different languages:

| Language | MCD | RMSE |
|:-----: | :-----: | :----: |
| Yoruba | 9.66 ± 1.05 | 0.32 ± 0.03 |
| Lingala | 6.98 ± 1.22 | 0.34 ± 0.07 |
| Asante twi | 8.33 ± 1.17 | 0.27 ± 0.04 |
| Hausa | 9.52 ± 1.88 | 0.27 ± 0.07 |
| Ewe | 11.11 ± 3.62 | 0.27 ± 0.06 |
| Akuapem Twi | 7.98 ± 1.57 | 0.28 ± 0.06 |

* Experiment results (ablation study on Yoruba):

| Setting | MCD_test | MCD_dev | RMSE_test | RMSE_dev |
|:-----: | :-----: | :----: | :----: | :----: |
| Yoruba (train from scratch) | **9.66 ± 1.05** | **6.94 ± 1.96** | **0.32 ± 0.03** | **0.30 ± 0.06** |
| Yoruba (fine-tuned from pretrained model, char-based) | 10.43 ± 1.13 | 7.97 ± 2.37 | 0.39 ± 0.05 | 0.39 ± 0.10 |
| Yoruba (fine-tuned from pretrained model, phn-based) | 10.14 ± 1.26 | 7.42 ± 1.76 | 0.35 ± 0.04 | 0.31 ± 0.07 |
110 changes: 110 additions & 0 deletions egs2/bibletts/tts1/cmd.sh
@@ -0,0 +1,110 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The string to the left of "=", i.e. "JOB", is replaced by <N> (the N-th job) in the command and the log file name,
# e.g. "echo JOB" becomes "echo 3" for the 3rd job and "echo 8" for the 8th job, respectively.
# Note that the index must start with a positive number, so you can't use "JOB=0:10", for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, which are configured
# by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs failed, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend='local'

# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then

# The other usage
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"

# Local machine logging to stdout and log file, without any Job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

# The other usage
export train_cmd="stdout.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="stdout.pl"
# Used for "*_recog.py"
export decode_cmd="stdout.pl"


# "qsub" (Sun Grid Engine, or derivation of it)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" to match the "queue" in your environment.
# To know the "queue" names, type "qhost -q".
# Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.

export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"


# "qsub" (Torque/PBS.)
elif [ "${cmd_backend}" = pbs ]; then
# The default setting is written in conf/pbs.conf.

export train_cmd="pbs.pl"
export cuda_cmd="pbs.pl"
export decode_cmd="pbs.pl"


# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the hosts on which to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# This assumes you can log in to them without a password, i.e., you have to set up SSH keys.

export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"

# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then

export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
export decode_cmd="queue.pl --mem 4G"

else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
7 changes: 7 additions & 0 deletions egs2/bibletts/tts1/conf/mfcc.conf
@@ -0,0 +1,7 @@
--sample-frequency=16000
--frame-length=25 # the default is 25
--low-freq=20 # the default.
--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case).
--num-mel-bins=30
--num-ceps=30
--snip-edges=false
11 changes: 11 additions & 0 deletions egs2/bibletts/tts1/conf/pbs.conf
@@ -0,0 +1,11 @@
# Default configuration
command qsub -V -v PATH -S /bin/bash
option name=* -N $0
option mem=* -l mem=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l ncpus=$0
option num_threads=1 # Do not add anything to qsub_opts
option num_nodes=* -l nodes=$0:ppn=1
default gpu=0
option gpu=0
option gpu=* -l ngpus=$0
12 changes: 12 additions & 0 deletions egs2/bibletts/tts1/conf/queue.conf
@@ -0,0 +1,12 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option name=* -N $0
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q
14 changes: 14 additions & 0 deletions egs2/bibletts/tts1/conf/slurm.conf
@@ -0,0 +1,14 @@
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating at least as many CPUs as GPUs
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
10 changes: 10 additions & 0 deletions egs2/bibletts/tts1/conf/tuning/decode_vits.yaml
@@ -0,0 +1,10 @@
# This configuration is the decoding setting for VITS.

##########################################################
# DECODING SETTING #
##########################################################
noise_scale: 0.667 # noise scale parameter for the flow in VITS.
noise_scale_dur: 0.8 # noise scale parameter for the stochastic duration predictor in VITS.
speed_control_alpha: 1 # alpha to control the speed of generated speech.
# alpha > 1 makes speech slower and alpha < 1 makes it faster.
use_teacher_forcing: false # whether to use teacher forcing.
