espnet · mergify · Feb 1, 2023 · Jan 24, 2023 · Jan 29, 2023 · Jan 29, 2023
diff --git a/egs2/README.md b/egs2/README.md
@@ -112,6 +112,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | primewords_chinese      | Primewords Chinese Corpus Set 1                                                                                                  | ASR                     | CMN                  | https://www.openslr.org/47/                                                                                  |              |
 | puebla_nahuatl          | Highland Puebla Nahuatl corpus (endangered language in central Mexico)                                                           | ASR/ST                  | HPN                  | https://www.openslr.org/92/                                                                                  |              |
 | qasr_tts                | TTS character based system using semi-supervised data selection                                                                  | TTS                     | ARA                  | https://arabicspeech.org/qasr_tts                                                                                  |              |
+| reazonspeech            | ReazonSpeech: Japanese Corpus collected from TV Programs                                                                         | ASR                     | JPN                  | https://research.reazon.jp/projects/ReazonSpeech/                                                            |              |
 | reverb                  | REVERB (REverberant Voice Enhancement and Recognition Benchmark) challenge                                                       | ASR                     | ENG                  | https://reverb2014.dereverberation.com/                                                                      |              |
 | ru_open_stt             | Russian Open Speech To Text (STT/ASR) Dataset                                                                                    | ASR                     | RUS                  | https://github.com/snakers4/open_stt                                                                         |              |
 | ruslan                  | RUSLAN: Russian Spoken Language Corpus For Speech Synthesis                                                                      | TTS                     | RUS                  | https://ruslan-corpus.github.io/                                                                             |              |

diff --git a/egs2/TEMPLATE/asr1/db.sh b/egs2/TEMPLATE/asr1/db.sh
@@ -23,6 +23,7 @@ DSING=downloads
 WSJ0=
 WSJ1=
 WSJCAM0=
+REAZONSPEECH=downloads
 REVERB=
 REVERB_OUT="${PWD}/REVERB"  # Output file path
 CHIME3=

diff --git a/egs2/reazonspeech/asr1/asr.sh b/egs2/reazonspeech/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
diff --git a/egs2/reazonspeech/asr1/cmd.sh b/egs2/reazonspeech/asr1/cmd.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/cmd.sh
diff --git a/egs2/reazonspeech/asr1/conf/decode_asr.yaml b/egs2/reazonspeech/asr1/conf/decode_asr.yaml
@@ -0,0 +1,6 @@
+lm_weight: 0.3
+beam_size: 20
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.3
diff --git a/egs2/reazonspeech/asr1/conf/train_asr_conformer.yaml b/egs2/reazonspeech/asr1/conf/train_asr_conformer.yaml
@@ -0,0 +1,71 @@
+# This model was trained on 8 NVIDIA A100 GPUs (40 GiB of memory each).
+batch_type: numel
+batch_bins: 30000000
+accum_grad: 3
+max_epoch: 33
+patience: none
+init: xavier_uniform
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+encoder: conformer
+encoder_conf:
+    output_size: 512
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d6
+    normalize_before: true
+    macaron_style: false
+    pos_enc_layer_type: "rel_pos"
+    selfattention_layer_type: "rel_selfattn"
+    activation_type: "swish"
+    use_cnn_module:  true
+    cnn_module_kernel: 31
+
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.1
+    src_attention_dropout_rate: 0.1
+
+ctc_conf:
+    ignore_nan_grad: true
+
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+
+optim: adam
+optim_conf:
+    lr: 0.002
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 25000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
diff --git a/egs2/reazonspeech/asr1/conf/train_lm.yaml b/egs2/reazonspeech/asr1/conf/train_lm.yaml
@@ -0,0 +1,16 @@
+optim: sgd
+patience: 3
+max_epoch: 40
+batch_type: folded
+batch_size: 256
+lm: seq_rnn
+lm_conf:
+    rnn_type: lstm
+    nlayers: 2
+    unit: 2024
+
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+keep_nbest_models: 1
diff --git a/egs2/reazonspeech/asr1/db.sh b/egs2/reazonspeech/asr1/db.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/db.sh
diff --git a/egs2/reazonspeech/asr1/local/data.py b/egs2/reazonspeech/asr1/local/data.py
@@ -0,0 +1,46 @@
+import os
+import sys
+
+from datasets import load_dataset
+from reazonspeech.text import normalize
+
+
+def save_kaldi_format(outdir, ds):
+    os.makedirs(outdir, exist_ok=True)
+    with open(os.path.join(outdir, "text"), "w") as fp_text, open(
+        os.path.join(outdir, "wav.scp"), "w"
+    ) as fp_wav, open(os.path.join(outdir, "utt2spk"), "w") as fp_utt2spk, open(
+        os.path.join(outdir, "spk2utt"), "w"
+    ) as fp_spk2utt:
+
+        for item in ds.sort("name"):
+            path = item["audio"]["path"]
+
+            # '11時のニュースです。' -> '１１時のニュースです'
+            text = normalize(item["transcription"])
+
+            # '000/e7fb3323c280c.flac' -> '000e7fb3323c280c'
+            name = os.path.splitext(item["name"].replace("/", ""))[0]
+            uttid = "uttid%s" % name
+            spkid = "spkid%s" % name
+            print(uttid, text, file=fp_text)
+            print(uttid, path, file=fp_wav)
+            print(uttid, spkid, file=fp_utt2spk)
+            print(spkid, uttid, file=fp_spk2utt)
+
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: %s <download_dir>" % sys.argv[0], file=sys.stderr)
+        return 1
+    download_dir = sys.argv[1]
+    ds = load_dataset("reazon-research/reazonspeech", "all", cache_dir=download_dir)[
+        "train"
+    ]
+    save_kaldi_format("data/dev", ds.select(range(1000)))
+    save_kaldi_format("data/test", ds.select(range(1000, 2000)))
+    save_kaldi_format("data/train", ds.select(range(2000, ds.num_rows)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs2/reazonspeech/asr1/local/data.sh b/egs2/reazonspeech/asr1/local/data.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Prepare the ReazonSpeech data directories by running local/data.py.
+set -e
+set -u
+set -o pipefail
+
+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;
+. ./db.sh || exit 1;
+
+log() {  # timestamped message tagged with the caller's file, line and function
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+stage=0
+stop_stage=1
+SECONDS=0  # reset bash's elapsed-seconds counter; reported in the final log line
+
+if [ -z "${REAZONSPEECH}" ]; then
+    log "Fill the value of 'REAZONSPEECH' of db.sh"
+    exit 1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    log "stage1: Download data to ${REAZONSPEECH}"
+    python3 local/data.py "${REAZONSPEECH}"  # quoted: the path may contain spaces
+fi
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs2/reazonspeech/asr1/local/path.sh b/egs2/reazonspeech/asr1/local/path.sh
@@ -0,0 +1,13 @@
+if ! python3 -c 'import datasets' > /dev/null; then  # local/data.py imports `datasets`; only stdout is discarded
+    echo "Error: it seems that datasets is not installed." >&2
+    echo "Error: please install datasets as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools/installers && ./install_datasets.sh" >&2  # MAIN_ROOT is not set in this file; presumably set by the sourcing script
+    return 1  # top-level `return`: presumably this file is sourced, not executed -- confirm
+fi
+
+if ! python3 -c 'import reazonspeech' > /dev/null; then  # local/data.py imports `reazonspeech.text`
+    echo "Error: it seems that reazonspeech is not installed." >&2
+    echo "Error: please install reazonspeech as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools/installers && ./install_reazonspeech.sh" >&2
+    return 1
+fi
diff --git a/egs2/reazonspeech/asr1/path.sh b/egs2/reazonspeech/asr1/path.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/path.sh
diff --git a/egs2/reazonspeech/asr1/pyscripts b/egs2/reazonspeech/asr1/pyscripts
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/pyscripts/
diff --git a/egs2/reazonspeech/asr1/run.sh b/egs2/reazonspeech/asr1/run.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Run bash in strict mode; the script exits on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
+set -e
+set -u
+set -o pipefail
+
+train_set=train  # data/train, data/dev and data/test are written by local/data.py
+valid_set=dev
+test_sets="test"
+
+asr_config=conf/train_asr_conformer.yaml
+inference_config=conf/decode_asr.yaml
+lm_config=conf/train_lm.yaml
+
+./asr.sh \
+    --ngpu 8 \
+    --nj 16 \
+    --inference_nj 16 \
+    --max_wav_duration 14 \
+    --lang jp \
+    --use_lm true \
+    --token_type char \
+    --feats_type raw \
+    --audio_format flac \
+    --asr_config "${asr_config}" \
+    --inference_config "${inference_config}" \
+    --lm_config "${lm_config}" \
+    --train_set "${train_set}" \
+    --valid_set "${valid_set}" \
+    --test_sets "${test_sets}" \
+    --lm_train_text "data/${train_set}/text" "$@"  # extra command-line arguments are forwarded to asr.sh
diff --git a/egs2/reazonspeech/asr1/scripts b/egs2/reazonspeech/asr1/scripts
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/scripts/
diff --git a/egs2/reazonspeech/asr1/steps b/egs2/reazonspeech/asr1/steps
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/steps
diff --git a/egs2/reazonspeech/asr1/utils b/egs2/reazonspeech/asr1/utils
@@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/utils
diff --git a/tools/check_install.py b/tools/check_install.py
@@ -36,6 +36,7 @@
     ("pykeops", None, "installers/install_cauchy_mult.sh"),
     ("whisper", None, "installers/install_whisper.sh"),
     ("RawNet3", None, "installers/install_rawnet.sh"),
+    ("reazonspeech", None, "installers/install_reazonspeech.sh"),
 ]
 
 executable_list = [

diff --git a/tools/installers/install_datasets.sh b/tools/installers/install_datasets.sh
diff --git a/tools/installers/install_reazonspeech.sh b/tools/installers/install_reazonspeech.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [ $# != 0 ]; then  # this installer takes no arguments
+    echo "Usage: $0"
+    exit 1;
+fi
+
+python3 -m pip install git+https://github.com/reazon-research/ReazonSpeech  # no revision is pinned in this URL