Skip to content

Commit

Permalink
Merge pull request #5370 from Emrys365/tse
Browse files Browse the repository at this point in the history
Adding general data augmentation methods for speech preprocessing
  • Loading branch information
sw005320 committed Aug 9, 2023
2 parents ac8b312 + 3a82677 commit 88050b2
Show file tree
Hide file tree
Showing 19 changed files with 1,023 additions and 141 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci_on_ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v1
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down Expand Up @@ -285,7 +285,7 @@ jobs:
python3 -m pip freeze
- name: Import all modules (Try2)
run: |
python3 ./ci/test_import_all.py
python3 -q -X faulthandler ./ci/test_import_all.py
check_kaldi_symlinks:
runs-on: ubuntu-latest
Expand Down
15 changes: 14 additions & 1 deletion ci/test_import_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import glob
import importlib
import sys
import traceback

try:
import k2
Expand All @@ -17,6 +18,7 @@
has_mir_eval = True


failed_imports = []
for dirname in ["espnet", "espnet2"]:
for f in glob.glob(f"{dirname}/**/*.py"):
module_name = f.replace("/", ".")[:-3]
Expand All @@ -38,4 +40,15 @@
else:
print(f"import {module_name}", file=sys.stderr)

importlib.import_module(module_name)
try:
importlib.import_module(module_name)
except Exception as e:
reason = traceback.format_exc()
failed_imports.append((module_name, reason))


if failed_imports:
print(f"Error: Failed to import {len(failed_imports)} modules")
for i, (name, reason) in enumerate(failed_imports, 1):
print(f"[{i}] {name}\n\t{reason}\n")
raise RuntimeError("See the errors above")
12 changes: 10 additions & 2 deletions ci/test_integration_espnet2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_
--feats_normalize "utterance_mvn" --python "${python}" \
--asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"

echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn, with data augmentation ==="
./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
--asr_config "conf/train_asr_rnn_data_aug_debug.yaml" \
--feats_normalize "utterance_mvn" --python "${python}" \
--asr-args "--model_conf extract_feats_in_collect_stats=false --num_workers 0"

echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ==="
./run.sh --use_streaming true --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \
--feats_normalize "utterance_mvn" --python "${python}" \
Expand Down Expand Up @@ -171,8 +177,10 @@ if python -c 'import torch as t; from packaging.version import parse as L; asser
echo "==== feats_type=${t} with preprocessor ==="
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
--extra_wav_list "rirs.scp noises.scp" --enh_config ./conf/train_with_preprocessor_debug.yaml --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
--enh_config conf/train_with_dynamic_mixing_debug.yaml --ref-num 2 --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 5 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 1 --python "${python}" \
--enh_config conf/train_with_data_aug_debug.yaml --enh-args "--num_workers 0"
./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --ref-num 2 --python "${python}" \
--enh_config conf/train_with_dynamic_mixing_debug.yaml --enh-args "--num_workers 0"
done
rm data/**/utt2category 2>/dev/null || true
rm -r dump
Expand Down
45 changes: 45 additions & 0 deletions egs2/mini_an4/asr1/conf/train_asr_rnn_data_aug_debug.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# This is a debug config for CI: a tiny VGG-RNN ASR model that exercises
# the general data-augmentation options of the default preprocessor.
encoder: vgg_rnn
encoder_conf:
    num_layers: 1
    hidden_size: 2
    output_size: 2

decoder: rnn
decoder_conf:
    hidden_size: 2

scheduler: reducelronplateau
scheduler_conf:
    mode: min
    factor: 0.5
    patience: 1

use_preprocessor: true
preprocessor: default
preprocessor_conf:
    fs: 16000
    # Each entry is [probability, effect_name, effect_args].
    # No need to set the "sample_rate" argument for each effect here.
    # NOTE(review): the nested entry below appears to group several
    # sub-effects under a single probability — confirm semantics against
    # the espnet2 preprocessor documentation.
    data_aug_effects:
        - [0.1, "contrast", {"enhancement_amount": 75.0}]
        - [0.1, "highpass", {"cutoff_freq": 5000, "Q": 0.707}]
        - [0.1, "equalization", {"center_freq": 1000, "gain": 0, "Q": 0.707}]
        - - 0.1
          - - [0.3, "speed_perturb", {"factor": 0.9}]
            - [0.3, "speed_perturb", {"factor": 1.1}]
            - [0.3, "speed_perturb", {"factor": 1.3}]
    data_aug_num: [1, 4]
    data_aug_prob: 1.0

val_scheduler_criterion:
    - valid
    - loss
best_model_criterion:
    - - valid
      - acc
      - max
keep_nbest_models: 1
max_epoch: 1
num_iters_per_epoch: 1
batch_type: folded
batch_size: 2
57 changes: 57 additions & 0 deletions egs2/mini_an4/enh1/conf/train_with_data_aug_debug.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# This is a debug config for CI: a tiny STFT/BLSTM enhancement model that
# exercises the data-augmentation options of the "enh" preprocessor.
encoder: stft
encoder_conf:
    n_fft: 512
    hop_length: 128

decoder: stft
decoder_conf:
    n_fft: 512
    hop_length: 128

separator: rnn
separator_conf:
    rnn_type: blstm
    num_spk: 1
    nonlinear: relu
    layer: 1
    unit: 2
    dropout: 0.2

preprocessor: enh
preprocessor_conf:
    speech_volume_normalize: "0.5_1.0"
    rir_scp: dump/raw/train_nodev/rirs.scp
    rir_apply_prob: 1.0
    noise_scp: dump/raw/train_nodev/noises.scp
    noise_apply_prob: 1.0
    noise_db_range: "5_20"
    sample_rate: 16000
    force_single_channel: true
    categories:
        - 1ch_16k
        - 2ch_16k
    # Each entry is [probability, effect_name, effect_args].
    # No need to set the "sample_rate" argument for each effect here.
    data_aug_effects:
        - [0.1, "contrast", {"enhancement_amount": 75.0}]
        - [0.1, "highpass", {"cutoff_freq": 5000, "Q": 0.707}]
        - - 0.1
          - - [0.3, "clipping", {"min_quantile": 0.05, "max_quantile": 0.95}]
            - [0.3, "corrupt_phase", {"scale": 0.1, "n_fft": 0.032, "hop_length": 0.008}]
    data_aug_num: [1, 3]
    data_aug_prob: 1.0

criterions:
    # The first criterion
    - name: mse
      conf:
          compute_on_mask: false
      # The wrapper for the current criterion;
      # for the single-talker case we simply use the fixed_order wrapper.
      wrapper: fixed_order
      wrapper_conf:
          weight: 1.0

max_epoch: 1
num_iters_per_epoch: 1
batch_type: folded
batch_size: 2
2 changes: 1 addition & 1 deletion egs2/musdb18/enh1/conf/tuning/train_enh_conv_tasnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ decoder_conf:
stride: 10
separator: tcn
separator_conf:
num_spk: 2
num_spk: 4
layer: 8
stack: 4
bottleneck_dim: 256
Expand Down
5 changes: 4 additions & 1 deletion egs2/musdb18/enh1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ num_dev=5000
num_eval=3000
sample_rate=16k

# 0, 1, 2, 3 represent drums, bass, vocals, and others, respectively.
ref_num=4


train_set="train_${sample_rate}"
valid_set="dev_${sample_rate}"
Expand All @@ -21,7 +24,7 @@ test_sets="test_${sample_rate} "
--test_sets "${test_sets}" \
--fs "${sample_rate}" \
--audio_format wav \
--ref_num 4 \
--ref_num ${ref_num} \
--lang en \
--ngpu 1 \
--local_data_opts "--sample_rate ${sample_rate} --num_train ${num_train} --num_dev ${num_dev} --num_eval ${num_eval}" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ unzip ${wdir}/spatialize_wsj0-mix.zip -d ${dir}
sed -i -e "s#data_in_root = './wsj0-mix/';#data_in_root = '${wsj0_2mix_wav}';#" \
-e "s#rir_root = './wsj0-mix/';#rir_root = '${wsj0_2mix_spatialized_wav}';#" \
-e "s#data_out_root = './wsj0-mix/';#data_out_root = '${wsj0_2mix_spatialized_wav}';#" \
-e "s#RIR-Generator-master/#RIR-Generator/" \
-e "s#RIR-Generator-master/#RIR-Generator/#" \
${dir}/spatialize_wsj0_mix.m

sed -i -e "s#MIN_OR_MAX=\"'min'\"#MIN_OR_MAX=\"'${min_or_max}'\"#" \
Expand Down

0 comments on commit 88050b2

Please sign in to comment.