Skip to content

Commit

Permalink
Merge pull request #5342 from YoshikiMas/hackathon-2023s
Browse files Browse the repository at this point in the history
Whisper fine-tuning recipes for CHiME-4 and WSJ
  • Loading branch information
sw005320 committed Jul 23, 2023
2 parents 4b5cd10 + 53b1f4b commit 42f3182
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 0 deletions.
32 changes: 32 additions & 0 deletions egs2/chime4/asr1/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,38 @@
|decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_5mics|1320|126812|94.4|2.8|2.8|1.5|7.2|66.1|


## Whisper [medium_finetuning](conf/tuning/train_asr_whisper_full_warmup1500.yaml) without LM

### Environments
- date: `Fri Jul 21 12:47:17 JST 2023`
- python version: `3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]`
- espnet version: `espnet 202304`
- pytorch version: `pytorch 1.13.1`
- Git hash: `d7172fcb7181ffdcca9c0061400254b63e37bf21`
- Commit date: `Sat Jul 15 15:01:30 2023 +0900`
- Pretrained URL: [espnet/yoshiki_chime4_whisper_medium_finetuning](https://huggingface.co/espnet/yoshiki_chime4_whisper_medium_finetuning)

- token_type: whisper_multilingual
- cleaner: whisper_en

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/dt05_real_isolated_1ch_track|1640|24791|97.7|1.9|0.5|0.7|3.0|25.7|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/dt05_simu_isolated_1ch_track|1640|24792|95.9|3.3|0.8|0.8|4.9|37.0|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/et05_real_isolated_1ch_track|1320|19341|96.3|3.2|0.5|0.8|4.5|33.6|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/et05_simu_isolated_1ch_track|1320|19344|93.1|5.8|1.1|1.2|8.1|43.3|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/dt05_real_isolated_1ch_track|1640|141889|99.2|0.4|0.4|0.7|1.5|25.7|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/dt05_simu_isolated_1ch_track|1640|141900|98.2|0.9|0.9|0.8|2.6|37.0|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/et05_real_isolated_1ch_track|1320|110558|98.6|0.8|0.6|0.7|2.1|33.6|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/et05_simu_isolated_1ch_track|1320|110572|96.5|1.9|1.5|1.2|4.7|43.3|


# Conformer: 12 layers, 2048 linear units
## Environments
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# ESPnet2 training config: full fine-tuning of OpenAI Whisper "medium"
# on CHiME-4, attention-only (no CTC), with a 1500-step LR warmup.
# NOTE(review): the indentation below restores the nesting that the
# flattened source lost; key names and values are unchanged.

# Whisper consumes raw audio and computes log-mel features internally,
# so no external feature normalization is applied.
normalize: null

encoder: whisper
encoder_conf:
    whisper_model: medium
    dropout_rate: 0.0
    # SpecAugment applied inside the Whisper encoder wrapper.
    use_specaug: true
    specaug_conf:
        apply_time_warp: true
        time_warp_window: 5
        time_warp_mode: bicubic
        apply_freq_mask: true
        freq_mask_width_range:
        - 0
        - 27
        num_freq_mask: 2
        apply_time_mask: true
        time_mask_width_ratio_range:
        - 0.0   # normalized from "0." — same value, less easy to misread
        - 0.05
        num_time_mask: 5


decoder: whisper
decoder_conf:
    whisper_model: medium
    dropout_rate: 0.0

model_conf:
    # Pure attention training/decoding: the CTC branch is disabled.
    ctc_weight: 0.0
    lsm_weight: 0.1                  # label smoothing weight
    length_normalized_loss: false
    # Whisper's native special tokens replace ESPnet's default sos/eos.
    sym_sos: "<|startoftranscript|>"
    sym_eos: "<|endoftext|>"
    extract_feats_in_collect_stats: false


frontend: null
input_size: 1 # to prevent build_model() from complaining

seed: 2022
log_interval: 100
num_att_plot: 0
num_workers: 4
sort_in_batch: descending # how to sort data in making batch
sort_batch: descending # how to sort created batches
batch_type: numel
batch_bins: 8000000 # good for single GPU w/ 40G mem
# NOTE(review): with batch_type numel, batching is presumably governed by
# batch_bins and this key is ignored (the companion WSJ config omits it) —
# confirm against the espnet2 batch sampler before relying on it.
batch_size: 16
accum_grad: 4
max_epoch: 3
# "none" is the literal string here; ESPnet's option parsing maps it to None.
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 3

use_amp: true
cudnn_deterministic: false
cudnn_benchmark: false

optim: adamw
grad_clip: 1.0
optim_conf:
    lr: 1.0e-05
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.99
    eps: 1.0e-06
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 1500
29 changes: 29 additions & 0 deletions egs2/wsj/asr1/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,34 @@
# RESULTS

## Whisper [medium_finetuning](conf/tuning/train_asr_whisper_full_warmup3000.yaml) without LM

### Environments
- date: `Fri Jul 21 12:47:17 JST 2023`
- python version: `3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]`
- espnet version: `espnet 202304`
- pytorch version: `pytorch 1.13.1`
- Git hash: `d7172fcb7181ffdcca9c0061400254b63e37bf21`
- Commit date: `Sat Jul 15 15:01:30 2023 +0900`
- Pretrained URL: [espnet/yoshiki_wsj_whisper_medium_finetuning](https://huggingface.co/espnet/yoshiki_wsj_whisper_medium_finetuning)

- token_type: whisper_multilingual
- cleaner: whisper_en

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/test_dev93|503|8132|97.7|2.0|0.4|0.3|2.6|24.9|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/test_eval92|333|5564|98.5|1.4|0.1|0.2|1.7|19.5|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/test_dev93|503|46683|99.4|0.3|0.3|0.2|0.8|24.9|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/test_eval92|333|32096|99.6|0.2|0.2|0.2|0.6|19.5|


## Self-supervised learning features [HuBERT_large_ll60k, Conformer, utt_mvn](conf/tuning/train_asr_conformer_s3prlfrontend_hubert.yaml) with [Transformer-LM](conf/tuning/train_lm_transformer.yaml)

### Environments
Expand Down
6 changes: 6 additions & 0 deletions egs2/wsj/asr1/conf/decode_asr_whisper_noctc_greedy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ESPnet2 decoding config for the Whisper fine-tuned models: greedy
# (beam size 1) attention-only decoding; CTC and LM scores are disabled.
beam_size: 1
ctc_weight: 0.0
lm_weight: 0.0
# NOTE(review): length ratios are presumably relative to the encoder
# output length — confirm against espnet2 BeamSearch before tuning.
maxlenratio: 0.3
minlenratio: 0.0
penalty: 0.0
74 changes: 74 additions & 0 deletions egs2/wsj/asr1/conf/tuning/train_asr_whisper_full_warmup3000.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# ESPnet2 training config: full fine-tuning of OpenAI Whisper "medium"
# on WSJ, attention-only (no CTC), with a 3000-step LR warmup.
# NOTE(review): the indentation below restores the nesting that the
# flattened source lost; key names and values are unchanged.

# Whisper consumes raw audio and computes log-mel features internally,
# so no external feature normalization is applied.
normalize: null

encoder: whisper
encoder_conf:
    whisper_model: medium
    dropout_rate: 0.0
    # SpecAugment applied inside the Whisper encoder wrapper.
    use_specaug: true
    specaug_conf:
        apply_time_warp: true
        time_warp_window: 5
        time_warp_mode: bicubic
        apply_freq_mask: true
        freq_mask_width_range:
        - 0
        - 27
        num_freq_mask: 2
        apply_time_mask: true
        time_mask_width_ratio_range:
        - 0.0   # normalized from "0." — same value, less easy to misread
        - 0.05
        num_time_mask: 5


decoder: whisper
decoder_conf:
    whisper_model: medium
    dropout_rate: 0.0

model_conf:
    # Pure attention training/decoding: the CTC branch is disabled.
    ctc_weight: 0.0
    lsm_weight: 0.1                  # label smoothing weight
    length_normalized_loss: false
    # Whisper's native special tokens replace ESPnet's default sos/eos.
    sym_sos: "<|startoftranscript|>"
    sym_eos: "<|endoftext|>"
    extract_feats_in_collect_stats: false


frontend: null
input_size: 1 # to prevent build_model() from complaining

seed: 2022
log_interval: 100
num_att_plot: 0
num_workers: 4
sort_in_batch: descending # how to sort data in making batch
sort_batch: descending # how to sort created batches
batch_type: numel
batch_bins: 8000000 # good for single GPU w/ 40G mem
accum_grad: 4
max_epoch: 3
# "none" is the literal string here; ESPnet's option parsing maps it to None.
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 3

use_amp: true
cudnn_deterministic: false
cudnn_benchmark: false

optim: adamw
grad_clip: 1.0
optim_conf:
    lr: 1.0e-05
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.99
    eps: 1.0e-06
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 3000

0 comments on commit 42f3182

Please sign in to comment.