Skip to content

Commit

Permalink
Merge pull request #5342 from YoshikiMas/hackathon-2023s
Browse files Browse the repository at this point in the history
Whisper fine-tuning recipes for CHiME-4 and WSJ
  • Loading branch information
sw005320 committed Jul 23, 2023
2 parents 4b5cd10 + 53b1f4b commit 42f3182
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 0 deletions.
32 changes: 32 additions & 0 deletions egs2/chime4/asr1/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,38 @@
|decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_5mics|1320|126812|94.4|2.8|2.8|1.5|7.2|66.1|


## Whisper [medium_finetuning](conf/tuning/train_asr_whisper_full_warmup1500.yaml) without LM

### Environments
- date: `Fri Jul 21 12:47:17 JST 2023`
- python version: `3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]`
- espnet version: `espnet 202304`
- pytorch version: `pytorch 1.13.1`
- Git hash: `d7172fcb7181ffdcca9c0061400254b63e37bf21`
- Commit date: `Sat Jul 15 15:01:30 2023 +0900`
- Pretrained URL: [espnet/yoshiki_chime4_whisper_medium_finetuning](https://huggingface.co/espnet/yoshiki_chime4_whisper_medium_finetuning)

- token_type: whisper_multilingual
- cleaner: whisper_en

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/dt05_real_isolated_1ch_track|1640|24791|97.7|1.9|0.5|0.7|3.0|25.7|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/dt05_simu_isolated_1ch_track|1640|24792|95.9|3.3|0.8|0.8|4.9|37.0|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/et05_real_isolated_1ch_track|1320|19341|96.3|3.2|0.5|0.8|4.5|33.6|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/et05_simu_isolated_1ch_track|1320|19344|93.1|5.8|1.1|1.2|8.1|43.3|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/dt05_real_isolated_1ch_track|1640|141889|99.2|0.4|0.4|0.7|1.5|25.7|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/dt05_simu_isolated_1ch_track|1640|141900|98.2|0.9|0.9|0.8|2.6|37.0|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/et05_real_isolated_1ch_track|1320|110558|98.6|0.8|0.6|0.7|2.1|33.6|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/et05_simu_isolated_1ch_track|1320|110572|96.5|1.9|1.5|1.2|4.7|43.3|


# Conformer: 12 layers, 2048 linear units
## Environments
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# ESPnet2 training config: full fine-tuning of OpenAI Whisper "medium"
# on CHiME-4, attention-only (no CTC), with a 1500-step LR warmup.
# NOTE(review): the indentation below restores the nesting that the
# flattened source lost; key names and values are unchanged.

# Whisper consumes raw audio and computes log-mel features internally,
# so no external feature normalization is applied.
normalize: null

encoder: whisper
encoder_conf:
    whisper_model: medium
    dropout_rate: 0.0
    # SpecAugment applied inside the Whisper encoder wrapper.
    use_specaug: true
    specaug_conf:
        apply_time_warp: true
        time_warp_window: 5
        time_warp_mode: bicubic
        apply_freq_mask: true
        freq_mask_width_range:
        - 0
        - 27
        num_freq_mask: 2
        apply_time_mask: true
        time_mask_width_ratio_range:
        - 0.0   # normalized from "0." — same value, less easy to misread
        - 0.05
        num_time_mask: 5


decoder: whisper
decoder_conf:
    whisper_model: medium
    dropout_rate: 0.0

model_conf:
    # Pure attention training/decoding: the CTC branch is disabled.
    ctc_weight: 0.0
    lsm_weight: 0.1                  # label smoothing weight
    length_normalized_loss: false
    # Whisper's native special tokens replace ESPnet's default sos/eos.
    sym_sos: "<|startoftranscript|>"
    sym_eos: "<|endoftext|>"
    extract_feats_in_collect_stats: false


frontend: null
input_size: 1 # to prevent build_model() from complaining

seed: 2022
log_interval: 100
num_att_plot: 0
num_workers: 4
sort_in_batch: descending # how to sort data in making batch
sort_batch: descending # how to sort created batches
batch_type: numel
batch_bins: 8000000 # good for single GPU w/ 40G mem
# NOTE(review): with batch_type numel, batching is presumably governed by
# batch_bins and this key is ignored (the companion WSJ config omits it) —
# confirm against the espnet2 batch sampler before relying on it.
batch_size: 16
accum_grad: 4
max_epoch: 3
# "none" is the literal string here; ESPnet's option parsing maps it to None.
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 3

use_amp: true
cudnn_deterministic: false
cudnn_benchmark: false

optim: adamw
grad_clip: 1.0
optim_conf:
    lr: 1.0e-05
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.99
    eps: 1.0e-06
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 1500
29 changes: 29 additions & 0 deletions egs2/wsj/asr1/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,34 @@
# RESULTS

## Whisper [medium_finetuning](conf/tuning/train_asr_whisper_full_warmup3000.yaml) without LM

### Environments
- date: `Fri Jul 21 12:47:17 JST 2023`
- python version: `3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]`
- espnet version: `espnet 202304`
- pytorch version: `pytorch 1.13.1`
- Git hash: `d7172fcb7181ffdcca9c0061400254b63e37bf21`
- Commit date: `Sat Jul 15 15:01:30 2023 +0900`
- Pretrained URL: [espnet/yoshiki_wsj_whisper_medium_finetuning](https://huggingface.co/espnet/yoshiki_wsj_whisper_medium_finetuning)

- token_type: whisper_multilingual
- cleaner: whisper_en

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/test_dev93|503|8132|97.7|2.0|0.4|0.3|2.6|24.9|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/test_eval92|333|5564|98.5|1.4|0.1|0.2|1.7|19.5|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/test_dev93|503|46683|99.4|0.3|0.3|0.2|0.8|24.9|
|decode_asr_whisper_noctc_greedy_asr_model_valid.acc.ave/test_eval92|333|32096|99.6|0.2|0.2|0.2|0.6|19.5|


## Self-supervised learning features [HuBERT_large_ll60k, Conformer, utt_mvn](conf/tuning/train_asr_conformer_s3prlfrontend_hubert.yaml) with [Transformer-LM](conf/tuning/train_lm_transformer.yaml)

### Environments
Expand Down
6 changes: 6 additions & 0 deletions egs2/wsj/asr1/conf/decode_asr_whisper_noctc_greedy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# ESPnet2 decoding config for the Whisper fine-tuned models: greedy
# (beam size 1) attention-only decoding; CTC and LM scores are disabled.
beam_size: 1
ctc_weight: 0.0
lm_weight: 0.0
# NOTE(review): length ratios are presumably relative to the encoder
# output length — confirm against espnet2 BeamSearch before tuning.
maxlenratio: 0.3
minlenratio: 0.0
penalty: 0.0
74 changes: 74 additions & 0 deletions egs2/wsj/asr1/conf/tuning/train_asr_whisper_full_warmup3000.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# ESPnet2 training config: full fine-tuning of OpenAI Whisper "medium"
# on WSJ, attention-only (no CTC), with a 3000-step LR warmup.
# NOTE(review): the indentation below restores the nesting that the
# flattened source lost; key names and values are unchanged.

# Whisper consumes raw audio and computes log-mel features internally,
# so no external feature normalization is applied.
normalize: null

encoder: whisper
encoder_conf:
    whisper_model: medium
    dropout_rate: 0.0
    # SpecAugment applied inside the Whisper encoder wrapper.
    use_specaug: true
    specaug_conf:
        apply_time_warp: true
        time_warp_window: 5
        time_warp_mode: bicubic
        apply_freq_mask: true
        freq_mask_width_range:
        - 0
        - 27
        num_freq_mask: 2
        apply_time_mask: true
        time_mask_width_ratio_range:
        - 0.0   # normalized from "0." — same value, less easy to misread
        - 0.05
        num_time_mask: 5


decoder: whisper
decoder_conf:
    whisper_model: medium
    dropout_rate: 0.0

model_conf:
    # Pure attention training/decoding: the CTC branch is disabled.
    ctc_weight: 0.0
    lsm_weight: 0.1                  # label smoothing weight
    length_normalized_loss: false
    # Whisper's native special tokens replace ESPnet's default sos/eos.
    sym_sos: "<|startoftranscript|>"
    sym_eos: "<|endoftext|>"
    extract_feats_in_collect_stats: false


frontend: null
input_size: 1 # to prevent build_model() from complaining

seed: 2022
log_interval: 100
num_att_plot: 0
num_workers: 4
sort_in_batch: descending # how to sort data in making batch
sort_batch: descending # how to sort created batches
batch_type: numel
batch_bins: 8000000 # good for single GPU w/ 40G mem
accum_grad: 4
max_epoch: 3
# "none" is the literal string here; ESPnet's option parsing maps it to None.
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 3

use_amp: true
cudnn_deterministic: false
cudnn_benchmark: false

optim: adamw
grad_clip: 1.0
optim_conf:
    lr: 1.0e-05
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.99
    eps: 1.0e-06
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 3000

0 comments on commit 42f3182

Please sign in to comment.