Skip to content

Commit

Permalink
Fix ST inference and CI test errors.
Browse files Browse the repository at this point in the history
  • Loading branch information
pengchengguo committed Oct 11, 2023
1 parent df4536b commit 2d3ef4f
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 12 deletions.
6 changes: 1 addition & 5 deletions egs2/TEMPLATE/st1/st.sh
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ elif [ "${tgt_token_type}" = char ]; then
elif [ "${tgt_token_type}" = word ]; then
tgt_token_list="${tgt_wordtoken_list}"
tgt_bpemodel=none
elif [ "${tgt_token_type}" = whisper_en ]; then # should make token_list an output filepath here
elif [ "${tgt_token_type}" = whisper_en ]; then
tgt_token_list="${token_listdir}"/tgt_whisper_en/tokens.txt
tgt_bpemodel=whisper_en
hyp_cleaner=${cleaner}
Expand Down Expand Up @@ -830,8 +830,6 @@ if ! "${skip_data_prep}"; then
elif grep -q "whisper" <<< ${tgt_token_type}; then
log "Stage 5a: Generate whisper token_list from ${tgt_token_type} tokenizer"

# The first symbol in token_list must be "<blank>" and the last must be also sos/eos:
# 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
echo ${tgt_token_list}
${python} -m espnet2.bin.whisper_export_vocabulary \
--whisper_model "${tgt_token_type}" \
Expand Down Expand Up @@ -927,8 +925,6 @@ if ! "${skip_data_prep}"; then
elif grep -q "whisper" <<< ${src_token_type}; then
log "Stage 5b: Generate whisper token_list from ${src_token_type} tokenizer"

# The first symbol in token_list must be "<blank>" and the last must be also sos/eos:
# 0 is reserved for CTC-blank for ASR and also used as ignore-index in the other task
echo ${src_token_list}
${python} -m espnet2.bin.whisper_export_vocabulary \
--whisper_model "${src_token_type}" \
Expand Down
1 change: 1 addition & 0 deletions espnet/nets/e2e_mt_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __init__(self, char_list, sym_space, sym_pad, report_bleu=False):
if self.pad in self.char_list:
self.idx_blank = self.char_list.index(self.pad)
else:
# for OpenAI Whisper model, which doesn't use <blank> token
self.idx_blank = None

Check warning on line 37 in espnet/nets/e2e_mt_common.py

View check run for this annotation

Codecov / codecov/patch

espnet/nets/e2e_mt_common.py#L37

Added line #L37 was not covered by tests
if self.space in self.char_list:
self.idx_space = self.char_list.index(self.space)
Expand Down
5 changes: 1 addition & 4 deletions espnet2/bin/asr_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,10 +393,7 @@ def __init__(
elif bpemodel not in ["whisper_en", "whisper_multilingual"]:
converter = TokenIDConverter(token_list=token_list)
else:
if (
hasattr(asr_train_args, "preprocessor_conf")
and "speaker_change_symbol" in asr_train_args.preprocessor_conf
):
if "speaker_change_symbol" in preprocessor_conf:

Check warning on line 396 in espnet2/bin/asr_inference.py

View check run for this annotation

Codecov / codecov/patch

espnet2/bin/asr_inference.py#L396

Added line #L396 was not covered by tests
sot_asr = True
else:
sot_asr = False
Expand Down
12 changes: 9 additions & 3 deletions espnet2/bin/st_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,12 +360,13 @@ def __init__(

# 4. [Optional] Build Text converter: e.g. bpe-sym -> Text
# compatibility for whisper tokenizer
whisper_language = st_train_args.preprocessor_conf.get("whisper_language", None)
preprocessor_conf = getattr(st_train_args, "preprocessor_conf", {})
whisper_language = preprocessor_conf.get("whisper_language", None)
whisper_task = preprocessor_conf.get("whisper_task", None)
if whisper_language:
src_token_lang, token_lang = whisper_language

Check warning on line 367 in espnet2/bin/st_inference.py

View check run for this annotation

Codecov / codecov/patch

espnet2/bin/st_inference.py#L367

Added line #L367 was not covered by tests
else:
src_token_lang, token_lang = None, None
whisper_task = st_train_args.preprocessor_conf.get("whisper_task", None)

if token_type is None:
token_type = st_train_args.token_type
Expand All @@ -383,7 +384,6 @@ def __init__(
tokenizer = build_tokenizer(

Check warning on line 384 in espnet2/bin/st_inference.py

View check run for this annotation

Codecov / codecov/patch

espnet2/bin/st_inference.py#L384

Added line #L384 was not covered by tests
token_type=token_type,
bpemodel=bpemodel,
# Whisper model only supports X -> En translation
whisper_language=token_lang,
whisper_task=whisper_task,
)
Expand All @@ -397,6 +397,9 @@ def __init__(
language=token_lang or "en",
task=whisper_task or "translate",
)
beam_search.set_hyp_primer(

Check warning on line 400 in espnet2/bin/st_inference.py

View check run for this annotation

Codecov / codecov/patch

espnet2/bin/st_inference.py#L400

Added line #L400 was not covered by tests
list(converter.tokenizer.sot_sequence_including_notimestamps)
)
else:
converter = TokenIDConverter(token_list=token_list)
logging.info(f"Text tokenizer: {tokenizer}")
Expand Down Expand Up @@ -426,6 +429,9 @@ def __init__(
language=src_token_lang or "en",
task=whisper_task or "translate",
)
asr_beam_search.set_hyp_primer(

Check warning on line 432 in espnet2/bin/st_inference.py

View check run for this annotation

Codecov / codecov/patch

espnet2/bin/st_inference.py#L432

Added line #L432 was not covered by tests
list(src_converter.tokenizer.sot_sequence_including_notimestamps)
)
else:
src_converter = TokenIDConverter(token_list=src_token_list)
logging.info(f"Src Text tokenizer: {src_tokenizer}")
Expand Down
1 change: 1 addition & 0 deletions espnet2/st/espnet_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def __init__(
if tgt_sym_blank in token_list:
self.blank_id = token_list.index(tgt_sym_blank)

Check warning on line 137 in espnet2/st/espnet_model.py

View check run for this annotation

Codecov / codecov/patch

espnet2/st/espnet_model.py#L136-L137

Added lines #L136 - L137 were not covered by tests
else:
# OpenAI Whisper model doesn't use <blank> token
self.blank_id = 0

Check warning on line 140 in espnet2/st/espnet_model.py

View check run for this annotation

Codecov / codecov/patch

espnet2/st/espnet_model.py#L140

Added line #L140 was not covered by tests
self.st_criterion_transducer = RNNTLoss(
blank=self.blank_id,
Expand Down
1 change: 1 addition & 0 deletions test/espnet2/layers/test_create_lora_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest
import torch
from packaging.version import parse as V

from espnet2.asr.decoder.transformer_decoder import TransformerDecoder
from espnet2.layers.create_lora_adapter import create_lora_adapter
Expand Down

0 comments on commit 2d3ef4f

Please sign in to comment.