Standalone Transducer v1.1 #5140

Merged: 61 commits, Jun 20, 2023

Commits (61)
aa0f1a0  fix chunk mask (b-flo, Nov 23, 2022)
9bd640a  remove right context + minor fixes (b-flo, Nov 23, 2022)
028b2d5  monkey patch chunk-by-chunk decoding before rework (b-flo, Nov 23, 2022)
7940ec0  rework v0.1 (b-flo, Dec 6, 2022)
edeacc9  Merge branch 'master' into refactoring (b-flo, Dec 6, 2022)
12636be  bump to v0.2 (b-flo, Dec 7, 2022)
0c57964  update streaming tests (b-flo, Dec 7, 2022)
0e1691b  remove old commented code (b-flo, Dec 7, 2022)
ce615fc  add back buffering (b-flo, Dec 8, 2022)
87acd89  alternative v0.2 (b-flo, Dec 16, 2022)
8ce6802  Merge branch 'master' into refactoring (b-flo, Dec 20, 2022)
0606aa4  add back display_partial_hypotheses option + minor fixes (b-flo, Jan 3, 2023)
bd3062b  Merge branch 'master' into refactoring (b-flo, Jan 3, 2023)
6cc36ba  remove unused code (b-flo, Jan 3, 2023)
17b917b  fix convinput subsampling tests (b-flo, Jan 3, 2023)
97cb471  Merge branch 'master' into refactoring (b-flo, Jan 4, 2023)
7267f5b  remove math lib usage (b-flo, Jan 4, 2023)
f7bbb4e  improve doc and tutorial for left context/chunks (b-flo, Jan 4, 2023)
e4a4317  improve doc and tutorial for left context/chunks (2) (b-flo, Jan 4, 2023)
8b1cd2c  fix streaming test (b-flo, Jan 4, 2023)
722457e  Merge branch 'master' into refactoring (b-flo, Jan 18, 2023)
643e964  v0.2 stable (b-flo, Feb 1, 2023)
4b0f196  Merge branch 'master' into refactoring (b-flo, Feb 1, 2023)
2229ef2  add offline/online ebranchformer + tests (b-flo, Feb 5, 2023)
9756379  add layerdrop (w/ decay) (b-flo, Feb 6, 2023)
0c825d0  update doc (b-flo, Feb 6, 2023)
86b0577  small fix for layerdrop (b-flo, Feb 7, 2023)
db4a344  apply new black (b-flo, Feb 9, 2023)
01893f8  Merge branch 'master' into refactoring (b-flo, Feb 9, 2023)
ee30ce9  add back dec proj bias + remove merge mod dropout (b-flo, Feb 14, 2023)
9ed81b1  slight refactor before adding MEGA decoder (b-flo, Apr 20, 2023)
9ca8a4b  add error calc. fix to avoid conflicts (b-flo, Apr 20, 2023)
f52e2d2  add MEGA decoder + docs (b-flo, Apr 20, 2023)
e06e905  Merge branch 'master' into refactoring (b-flo, Apr 20, 2023)
ad09c47  add chunk mechanism (b-flo, Apr 24, 2023)
7e51a6e  Merge branch 'master' into refactoring (b-flo, Apr 24, 2023)
d341500  fix device mismatch (b-flo, Apr 26, 2023)
cfb3b52  monkey patch states for inference (b-flo, Apr 26, 2023)
5d9398b  add first unit tests for mega (b-flo, Apr 26, 2023)
7de9f26  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 26, 2023)
40043d8  remove unused methods for task (b-flo, Apr 27, 2023)
d108dc7  improve mega coverage (b-flo, Apr 27, 2023)
a466e74  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 27, 2023)
246104d  fix conflict (b-flo, Apr 28, 2023)
72aab0c  Merge branch 'master' into refactoring (b-flo, Apr 28, 2023)
34befa7  fix streaming integration test (b-flo, Apr 28, 2023)
859abef  stitch branches (rwkv, mega/general fixes, etc) (b-flo, Jun 1, 2023)
ae1d400  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jun 1, 2023)
e3192f9  Merge branch 'master' into refactoring (b-flo, Jun 1, 2023)
5651dad  add missing init files (b-flo, Jun 1, 2023)
17a624c  add missing init files (2) (b-flo, Jun 1, 2023)
a085a20  add rescaling option during inference (b-flo, Jun 5, 2023)
65dc349  Merge branch 'master' into refactoring (b-flo, Jun 5, 2023)
81798b3  add missing guard conditions (b-flo, Jun 5, 2023)
aa0045d  add rwkv tests + fixes (b-flo, Jun 5, 2023)
04cc557  add ninja install through warp-transducer install (b-flo, Jun 6, 2023)
8bc222c  add skip for rwkv tests without gpu (b-flo, Jun 7, 2023)
a59c94b  remove unused import (b-flo, Jun 7, 2023)
4f81c23  improve/fix documentation for new additions (b-flo, Jun 7, 2023)
f1632c0  fix typos (type, docs) (b-flo, Jun 13, 2023)
683a291  Merge branch 'master' into refactoring (b-flo, Jun 13, 2023)
Files changed
README.md: 7 changes (4 additions, 3 deletions)

@@ -75,9 +75,9 @@
  - Data augmentation
  - **Transducer** based end-to-end ASR
    - Architecture:
-     - RNN-based encoder and decoder.
-     - Custom encoder and decoder supporting Transformer, Conformer (encoder), 1D Conv / TDNN (encoder) and causal 1D Conv (decoder) blocks.
-     - VGG2L (RNN/custom encoder) and Conv2D (custom encoder) bottlenecks.
+     - Custom encoder supporting RNNs, Conformer, Branchformer (w/ variants), 1D Conv / TDNN.
+     - Decoder w/ parameters shared across blocks supporting RNN, stateless w/ 1D Conv, [[MEGA]](https://arxiv.org/abs/2209.10655), and [[RWKV]](https://arxiv.org/abs/2305.13048).
+     - Pre-encoder: VGG2L or Conv2D available.
    - Search algorithms:
      - Greedy search constrained to one emission by timestep.
      - Default beam search algorithm [[Graves, 2012]](https://arxiv.org/abs/1211.3711) without prefix search.
@@ -86,6 +86,7 @@
      - N-step Constrained beam search modified from [[Kim et al., 2020]](https://arxiv.org/abs/2002.03577).
      - modified Adaptive Expansion Search based on [[Kim et al., 2021]](https://ieeexplore.ieee.org/abstract/document/9250505) and NSC.
    - Features:
+     - Unified interface for offline and streaming speech recognition.
      - Multi-task learning with various auxiliary losses:
        - Encoder: CTC, auxiliary Transducer and symmetric KL divergence.
        - Decoder: cross-entropy w/ label smoothing.
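The rewritten decoder entry is the heart of this release: one decoder interface now covers RNN, stateless, MEGA and RWKV variants. As a quick orientation, here is a hedged sketch (a plain mapping, not code from this PR; the class names are assumptions based on espnet2 naming conventions):

```python
# Hedged sketch, not from this diff: the decoder families named in the README
# entry above, mapped to the espnet2 class names they are assumed to use.
# Verify against espnet2/tasks/asr_transducer.py before relying on them.
DECODER_FAMILIES = {
    "rnn": "RNNDecoder",              # recurrent decoder, tuple hidden state
    "stateless": "StatelessDecoder",  # embedding w/ 1D Conv, no hidden state
    "mega": "MEGADecoder",            # https://arxiv.org/abs/2209.10655
    "rwkv": "RWKVDecoder",            # https://arxiv.org/abs/2305.13048
}

# All four are assumed to sit behind the same AbsDecoder interface shown
# further down in this diff, so the beam search code stays decoder-agnostic.
for name, cls in DECODER_FAMILIES.items():
    print(f"{name:10s} -> {cls}")
```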
ci/test_integration_espnet2.sh: 2 changes (1 addition, 1 deletion)

@@ -103,7 +103,7 @@ if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then
          --encoder_conf main_conf='{'dynamic_chunk_training': True}' \
          --encoder_conf body_conf='[{'block_type': 'conformer', 'hidden_size': 30, 'linear_size': 30, 'heads': 2, 'conv_mod_kernel_size': 3}]' \
          --decoder_conf='{'embed_size': 30, 'hidden_size': 30}' --joint_network_conf joint_space_size=30 " \
-         --inference-args "--streaming true --chunk_size 2 --left_context 2 --right_context 0"
+         --inference-args "--streaming true --decoding_window 160 --left_context 2"
      done
  fi

doc/espnet2_tutorial.md: 148 changes (104 additions, 44 deletions)

Large diffs are not rendered by default.

espnet2/asr_transducer/activation.py: 2 changes (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
- """Activation functions for Transducer."""
+ """Activation functions for Transducer models."""

  import torch
  from packaging.version import parse as V
espnet2/asr_transducer/beam_search_transducer.py: 21 changes (7 additions, 14 deletions)

@@ -17,8 +17,7 @@ class Hypothesis:
      Args:
          score: Total log-probability.
          yseq: Label sequence as integer ID sequence.
-         dec_state: RNNDecoder or StatelessDecoder state.
-             ((N, 1, D_dec), (N, 1, D_dec) or None) or None
+         dec_state: RNN/MEGA Decoder state (None if Stateless).
          lm_state: RNNLM state. ((N, D_lm), (N, D_lm)) or None

      """
@@ -51,7 +50,7 @@ class BeamSearchTransducer:
          decoder: Decoder module.
          joint_network: Joint network module.
          beam_size: Size of the beam.
-         lm: LM class.
+         lm: LM module.
          lm_weight: LM weight for soft fusion.
          search_type: Search algorithm to use during inference.
          max_sym_exp: Number of maximum symbol expansions at each time step. (TSD)
@@ -146,7 +145,7 @@ def __init__(
          self.score_norm = score_norm
          self.nbest = nbest

-         self.reset_inference_cache()
+         self.reset_cache()

      def __call__(
          self,
@@ -168,16 +167,16 @@ def __call__(
          hyps = self.search_algorithm(enc_out)

          if is_final:
-             self.reset_inference_cache()
+             self.reset_cache()

              return self.sort_nbest(hyps)

          self.search_cache = hyps

          return hyps

-     def reset_inference_cache(self) -> None:
-         """Reset cache for decoder scoring and streaming."""
+     def reset_cache(self) -> None:
+         """Reset cache for streaming decoding."""
          self.decoder.score_cache = {}
          self.search_cache = None

@@ -312,14 +311,7 @@ def default_beam_search(self, enc_out: torch.Tensor) -> List[Hypothesis]:
          max_hyp = max(hyps, key=lambda x: x.score)
          hyps.remove(max_hyp)

-         label = torch.full(
-             (1, 1),
-             max_hyp.yseq[-1],
-             dtype=torch.long,
-             device=self.decoder.device,
-         )
          dec_out, state = self.decoder.score(
-             label,
              max_hyp.yseq,
              max_hyp.dec_state,
          )
@@ -405,6 +397,7 @@ def align_length_sync_decoding(

          B_ = []
          B_enc_out = []
+
          for hyp in B:
              u = len(hyp.yseq) - 1
              t = i - u
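Taken together, the renamed reset_cache, the search_cache assignment, and the is_final branch in __call__ define the streaming contract: partial hypotheses persist between calls, and the final call returns a sorted n-best while clearing the caches. A minimal sketch of that loop, assuming a constructed BeamSearchTransducer named beam_search and a list of encoder output chunks (the is_final keyword is inferred from the body shown above):

```python
# Hedged sketch of the streaming decoding loop implied by __call__ above.
# beam_search: a BeamSearchTransducer instance (decoder + joint network).
# enc_chunks: list of encoder output tensors produced chunk by chunk.
hyps = None
for i, enc_out in enumerate(enc_chunks):
    last_chunk = i == len(enc_chunks) - 1
    # Intermediate calls cache partial hypotheses in search_cache; the final
    # call resets decoder.score_cache/search_cache and returns sorted n-best.
    hyps = beam_search(enc_out, is_final=last_chunk)

best = hyps[0]
print(best.yseq, best.score)  # fields of the Hypothesis dataclass above
```

Note that default_beam_search now passes max_hyp.yseq straight to decoder.score instead of building a (1, 1) label tensor first; turning the sequence into a label is the decoder's job under the new AbsDecoder contract shown next.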
espnet2/asr_transducer/decoder/abs_decoder.py: 92 changes (70 additions, 22 deletions)

@@ -1,7 +1,7 @@
  """Abstract decoder definition for Transducer models."""

  from abc import ABC, abstractmethod
- from typing import Any, List, Optional, Tuple
+ from typing import Any, Dict, List, Optional, Tuple, Union

  import torch

@@ -14,33 +14,40 @@ def forward(self, labels: torch.Tensor) -> torch.Tensor:
          """Encode source label sequences.

          Args:
-             labels: Label ID sequences. (B, L)
+             labels: Label ID sequences.

          Returns:
-             dec_out: Decoder output sequences. (B, T, D_dec)
+             : Decoder output sequences.

          """
          raise NotImplementedError

      @abstractmethod
      def score(
          self,
-         label: torch.Tensor,
          label_sequence: List[int],
-         dec_state: Optional[Tuple[torch.Tensor, Optional[torch.Tensor]]],
-     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, Optional[torch.Tensor]]]]:
+         states: Union[
+             List[Dict[str, torch.Tensor]],
+             List[torch.Tensor],
+             Tuple[torch.Tensor, Optional[torch.Tensor]],
+         ],
+     ) -> Tuple[
+         torch.Tensor,
+         Union[
+             List[Dict[str, torch.Tensor]],
+             List[torch.Tensor],
+             Tuple[torch.Tensor, Optional[torch.Tensor]],
+         ],
+     ]:
          """One-step forward hypothesis.

          Args:
-             label: Previous label. (1, 1)
              label_sequence: Current label sequence.
-             dec_state: Previous decoder hidden states.
-                 ((N, 1, D_dec), (N, 1, D_dec) or None) or None
+             states: Decoder hidden states.

          Returns:
-             dec_out: Decoder output sequence. (1, D_dec) or (1, D_emb)
-             dec_state: Decoder hidden states.
-                 ((N, 1, D_dec), (N, 1, D_dec) or None) or None
+             out: Decoder output sequence.
+             states: Decoder hidden states.

          """
          raise NotImplementedError

@@ -49,16 +56,22 @@ def score(
      def batch_score(
          self,
          hyps: List[Any],
-     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, Optional[torch.Tensor]]]]:
+     ) -> Tuple[
+         torch.Tensor,
+         Union[
+             List[Dict[str, torch.Tensor]],
+             List[torch.Tensor],
+             Tuple[torch.Tensor, Optional[torch.Tensor]],
+         ],
+     ]:
          """One-step forward hypotheses.

          Args:
              hyps: Hypotheses.

          Returns:
-             dec_out: Decoder output sequences. (B, D_dec) or (B, D_emb)
+             out: Decoder output sequences.
              states: Decoder hidden states.
-                 ((N, B, D_dec), (N, B, D_dec) or None) or None

          """
          raise NotImplementedError

@@ -76,35 +89,70 @@ def set_device(self, device: torch.Tensor) -> None:
      @abstractmethod
      def init_state(
          self, batch_size: int
-     ) -> Optional[Tuple[torch.Tensor, Optional[torch.tensor]]]:
+     ) -> Union[
+         List[Dict[str, torch.Tensor]],
+         List[torch.Tensor],
+         Tuple[torch.Tensor, Optional[torch.tensor]],
+     ]:
          """Initialize decoder states.

          Args:
              batch_size: Batch size.

          Returns:
-             : Initial decoder hidden states.
-                 ((N, B, D_dec), (N, B, D_dec) or None) or None
+             : Decoder hidden states.

          """
          raise NotImplementedError

      @abstractmethod
      def select_state(
          self,
-         states: Optional[Tuple[torch.Tensor, Optional[torch.Tensor]]] = None,
+         states: Union[
+             List[Dict[str, torch.Tensor]],
+             List[torch.Tensor],
+             Tuple[torch.Tensor, Optional[torch.Tensor]],
+         ],
          idx: int = 0,
-     ) -> Optional[Tuple[torch.Tensor, Optional[torch.Tensor]]]:
+     ) -> Union[
+         List[Dict[str, torch.Tensor]],
+         List[torch.Tensor],
+         Tuple[torch.Tensor, Optional[torch.Tensor]],
+     ]:
          """Get specified ID state from batch of states, if provided.

          Args:
              states: Decoder hidden states.
-                 ((N, B, D_dec), (N, B, D_dec) or None) or None
              idx: State ID to extract.

          Returns:
              : Decoder hidden state for given ID.
-                 ((N, 1, D_dec), (N, 1, D_dec) or None) or None

          """
          raise NotImplementedError

+     @abstractmethod
+     def create_batch_states(
+         self,
+         new_states: List[
+             Union[
+                 List[Dict[str, Optional[torch.Tensor]]],
+                 List[List[torch.Tensor]],
+                 Tuple[torch.Tensor, Optional[torch.Tensor]],
+             ],
+         ],
+     ) -> Union[
+         List[Dict[str, torch.Tensor]],
+         List[torch.Tensor],
+         Tuple[torch.Tensor, Optional[torch.Tensor]],
+     ]:
+         """Create batch of decoder hidden states given a list of new states.
+
+         Args:
+             new_states: Decoder hidden states.
+
+         Returns:
+             : Decoder hidden states.
+
+         """
+         raise NotImplementedError
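To make the widened contract concrete, here is a hedged toy implementation in the spirit of the stateless variant, where every state is None. Only the abstract method names and signatures come from the diff above; the class name, the sizes, and the score_cache attribute (which BeamSearchTransducer.reset_cache clears) are illustrative, and AbsDecoder is assumed to mix in torch.nn.Module as in espnet2:

```python
# Hedged sketch, not from this PR: a minimal decoder satisfying the
# AbsDecoder contract above. A stateless decoder carries no hidden state,
# so every state-related hook can simply return None.
from typing import Any, List

import torch

from espnet2.asr_transducer.decoder.abs_decoder import AbsDecoder


class ToyStatelessDecoder(AbsDecoder):
    def __init__(self, vocab_size: int, embed_size: int = 256) -> None:
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, embed_size)
        self.output_size = embed_size
        self.device = torch.device("cpu")
        self.score_cache: dict = {}  # cleared by BeamSearchTransducer.reset_cache

    def forward(self, labels: torch.Tensor) -> torch.Tensor:
        return self.embed(labels)  # (B, L) -> (B, L, D_emb)

    def score(self, label_sequence: List[int], states: Any) -> tuple:
        # One-step output for the last label of the hypothesis.
        label = torch.tensor([[label_sequence[-1]]], device=self.device)
        return self.embed(label)[0], states  # (1, D_emb), state unchanged

    def batch_score(self, hyps: List[Any]) -> tuple:
        labels = torch.tensor([[h.yseq[-1]] for h in hyps], device=self.device)
        return self.embed(labels).squeeze(1), None  # (B, D_emb), no states

    def set_device(self, device: torch.device) -> None:
        self.device = device

    def init_state(self, batch_size: int) -> None:
        return None  # nothing to initialize

    def select_state(self, states: Any, idx: int = 0) -> None:
        return None

    def create_batch_states(self, new_states: List[Any]) -> None:
        return None
```

Stateful decoders return their concrete structures through these same hooks; the Union annotations above suggest tuples for the RNN decoder and lists of tensors or per-block dicts for the newer decoders, though the exact mapping is not visible in this excerpt.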
Empty file.