diff --git a/.gitignore b/.gitignore index b63d90c7bcb..d2d9850cfcb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ \#*\# .\#* *DS_Store +dummy_token_list +empty.py out.txt espnet.egg-info/ doc/_build @@ -31,6 +33,8 @@ test_spm.model *.nfs* constraints.txt +out/config.yaml + # recipe related egs*/*/*/data* egs*/*/*/db @@ -48,6 +52,7 @@ egs*/*/*/nltk* egs*/*/*/.cache* egs*/*/*/pretrained_models* egs*/fisher_callhome_spanish/*/local/mapping* +egs2/test/* # tools related tools/chainer diff --git a/ci/test_configuration_espnet2.sh b/ci/test_configuration_espnet2.sh index c298081d65c..aba67ce11ec 100755 --- a/ci/test_configuration_espnet2.sh +++ b/ci/test_configuration_espnet2.sh @@ -20,9 +20,38 @@ python3 -m pip uninstall -y chainer echo "" > dummy_token_list echo "==== [ESPnet2] Validation configuration files ===" if python3 -c 'import torch as t; from packaging.version import parse as L; assert L(t.__version__) >= L("1.8.0")' &> /dev/null; then + + s3prl_confs='[ "egs2/fsc/asr1/conf/train_asr.yaml", + "egs2/americasnlp22/asr1/conf/train_asr_transformer.yaml", + "egs2/aphasiabank/asr1/conf/train_asr.yaml", + "egs2/bur_openslr80/asr1/conf/train_asr_hubert_transformer_adam_specaug.yaml", + "egs2/catslu/asr1/conf/train_asr.yaml", + "egs2/dcase22_task1/asr1/conf/train_asr.yaml", + "egs2/fleurs/asr1/conf/train_asr.yaml", + "egs2/fsc_challenge/asr1/conf/train_asr.yaml", + "egs2/fsc_unseen/asr1/conf/train_asr.yaml", + "egs2/meld/asr1/conf/train_asr.yaml", + "egs2/microsoft_speech/asr1/conf/train_asr.yaml", + "egs2/mini_an4/asr1/conf/train_asr_transducer_debug.yaml", + "egs2/slue-voxceleb/asr1/conf/train_asr.yaml", + "egs2/slue-voxpopuli/asr1/conf/train_asr.yaml", + "egs2/stop/asr1/conf/train_asr2_hubert_lr0.002.yaml", + "egs2/stop/asr1/conf/train_asr2_wav2vec2_lr0.002.yaml", + "egs2/stop/asr1/conf/train_asr2_wavlm_branchformer.yaml", + "egs2/stop/asr1/conf/train_asr2_wavlm_lr0.002.yaml", + "egs2/swbd_da/asr1/conf/train_asr.yaml", + "egs2/totonac/asr1/conf/train_asr.yaml" ]' + + warprnnt_confs='[ "egs2/librispeech/asr1/conf/train_asr_rnnt.yaml" ]' + for f in egs2/*/asr1/conf/train_asr*.yaml; do - if [ "$f" == "egs2/fsc/asr1/conf/train_asr.yaml" ]; then - if ! python3 -c "import s3prl" > /dev/null; then + if [[ ${s3prl_confs} =~ \"${f}\" ]]; then + if ! python3 -c "import s3prl" &> /dev/null; then + continue + fi + fi + if [[ ${warprnnt_confs} =~ \"${f}\" ]]; then + if !
python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then continue fi fi diff --git a/ci/test_integration_espnet1.sh b/ci/test_integration_espnet1.sh index 1042119ec0c..76a662ef2ab 100755 --- a/ci/test_integration_espnet1.sh +++ b/ci/test_integration_espnet1.sh @@ -46,31 +46,33 @@ echo "=== ASR (backend=pytorch num-encs 2, model=transformer) ===" ./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ --decode-config conf/decode.yaml -# test transducer recipe -echo "=== ASR (backend=pytorch, model=rnnt) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=transformer-transducer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_transducer.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=conformer-transducer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_transducer.yaml \ - --decode-config conf/decode_transducer.yaml - -# test transducer with auxiliary task recipe -echo "=== ASR (backend=pytorch, model=rnnt, tasks=L1+L2+L3+L4+L5)" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_aux.yaml \ - --decode-config conf/decode_transducer.yaml - -# test finetuning -## test transfer learning -echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=enc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_enc.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=LM) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_lm.yaml \ - --decode-config conf/decode_transducer.yaml -## to do: cover all tasks + freezing option +if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then + # test transducer recipe + echo "=== ASR (backend=pytorch, model=rnnt) ===" + ./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer.yaml \ + --decode-config conf/decode_transducer.yaml + echo "=== ASR (backend=pytorch, model=transformer-transducer) ===" + ./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_transducer.yaml \ + --decode-config conf/decode_transducer.yaml + echo "=== ASR (backend=pytorch, model=conformer-transducer) ===" + ./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_transducer.yaml \ + --decode-config conf/decode_transducer.yaml + + # test transducer with auxiliary task recipe + echo "=== ASR (backend=pytorch, model=rnnt, tasks=L1+L2+L3+L4+L5) ===" + ./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_aux.yaml \ + --decode-config conf/decode_transducer.yaml + + # test finetuning + ## test transfer learning + echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=enc) ===" + ./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_enc.yaml \ + --decode-config conf/decode_transducer.yaml + echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=LM) ===" + ./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_lm.yaml \ + --decode-config conf/decode_transducer.yaml + ## to do: cover all tasks + freezing option +fi echo "==== ASR (backend=pytorch num-encs 2) ===" ./run.sh --python "${python}" --stage 2 --train-config ./conf/train_mulenc2.yaml --decode-config ./conf/decode_mulenc2.yaml
--mulenc true diff --git a/ci/test_integration_espnet2.sh b/ci/test_integration_espnet2.sh index e79bd05a9ee..fbe066fec49 100755 --- a/ci/test_integration_espnet2.sh +++ b/ci/test_integration_espnet2.sh @@ -65,21 +65,23 @@ echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_fe --decoder=transformer --decoder_conf='{'attention_heads': 2, 'linear_units': 2, 'num_blocks': 1}' --max_epoch 1 --num_iters_per_epoch 1 --batch_size 2 --batch_type folded --num_workers 0" -echo "==== Transducer, feats_type=raw, token_types=bpe ===" -./run.sh --asr-tag "espnet_model_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false \ - --feats-type "raw" --token-type "bpe" --python "${python}" \ - --asr-args "--decoder transducer --decoder_conf hidden_size=2 --model_conf ctc_weight=0.0 --joint_net_conf joint_space_size=2 --num_workers 0 \ - --best_model_criterion '(valid, loss, min)'" --inference_asr_model "valid.loss.best.pth" - -if [ "$(python3 -c "import torch; print(torch.cuda.is_available())")" == "True" ]; then - echo "==== Multi-Blank Transducer, feats_type=raw, token_types=bpe ===" - ./run.sh --asr-tag "espnet_model_multi_blank_transducer" --ngpu 1 --stage 10 --stop-stage 13 --skip-upload false \ +if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then + echo "==== Transducer, feats_type=raw, token_types=bpe ===" + ./run.sh --asr-tag "espnet_model_transducer" --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false \ --feats-type "raw" --token-type "bpe" --python "${python}" \ - --asr-tag "train_multi_black_transducer" \ - --asr_args "--decoder transducer --decoder_conf hidden_size=2 --model_conf ctc_weight=0.0 --joint_net_conf joint_space_size=2 \ - --best_model_criterion '(valid, loss, min)' --model_conf transducer_multi_blank_durations=[2] \ - --max_epoch 1 --num_iters_per_epoch 1 --batch_size 2 --batch_type folded --num_workers 0" \ - --inference_asr_model "valid.loss.best.pth" --inference_config "conf/decode_multi_blank_transducer_debug.yaml" + --asr-args "--decoder transducer --decoder_conf hidden_size=2 --model_conf ctc_weight=0.0 --joint_net_conf joint_space_size=2 --num_workers 0 \ + --best_model_criterion '(valid, loss, min)'" --inference_asr_model "valid.loss.best.pth" + + if [ "$(python3 -c "import torch; print(torch.cuda.is_available())")" == "True" ]; then + echo "==== Multi-Blank Transducer, feats_type=raw, token_types=bpe ===" + ./run.sh --asr-tag "espnet_model_multi_blank_transducer" --ngpu 1 --stage 10 --stop-stage 13 --skip-upload false \ + --feats-type "raw" --token-type "bpe" --python "${python}" \ + --asr-tag "train_multi_blank_transducer" \ + --asr_args "--decoder transducer --decoder_conf hidden_size=2 --model_conf ctc_weight=0.0 --joint_net_conf joint_space_size=2 \ + --best_model_criterion '(valid, loss, min)' --model_conf transducer_multi_blank_durations=[2] \ + --max_epoch 1 --num_iters_per_epoch 1 --batch_size 2 --batch_type folded --num_workers 0" \ + --inference_asr_model "valid.loss.best.pth" --inference_config "conf/decode_multi_blank_transducer_debug.yaml" + fi fi if python3 -c "import k2" &> /dev/null; then @@ -311,7 +313,9 @@ cd ./egs2/mini_an4/s2st1 gen_dummy_coverage echo "==== [ESPnet2] S2ST ===" ./run.sh --ngpu 0 --stage 1 --stop_stage 8 --use_discrete_unit false --s2st_config conf/s2st_spec_debug.yaml --python "${python}" -./run.sh --ngpu 0 --stage 1 --stop_stage 8 --python "${python}" --use_discrete_unit true --s2st_config conf/train_s2st_discrete_unit_debug.yaml --clustering_num_threads 2
--feature_num_clusters 5 +if python3 -c "import s3prl" &> /dev/null; then + ./run.sh --ngpu 0 --stage 1 --stop_stage 8 --python "${python}" --use_discrete_unit true --s2st_config conf/train_s2st_discrete_unit_debug.yaml --clustering_num_threads 2 --feature_num_clusters 5 +fi # Remove generated files in order to reduce the disk usage rm -rf exp dump data ckpt .cache cd "${cwd}" diff --git a/ci/test_integration_espnetez.sh b/ci/test_integration_espnetez.sh index 18d3a56b097..e3c0265fab0 100755 --- a/ci/test_integration_espnetez.sh +++ b/ci/test_integration_espnetez.sh @@ -51,27 +51,29 @@ python -m coverage run --append ../../../test/espnetez/test_integration_espnetez # Remove generated files in order to reduce the disk usage rm -rf exp data/spm -# [ESPnet Easy] test asr transducer recipe with coverage -python -m coverage run --append ../../../test/espnetez/test_integration_espnetez.py \ - --task asr \ - --data_path data \ - --train_dump_path dump/raw/train_nodev \ - --valid_dump_path dump/raw/train_dev \ - --exp_path ./exp \ - --config_path conf/train_asr_transducer_debug.yaml \ - --train_sentencepiece_model \ - --run_collect_stats \ - --run_train - -# finetuning -python -m coverage run --append ../../../test/espnetez/test_integration_espnetez_ft.py \ - --task asr \ - --data_path data \ - --train_dump_path dump/raw/train_nodev \ - --valid_dump_path dump/raw/train_dev \ - --exp_path ./exp \ - --config_path conf/train_asr_transducer_debug.yaml \ - --run_finetune +if python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null; then + # [ESPnet Easy] test asr transducer recipe with coverage + python -m coverage run --append ../../../test/espnetez/test_integration_espnetez.py \ + --task asr \ + --data_path data \ + --train_dump_path dump/raw/train_nodev \ + --valid_dump_path dump/raw/train_dev \ + --exp_path ./exp \ + --config_path conf/train_asr_transducer_debug.yaml \ + --train_sentencepiece_model \ + --run_collect_stats \ + --run_train + + # finetuning + python -m coverage run --append ../../../test/espnetez/test_integration_espnetez_ft.py \ + --task asr \ + --data_path data \ + --train_dump_path dump/raw/train_nodev \ + --valid_dump_path dump/raw/train_dev \ + --exp_path ./exp \ + --config_path conf/train_asr_transducer_debug.yaml \ + --run_finetune +fi # Remove generated files in order to reduce the disk usage rm -rf exp data/spm diff --git a/egs2/TEMPLATE/asr1/pyscripts/audio/compute_vad.py b/egs2/TEMPLATE/asr1/pyscripts/audio/compute_vad.py index 60bbefa585b..f6df4a40818 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/audio/compute_vad.py +++ b/egs2/TEMPLATE/asr1/pyscripts/audio/compute_vad.py @@ -14,7 +14,6 @@ import soundfile as sf from scipy.signal import lfilter from tqdm import tqdm -from typeguard import check_argument_types from espnet2.fileio.read_text import read_2columns_text from espnet.utils.cli_utils import get_commandline_args diff --git a/egs2/TEMPLATE/asr1/pyscripts/audio/format_wav_scp.py b/egs2/TEMPLATE/asr1/pyscripts/audio/format_wav_scp.py index 8426c79e55b..8a29f2d6e99 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/audio/format_wav_scp.py +++ b/egs2/TEMPLATE/asr1/pyscripts/audio/format_wav_scp.py @@ -11,7 +11,7 @@ import resampy import soundfile from tqdm import tqdm -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import read_2columns_text from espnet2.fileio.sound_scp import SoundScpWriter, soundfile_read @@ -26,6 +26,7 @@ def humanfriendly_or_none(value: str): return humanfriendly.parse_size(value) +@typechecked 
def str2int_tuple(integers: str) -> Optional[Tuple[int, ...]]: """ @@ -33,16 +34,15 @@ def str2int_tuple(integers: str) -> Optional[Tuple[int, ...]]: (3, 4, 5) """ - assert check_argument_types() if integers.strip() in ("none", "None", "NONE", "null", "Null", "NULL"): return None return tuple(map(int, integers.strip().split(","))) +@typechecked def vad_trim(vad_reader: VADScpReader, uttid: str, wav: np.array, fs: int) -> np.array: # Conduct trim wtih vad information - assert check_argument_types() assert uttid in vad_reader, uttid vad_info = vad_reader[uttid] @@ -72,8 +72,8 @@ class SegmentsExtractor: "e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5\n" """ + @typechecked def __init__(self, fname: str, segments: str = None, multi_columns: bool = False): - assert check_argument_types() self.wav_scp = fname self.multi_columns = multi_columns self.wav_dict = {} diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/calculate_speech_metrics.py b/egs2/TEMPLATE/asr1/pyscripts/utils/calculate_speech_metrics.py index 34c1cfc96fb..8763541efa2 100644 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/calculate_speech_metrics.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/calculate_speech_metrics.py @@ -8,7 +8,7 @@ import torch from mir_eval.separation import bss_eval_sources from pystoi import stoi -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.enh.encoder.stft_encoder import STFTEncoder from espnet2.enh.espnet_model import ESPnetEnhancementModel @@ -18,6 +18,7 @@ from espnet.utils.cli_utils import get_commandline_args +@typechecked def scoring( output_dir: str, dtype: str, @@ -30,7 +31,6 @@ def scoring( frame_size: int = 512, frame_hop: int = 256, ): - assert check_argument_types() for metric in metrics: assert metric in ( "STOI", diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_rttm.py b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_rttm.py index f1153329a8a..2597d67cfe8 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_rttm.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_rttm.py @@ -11,11 +11,12 @@ import humanfriendly import numpy as np import soundfile -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.utils.types import str_or_int +@typechecked def convert_rttm_text( path: Union[Path, str], wavscp_path: Union[Path, str], @@ -31,7 +32,6 @@ def convert_rttm_text( "w", encoding="utf-8" ) - assert check_argument_types() utt_ids = set() with Path(path).open("r", encoding="utf-8") as f: for linenum, line in enumerate(f, 1): diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/evaluate_whisper_inference.py b/egs2/TEMPLATE/asr1/pyscripts/utils/evaluate_whisper_inference.py index ee356aa8578..cc9666666fd 100644 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/evaluate_whisper_inference.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/evaluate_whisper_inference.py @@ -9,7 +9,7 @@ import torch import whisper -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.torch_utils.set_all_random_seed import set_all_random_seed @@ -22,19 +22,20 @@ class Speech2Text: """Speech2Text class""" + @typechecked def __init__( self, model_tag: str = "base", model_dir: str = "./models", device: str = "cpu", ): - assert check_argument_types() self.model = whisper.load_model( name=model_tag, download_root=model_dir, device=device ) @torch.no_grad() + @typechecked def __call__(self, speech: str, **decode_options) -> Optional[str]: """Inference @@ -44,7 +45,6 @@ def 
__call__(self, speech: str, **decode_options) -> Optional[str]: text """ - assert check_argument_types() # Input as audio signal result = self.model.transcribe(speech, **decode_options) @@ -52,6 +52,7 @@ def __call__(self, speech: str, **decode_options) -> Optional[str]: return result["text"] +@typechecked def inference( output_dir: str, ngpu: int, @@ -65,7 +66,6 @@ def inference( allow_variable_data_keys: bool, decode_options: Dict, ): - assert check_argument_types() if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") diff --git a/egs2/kiritan/svs1/local/prep_segments_from_xml.py b/egs2/kiritan/svs1/local/prep_segments_from_xml.py index 1818da2aa78..c93b0909bc8 100755 --- a/egs2/kiritan/svs1/local/prep_segments_from_xml.py +++ b/egs2/kiritan/svs1/local/prep_segments_from_xml.py @@ -6,7 +6,7 @@ import music21 as m21 import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import read_2columns_text from espnet2.fileio.score_scp import NOTE, SingingScoreWriter @@ -53,12 +53,12 @@ class XMLReader: >>> tempo, note_list = reader['key1'] """ + @typechecked def __init__( self, fname, dtype=np.int16, ): - assert check_argument_types() self.fname = fname self.dtype = dtype self.data = read_2columns_text(fname) # get key-value dict diff --git a/espnet2/asr/ctc.py b/espnet2/asr/ctc.py index 0f0f45ee28f..13621a47cf7 100644 --- a/espnet2/asr/ctc.py +++ b/espnet2/asr/ctc.py @@ -1,8 +1,9 @@ import logging +from typing import Optional import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked class CTC(torch.nn.Module): @@ -18,6 +19,7 @@ class CTC(torch.nn.Module): zero_infinity: Whether to zero infinite losses and the associated gradients. 
""" + @typechecked def __init__( self, odim: int, @@ -25,13 +27,12 @@ def __init__( dropout_rate: float = 0.0, ctc_type: str = "builtin", reduce: bool = True, - ignore_nan_grad: bool = None, + ignore_nan_grad: Optional[bool] = None, zero_infinity: bool = True, brctc_risk_strategy: str = "exp", brctc_group_strategy: str = "end", brctc_risk_factor: float = 0.0, ): - assert check_argument_types() super().__init__() eprojs = encoder_output_size self.dropout_rate = dropout_rate @@ -56,7 +57,7 @@ def __init__( elif self.ctc_type == "brctc": try: - import k2 + import k2 # noqa except ImportError: raise ImportError("You should install K2 to use Bayes Risk CTC") diff --git a/espnet2/asr/decoder/hugging_face_transformers_decoder.py b/espnet2/asr/decoder/hugging_face_transformers_decoder.py index 9b7e41d7db1..1af31b3679d 100644 --- a/espnet2/asr/decoder/hugging_face_transformers_decoder.py +++ b/espnet2/asr/decoder/hugging_face_transformers_decoder.py @@ -10,7 +10,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -34,6 +34,7 @@ class HuggingFaceTransformersDecoder(AbsDecoder, BatchScorerInterface): model_name_or_path: Hugging Face Transformers model name """ + @typechecked def __init__( self, vocab_size: int, @@ -43,7 +44,6 @@ def __init__( prefix: str = "", postfix: str = "", ): - assert check_argument_types() super().__init__() if not is_transformers_available: diff --git a/espnet2/asr/decoder/mlm_decoder.py b/espnet2/asr/decoder/mlm_decoder.py index 17719c39074..a787185de11 100644 --- a/espnet2/asr/decoder/mlm_decoder.py +++ b/espnet2/asr/decoder/mlm_decoder.py @@ -5,7 +5,7 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -20,6 +20,7 @@ class MLMDecoder(AbsDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -37,7 +38,6 @@ def __init__( normalize_before: bool = True, concat_after: bool = False, ): - assert check_argument_types() super().__init__() attention_dim = encoder_output_size vocab_size += 1 # for mask token diff --git a/espnet2/asr/decoder/rnn_decoder.py b/espnet2/asr/decoder/rnn_decoder.py index 05a588178ad..634108357a2 100644 --- a/espnet2/asr/decoder/rnn_decoder.py +++ b/espnet2/asr/decoder/rnn_decoder.py @@ -3,7 +3,7 @@ import numpy as np import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet2.utils.get_default_kwargs import get_default_kwargs @@ -80,6 +80,7 @@ def build_attention_list( class RNNDecoder(AbsDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -95,7 +96,6 @@ def __init__( att_conf: dict = get_default_kwargs(build_attention_list), ): # FIXME(kamo): The parts of num_spk should be refactored more more more - assert check_argument_types() if rnn_type not in {"lstm", "gru"}: raise ValueError(f"Not supported: rnn_type={rnn_type}") diff --git a/espnet2/asr/decoder/s4_decoder.py b/espnet2/asr/decoder/s4_decoder.py index efc937ba457..30828bcb23d 100644 --- a/espnet2/asr/decoder/s4_decoder.py +++ b/espnet2/asr/decoder/s4_decoder.py @@ -3,7 +3,7 @@ from typing import Any, List, Tuple import torch -from typeguard import 
check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet2.asr.state_spaces.model import SequenceModel @@ -33,6 +33,7 @@ class S4Decoder(AbsDecoder, BatchScorerInterface): drop_path: drop rate for stochastic depth """ + @typechecked def __init__( self, vocab_size: int, @@ -52,7 +53,6 @@ def __init__( track_norms=True, drop_path: float = 0.0, ): - assert check_argument_types() super().__init__() self.d_model = encoder_output_size diff --git a/espnet2/asr/decoder/transducer_decoder.py b/espnet2/asr/decoder/transducer_decoder.py index 6dcc7b52b25..857a7d09215 100644 --- a/espnet2/asr/decoder/transducer_decoder.py +++ b/espnet2/asr/decoder/transducer_decoder.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet2.asr.transducer.beam_search_transducer import ExtendedHypothesis, Hypothesis @@ -23,6 +23,7 @@ class TransducerDecoder(AbsDecoder): """ + @typechecked def __init__( self, vocab_size: int, @@ -33,7 +34,6 @@ def __init__( dropout_embed: float = 0.0, embed_pad: int = 0, ): - assert check_argument_types() if rnn_type not in {"lstm", "gru"}: raise ValueError(f"Not supported: rnn_type={rnn_type}") diff --git a/espnet2/asr/decoder/transformer_decoder.py b/espnet2/asr/decoder/transformer_decoder.py index d679aba1456..0386a4c7e48 100644 --- a/espnet2/asr/decoder/transformer_decoder.py +++ b/espnet2/asr/decoder/transformer_decoder.py @@ -5,7 +5,7 @@ from typing import Any, List, Sequence, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -47,6 +47,7 @@ class BaseTransformerDecoder(AbsDecoder, BatchScorerInterface): i.e. 
x -> x + att(x) """ + @typechecked def __init__( self, vocab_size: int, @@ -58,7 +59,6 @@ def __init__( pos_enc_class=PositionalEncoding, normalize_before: bool = True, ): - assert check_argument_types() super().__init__() attention_dim = encoder_output_size @@ -284,6 +284,7 @@ def batch_score( class TransformerDecoder(BaseTransformerDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -302,7 +303,6 @@ def __init__( concat_after: bool = False, layer_drop_rate: float = 0.0, ): - assert check_argument_types() super().__init__( vocab_size=vocab_size, encoder_output_size=encoder_output_size, @@ -335,6 +335,7 @@ def __init__( class LightweightConvolutionTransformerDecoder(BaseTransformerDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -355,7 +356,6 @@ def __init__( conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), conv_usebias: int = False, ): - assert check_argument_types() if len(conv_kernel_length) != num_blocks: raise ValueError( "conv_kernel_length must have equal number of values to num_blocks: " @@ -397,6 +397,7 @@ def __init__( class LightweightConvolution2DTransformerDecoder(BaseTransformerDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -417,7 +418,6 @@ def __init__( conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), conv_usebias: int = False, ): - assert check_argument_types() if len(conv_kernel_length) != num_blocks: raise ValueError( "conv_kernel_length must have equal number of values to num_blocks: " @@ -459,6 +459,7 @@ def __init__( class DynamicConvolutionTransformerDecoder(BaseTransformerDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -479,7 +480,6 @@ def __init__( conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), conv_usebias: int = False, ): - assert check_argument_types() if len(conv_kernel_length) != num_blocks: raise ValueError( "conv_kernel_length must have equal number of values to num_blocks: " @@ -521,6 +521,7 @@ def __init__( class DynamicConvolution2DTransformerDecoder(BaseTransformerDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -541,7 +542,6 @@ def __init__( conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), conv_usebias: int = False, ): - assert check_argument_types() if len(conv_kernel_length) != num_blocks: raise ValueError( "conv_kernel_length must have equal number of values to num_blocks: " @@ -583,6 +583,7 @@ def __init__( class TransformerMDDecoder(BaseTransformerDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -601,7 +602,6 @@ def __init__( concat_after: bool = False, use_speech_attn: bool = True, ): - assert check_argument_types() super().__init__( vocab_size=vocab_size, encoder_output_size=encoder_output_size, diff --git a/espnet2/asr/decoder/whisper_decoder.py b/espnet2/asr/decoder/whisper_decoder.py index ea278d75c89..b0106bd1bf5 100644 --- a/espnet2/asr/decoder/whisper_decoder.py +++ b/espnet2/asr/decoder/whisper_decoder.py @@ -1,8 +1,8 @@ import copy -from typing import Any, List, Tuple +from typing import Any, List, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet.nets.scorer_interface import BatchScorerInterface @@ -44,13 +44,14 @@ class OpenAIWhisperDecoder(AbsDecoder, BatchScorerInterface): URL: https://github.com/openai/whisper """ + @typechecked def __init__( self, vocab_size: int, encoder_output_size: int, dropout_rate: float = 0.0, whisper_model: str = "small", - download_dir: 
str = None, + download_dir: Optional[str] = None, load_origin_token_embedding=False, ): try: @@ -63,7 +64,6 @@ def __init__( ) raise e - assert check_argument_types() super().__init__() assert whisper_model in whisper.available_models() diff --git a/espnet2/asr/discrete_asr_espnet_model.py b/espnet2/asr/discrete_asr_espnet_model.py index 4144ea035de..bc0f60e2271 100644 --- a/espnet2/asr/discrete_asr_espnet_model.py +++ b/espnet2/asr/discrete_asr_espnet_model.py @@ -3,7 +3,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -30,6 +30,7 @@ def autocast(enabled=True): class ESPnetDiscreteASRModel(ESPnetMTModel): """Encoder-Decoder model""" + @typechecked def __init__( self, vocab_size: int, @@ -55,7 +56,6 @@ def __init__( share_decoder_input_output_embed: bool = False, share_encoder_decoder_input_embed: bool = False, ): - assert check_argument_types() assert 0.0 <= ctc_weight <= 1.0, ctc_weight super().__init__( diff --git a/espnet2/asr/encoder/avhubert_encoder.py b/espnet2/asr/encoder/avhubert_encoder.py index 48232060918..feb14e88f51 100644 --- a/espnet2/asr/encoder/avhubert_encoder.py +++ b/espnet2/asr/encoder/avhubert_encoder.py @@ -13,15 +13,16 @@ import math import os import random +from collections import OrderedDict from copy import deepcopy from dataclasses import dataclass, field -from typing import Dict, List, Optional, Tuple +from typing import Dict, Optional, Tuple import numpy as np import torch import torch.nn as nn from filelock import FileLock -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -78,6 +79,7 @@ class FairseqAVHubertEncoder(AbsEncoder): avhubert_dir_path: dir_path for downloading pre-trained avhubert model """ + @typechecked def __init__( self, input_size: int = 1, @@ -107,7 +109,6 @@ def __init__( max_noise_weight: float = 0.5, audio_only: bool = False, ): - assert check_argument_types() super().__init__() self._output_size = encoder_embed_dim @@ -187,6 +188,7 @@ def forward( prev_states: torch.Tensor = None, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Forward AVHubert Encoder. + Args: xs_pad[video]: input tensor (B, 1, L, H, W) xs_pad[audio]: input tensor (B, D, L) @@ -201,7 +203,7 @@ def forward( elif "audio" in xs_pad: masks = make_pad_mask(ilens, length_dim=2).to(xs_pad["audio"].device) else: - ValueError(f"Input should have video or audio") + raise ValueError("Input should have video or audio") ft = self.freeze_finetune_updates <= self.num_updates @@ -705,6 +707,7 @@ def extract_finetune( self, source, padding_mask=None, mask=False, ret_conv=False, output_layer=None ): """Forward AVHubert Pretrain Encoder. + Args: source['video']: input tensor (B, 1, L, H, W) source['audio']: input tensor (B, F, L) @@ -804,6 +807,7 @@ def modality_fusion(self, features_audio, features_video): def forward_transformer(self, source, padding_mask=None, output_layer=None): """Forward AVHubert Pretrain Encoder (without frontend). + Assume the source is already fused feature.
Args: source: input tensor (B, L, D*2) diff --git a/espnet2/asr/encoder/branchformer_encoder.py b/espnet2/asr/encoder/branchformer_encoder.py index 34bfc13d5fa..568a545d483 100644 --- a/espnet2/asr/encoder/branchformer_encoder.py +++ b/espnet2/asr/encoder/branchformer_encoder.py @@ -16,7 +16,7 @@ import numpy import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.layers.cgmlp import ConvolutionalGatingMLP @@ -296,6 +296,7 @@ def forward(self, x_input, mask, cache=None): class BranchformerEncoder(AbsEncoder): """Branchformer encoder module.""" + @typechecked def __init__( self, input_size: int, @@ -322,7 +323,6 @@ def __init__( padding_idx: int = -1, stochastic_depth_rate: Union[float, List[float]] = 0.0, ): - assert check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/asr/encoder/conformer_encoder.py b/espnet2/asr/encoder/conformer_encoder.py index 6231b6b0d30..32f2134c561 100644 --- a/espnet2/asr/encoder/conformer_encoder.py +++ b/espnet2/asr/encoder/conformer_encoder.py @@ -7,7 +7,7 @@ from typing import List, Optional, Tuple, Union import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.encoder.abs_encoder import AbsEncoder @@ -84,6 +84,7 @@ class ConformerEncoder(AbsEncoder): """ + @typechecked def __init__( self, input_size: int, @@ -115,7 +116,6 @@ def __init__( layer_drop_rate: float = 0.0, max_pos_emb_len: int = 5000, ): - assert check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/asr/encoder/contextual_block_conformer_encoder.py b/espnet2/asr/encoder/contextual_block_conformer_encoder.py index 7aa2db8fdfe..f8a761ac70a 100644 --- a/espnet2/asr/encoder/contextual_block_conformer_encoder.py +++ b/espnet2/asr/encoder/contextual_block_conformer_encoder.py @@ -9,7 +9,7 @@ from typing import Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.conformer.contextual_block_encoder_layer import ( @@ -63,6 +63,7 @@ class ContextualBlockConformerEncoder(AbsEncoder): ctx_pos_enc: whether to use positional encoding to the context vectors """ + @typechecked def __init__( self, input_size: int, @@ -91,7 +92,6 @@ def __init__( init_average: bool = True, ctx_pos_enc: bool = True, ): - assert check_argument_types() super().__init__() self._output_size = output_size self.pos_enc = pos_enc_class(output_size, positional_dropout_rate) diff --git a/espnet2/asr/encoder/contextual_block_transformer_encoder.py b/espnet2/asr/encoder/contextual_block_transformer_encoder.py index 2e21a4ced93..a0732c8cd1f 100644 --- a/espnet2/asr/encoder/contextual_block_transformer_encoder.py +++ b/espnet2/asr/encoder/contextual_block_transformer_encoder.py @@ -6,7 +6,7 @@ from typing import Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -62,6 +62,7 @@ class ContextualBlockTransformerEncoder(AbsEncoder): ctx_pos_enc: whether to use positional encoding to the context vectors """ + @typechecked def __init__( self, input_size: int, @@ -85,7 +86,6 @@ def __init__( init_average: bool = True, ctx_pos_enc: bool = True, ): - assert 
check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/asr/encoder/e_branchformer_encoder.py b/espnet2/asr/encoder/e_branchformer_encoder.py index 4b629d5d06d..ae2381c234e 100644 --- a/espnet2/asr/encoder/e_branchformer_encoder.py +++ b/espnet2/asr/encoder/e_branchformer_encoder.py @@ -10,10 +10,10 @@ """ import logging -from typing import List, Optional, Tuple +from typing import Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.encoder.abs_encoder import AbsEncoder @@ -184,6 +184,7 @@ def forward(self, x_input, mask, cache=None): class EBranchformerEncoder(AbsEncoder): """E-Branchformer encoder module.""" + @typechecked def __init__( self, input_size: int, @@ -214,7 +215,6 @@ def __init__( interctc_layer_idx=None, interctc_use_conditioning: bool = False, ): - assert check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/asr/encoder/hubert_encoder.py b/espnet2/asr/encoder/hubert_encoder.py index 6956d2d66fb..b2d58c2074e 100644 --- a/espnet2/asr/encoder/hubert_encoder.py +++ b/espnet2/asr/encoder/hubert_encoder.py @@ -18,7 +18,7 @@ import torch import yaml from filelock import FileLock -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -85,6 +85,7 @@ class TorchAudioHuBERTPretrainEncoder(AbsEncoder): https://pytorch.org/audio/stable/generated/torchaudio.models.hubert_pretrain_model.html#torchaudio.models.hubert_pretrain_model """ + @typechecked def __init__( self, input_size: int = None, @@ -131,7 +132,6 @@ def __init__( finetuning: bool = False, freeze_encoder_updates: int = 0, ): - assert check_argument_types() super().__init__() try: import torchaudio @@ -299,6 +299,7 @@ class FairseqHubertEncoder(AbsEncoder): https://github.com/pytorch/fairseq/blob/master/fairseq/models/hubert/hubert.py """ + @typechecked def __init__( self, input_size: int, @@ -322,7 +323,6 @@ def __init__( layerdrop: float = 0.1, feature_grad_mult: float = 0.0, ): - assert check_argument_types() super().__init__() self.apply_mask = apply_mask try: @@ -508,6 +508,7 @@ class FairseqHubertPretrainEncoder(AbsEncoder): normalize_before: whether to use layer_norm before the first block """ + @typechecked def __init__( self, input_size: int = 1, @@ -525,7 +526,6 @@ def __init__( use_amp: bool = False, **kwargs, ): - assert check_argument_types() super().__init__() self._output_size = output_size self.use_amp = use_amp diff --git a/espnet2/asr/encoder/hugging_face_transformers_encoder.py b/espnet2/asr/encoder/hugging_face_transformers_encoder.py index 633b0e6eec3..1d363764ca3 100644 --- a/espnet2/asr/encoder/hugging_face_transformers_encoder.py +++ b/espnet2/asr/encoder/hugging_face_transformers_encoder.py @@ -9,7 +9,7 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -25,6 +25,7 @@ class HuggingFaceTransformersEncoder(AbsEncoder): """Hugging Face Transformers PostEncoder.""" + @typechecked def __init__( self, input_size: int, @@ -32,7 +33,6 @@ def __init__( lang_token_id: int = -1, ): """Initialize the module.""" - assert check_argument_types() super().__init__() if not is_transformers_available: diff 
--git a/espnet2/asr/encoder/linear_encoder.py b/espnet2/asr/encoder/linear_encoder.py index 5b23d05803a..3cf2a607c3c 100644 --- a/espnet2/asr/encoder/linear_encoder.py +++ b/espnet2/asr/encoder/linear_encoder.py @@ -3,10 +3,10 @@ """Linear encoder definition.""" -from typing import List, Optional, Tuple +from typing import Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -34,6 +34,7 @@ class LinearEncoder(AbsEncoder): padding_idx: padding_idx for input_layer=embed """ + @typechecked def __init__( self, input_size: int, @@ -43,7 +44,6 @@ def __init__( normalize_before: bool = True, padding_idx: int = -1, ): - assert check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/asr/encoder/longformer_encoder.py b/espnet2/asr/encoder/longformer_encoder.py index 0ee4f9b31db..1de6c75c555 100644 --- a/espnet2/asr/encoder/longformer_encoder.py +++ b/espnet2/asr/encoder/longformer_encoder.py @@ -6,7 +6,7 @@ from typing import List, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.encoder.conformer_encoder import ConformerEncoder @@ -77,6 +77,7 @@ class LongformerEncoder(ConformerEncoder): """ + @typechecked def __init__( self, input_size: int, @@ -107,7 +108,6 @@ def __init__( attention_dilation: list = [1, 1, 1, 1, 1, 1], attention_mode: str = "sliding_chunks", ): - assert check_argument_types() super().__init__(input_size) self._output_size = output_size diff --git a/espnet2/asr/encoder/rnn_encoder.py b/espnet2/asr/encoder/rnn_encoder.py index 15a6b8fe43e..bc4912b5ec7 100644 --- a/espnet2/asr/encoder/rnn_encoder.py +++ b/espnet2/asr/encoder/rnn_encoder.py @@ -2,7 +2,7 @@ import numpy as np import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -23,6 +23,7 @@ class RNNEncoder(AbsEncoder): """ + @typechecked def __init__( self, input_size: int, @@ -35,7 +36,6 @@ def __init__( dropout: float = 0.0, subsample: Optional[Sequence[int]] = (2, 2, 1, 1), ): - assert check_argument_types() super().__init__() self._output_size = output_size self.rnn_type = rnn_type @@ -46,13 +46,13 @@ def __init__( raise ValueError(f"Not supported rnn_type={rnn_type}") if subsample is None: - subsample = np.ones(num_layers + 1, dtype=np.int64) + _subsample = np.ones(num_layers + 1, dtype=np.int64) else: - subsample = subsample[:num_layers] + _subsample = subsample[:num_layers] # Append 1 at the beginning because the second or later is used - subsample = np.pad( - np.array(subsample, dtype=np.int64), - [1, num_layers - len(subsample)], + _subsample = np.pad( + np.array(_subsample, dtype=np.int64), + [1, num_layers - len(_subsample)], mode="constant", constant_values=1, ) @@ -66,7 +66,7 @@ def __init__( num_layers, hidden_size, output_size, - subsample, + _subsample, dropout, typ=rnn_type, ) diff --git a/espnet2/asr/encoder/transformer_encoder.py b/espnet2/asr/encoder/transformer_encoder.py index b98ec8b744c..ca42ede6359 100644 --- a/espnet2/asr/encoder/transformer_encoder.py +++ b/espnet2/asr/encoder/transformer_encoder.py @@ -6,7 +6,7 @@ from typing import List, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard 
import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.encoder.abs_encoder import AbsEncoder @@ -60,6 +60,7 @@ class TransformerEncoder(AbsEncoder): padding_idx: padding_idx for input_layer=embed """ + @typechecked def __init__( self, input_size: int, @@ -81,7 +82,6 @@ def __init__( interctc_use_conditioning: bool = False, layer_drop_rate: float = 0.0, ): - assert check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/asr/encoder/transformer_encoder_multispkr.py b/espnet2/asr/encoder/transformer_encoder_multispkr.py index 8e8de4d16b0..8f79389a822 100644 --- a/espnet2/asr/encoder/transformer_encoder_multispkr.py +++ b/espnet2/asr/encoder/transformer_encoder_multispkr.py @@ -5,7 +5,7 @@ from typing import Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -59,6 +59,7 @@ class TransformerEncoder(AbsEncoder): num_inf: number of inference output """ + @typechecked def __init__( self, input_size: int, @@ -79,7 +80,6 @@ def __init__( padding_idx: int = -1, num_inf: int = 1, ): - assert check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/asr/encoder/vgg_rnn_encoder.py b/espnet2/asr/encoder/vgg_rnn_encoder.py index 420fbc0bcb1..fd457e7f8ff 100644 --- a/espnet2/asr/encoder/vgg_rnn_encoder.py +++ b/espnet2/asr/encoder/vgg_rnn_encoder.py @@ -2,7 +2,7 @@ import numpy as np import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.e2e_asr_common import get_vgg2l_odim @@ -24,6 +24,7 @@ class VGGRNNEncoder(AbsEncoder): """ + @typechecked def __init__( self, input_size: int, @@ -36,7 +37,6 @@ def __init__( dropout: float = 0.0, in_channel: int = 1, ): - assert check_argument_types() super().__init__() self._output_size = output_size self.rnn_type = rnn_type diff --git a/espnet2/asr/encoder/wav2vec2_encoder.py b/espnet2/asr/encoder/wav2vec2_encoder.py index dec3a4f576e..8eb535dee3f 100644 --- a/espnet2/asr/encoder/wav2vec2_encoder.py +++ b/espnet2/asr/encoder/wav2vec2_encoder.py @@ -10,7 +10,7 @@ import torch from filelock import FileLock -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -30,6 +30,7 @@ class FairSeqWav2Vec2Encoder(AbsEncoder): 0 means to finetune every layer if freeze_w2v=False. 
""" + @typechecked def __init__( self, input_size: int, @@ -39,7 +40,6 @@ def __init__( normalize_before: bool = False, freeze_finetune_updates: int = 0, ): - assert check_argument_types() super().__init__() if w2v_url != "": diff --git a/espnet2/asr/encoder/whisper_encoder.py b/espnet2/asr/encoder/whisper_encoder.py index 285ba413f14..5e96b9b8900 100644 --- a/espnet2/asr/encoder/whisper_encoder.py +++ b/espnet2/asr/encoder/whisper_encoder.py @@ -3,7 +3,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.specaug.specaug import SpecAug @@ -15,12 +15,13 @@ class OpenAIWhisperEncoder(AbsEncoder): URL: https://github.com/openai/whisper """ + @typechecked def __init__( self, input_size: int = 1, dropout_rate: float = 0.0, whisper_model: str = "small", - download_dir: str = None, + download_dir: Optional[str] = None, use_specaug: bool = False, specaug_conf: Union[dict, None] = None, do_pad_trim: bool = False, @@ -36,7 +37,6 @@ def __init__( ) raise e - assert check_argument_types() super().__init__() self.n_fft = N_FFT diff --git a/espnet2/asr/espnet_model.py b/espnet2/asr/espnet_model.py index f49e0e8886b..b5224585cf7 100644 --- a/espnet2/asr/espnet_model.py +++ b/espnet2/asr/espnet_model.py @@ -4,7 +4,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -37,6 +37,7 @@ def autocast(enabled=True): class ESPnetASRModel(AbsESPnetModel): """CTC-attention hybrid Encoder-Decoder model""" + @typechecked def __init__( self, vocab_size: int, @@ -50,7 +51,7 @@ def __init__( decoder: Optional[AbsDecoder], ctc: CTC, joint_network: Optional[torch.nn.Module], - aux_ctc: dict = None, + aux_ctc: Optional[dict] = None, ctc_weight: float = 0.5, interctc_weight: float = 0.0, ignore_id: int = -1, @@ -69,7 +70,6 @@ def __init__( extract_feats_in_collect_stats: bool = True, lang_token_id: int = -1, ): - assert check_argument_types() assert 0.0 <= ctc_weight <= 1.0, ctc_weight assert 0.0 <= interctc_weight < 1.0, interctc_weight diff --git a/espnet2/asr/frontend/asteroid_frontend.py b/espnet2/asr/frontend/asteroid_frontend.py index 64d5e910d2e..4e9237081ae 100644 --- a/espnet2/asr/frontend/asteroid_frontend.py +++ b/espnet2/asr/frontend/asteroid_frontend.py @@ -10,7 +10,7 @@ import torch.nn as nn import torch.nn.functional as F from asteroid_filterbanks import Encoder, ParamSincFB -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend @@ -30,6 +30,7 @@ class AsteroidFrontend(AbsFrontend): "Filterbank design for end-to-end speech separation," in Proc. ICASSP, 2020 """ + @typechecked def __init__( self, sinc_filters: int = 256, @@ -48,7 +49,6 @@ def __init__( preemph_coef: the coeifficient for preempahsis. log_term: the log term to prevent infinity. 
""" - assert check_argument_types() super().__init__() # kernel for preemphasis diff --git a/espnet2/asr/frontend/default.py b/espnet2/asr/frontend/default.py index f2d29c560fc..1cceef269d5 100644 --- a/espnet2/asr/frontend/default.py +++ b/espnet2/asr/frontend/default.py @@ -5,7 +5,7 @@ import numpy as np import torch from torch_complex.tensor import ComplexTensor -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.layers.log_mel import LogMel @@ -20,24 +20,24 @@ class DefaultFrontend(AbsFrontend): Stft -> WPE -> MVDR-Beamformer -> Power-spec -> Log-Mel-Fbank """ + @typechecked def __init__( self, fs: Union[int, str] = 16000, n_fft: int = 512, - win_length: int = None, + win_length: Optional[int] = None, hop_length: int = 128, window: Optional[str] = "hann", center: bool = True, normalized: bool = False, onesided: bool = True, n_mels: int = 80, - fmin: int = None, - fmax: int = None, + fmin: Optional[int] = None, + fmax: Optional[int] = None, htk: bool = False, frontend_conf: Optional[dict] = get_default_kwargs(Frontend), apply_stft: bool = True, ): - assert check_argument_types() super().__init__() if isinstance(fs, str): fs = humanfriendly.parse_size(fs) diff --git a/espnet2/asr/frontend/fused.py b/espnet2/asr/frontend/fused.py index 34f3315fa71..ab4cd7fdbe8 100644 --- a/espnet2/asr/frontend/fused.py +++ b/espnet2/asr/frontend/fused.py @@ -2,7 +2,7 @@ import numpy as np import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.asr.frontend.default import DefaultFrontend @@ -10,10 +10,10 @@ class FusedFrontends(AbsFrontend): + @typechecked def __init__( self, frontends=None, align_method="linear_projection", proj_dim=100, fs=16000 ): - assert check_argument_types() super().__init__() self.align_method = ( align_method # fusing method : linear_projection only for now diff --git a/espnet2/asr/frontend/melspec_torch.py b/espnet2/asr/frontend/melspec_torch.py index a6939594891..26a1f108f21 100644 --- a/espnet2/asr/frontend/melspec_torch.py +++ b/espnet2/asr/frontend/melspec_torch.py @@ -4,22 +4,20 @@ """Torchaudio MFCC""" -from typing import Tuple +from typing import Optional, Tuple import torch -import torch.nn as nn import torch.nn.functional as F import torchaudio as ta -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend class MelSpectrogramTorch(AbsFrontend): - """ - Mel-Spectrogram using Torchaudio Implementation. 
- """ + """Mel-Spectrogram using Torchaudio Implementation.""" + @typechecked def __init__( self, preemp: bool = True, @@ -32,9 +30,8 @@ def __init__( n_mels: int = 80, window_fn: str = "hamming", mel_scale: str = "htk", - normalize: str = None, + normalize: Optional[str] = None, ): - assert check_argument_types() super().__init__() self.log = log diff --git a/espnet2/asr/frontend/s3prl.py b/espnet2/asr/frontend/s3prl.py index b1961cd9808..39ccefb56a4 100644 --- a/espnet2/asr/frontend/s3prl.py +++ b/espnet2/asr/frontend/s3prl.py @@ -4,7 +4,7 @@ import humanfriendly import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.utils.get_default_kwargs import get_default_kwargs @@ -14,11 +14,12 @@ class S3prlFrontend(AbsFrontend): """Speech Pretrained Representation frontend structure for ASR.""" + @typechecked def __init__( self, fs: Union[int, str] = 16000, frontend_conf: Optional[dict] = get_default_kwargs(Frontend), - download_dir: str = None, + download_dir: Optional[str] = None, multilayer_feature: bool = False, layer: int = -1, ): @@ -30,7 +31,6 @@ def __init__( print("Please install S3PRL: cd ${MAIN_ROOT}/tools && make s3prl.done") raise e - assert check_argument_types() super().__init__() if isinstance(fs, str): diff --git a/espnet2/asr/frontend/whisper.py b/espnet2/asr/frontend/whisper.py index 3bbd013a52e..f77d7ce77d1 100644 --- a/espnet2/asr/frontend/whisper.py +++ b/espnet2/asr/frontend/whisper.py @@ -1,9 +1,9 @@ import contextlib -from typing import Tuple +from typing import Optional, Tuple import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend @@ -14,11 +14,12 @@ class WhisperFrontend(AbsFrontend): URL: https://github.com/openai/whisper """ + @typechecked def __init__( self, whisper_model: str = "small", freeze_weights: bool = True, - download_dir: str = None, + download_dir: Optional[str] = None, ): try: import whisper @@ -31,7 +32,6 @@ def __init__( ) raise e - assert check_argument_types() super().__init__() self.n_fft = N_FFT diff --git a/espnet2/asr/frontend/windowing.py b/espnet2/asr/frontend/windowing.py index e79d0129a44..f4a34d68e94 100644 --- a/espnet2/asr/frontend/windowing.py +++ b/espnet2/asr/frontend/windowing.py @@ -4,10 +4,10 @@ """Sliding Window for raw audio input data.""" -from typing import Tuple +from typing import Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend @@ -26,12 +26,13 @@ class SlidingWindow(AbsFrontend): There is currently no additional window function applied to input values. """ + @typechecked def __init__( self, win_length: int = 400, hop_length: int = 160, channels: int = 1, - padding: int = None, + padding: Optional[int] = None, fs=None, ): """Initialize. @@ -43,7 +44,6 @@ def __init__( padding: Padding (placeholder, currently not implemented). fs: Sampling rate (placeholder for compatibility, not used). 
""" - assert check_argument_types() super().__init__() self.fs = fs self.win_length = win_length diff --git a/espnet2/asr/maskctc_model.py b/espnet2/asr/maskctc_model.py index b6b013ebc2a..b3960359340 100644 --- a/espnet2/asr/maskctc_model.py +++ b/espnet2/asr/maskctc_model.py @@ -6,7 +6,7 @@ import numpy import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.mlm_decoder import MLMDecoder @@ -39,6 +39,7 @@ def autocast(enabled=True): class MaskCTCModel(ESPnetASRModel): """Hybrid CTC/Masked LM Encoder-Decoder model (Mask-CTC)""" + @typechecked def __init__( self, vocab_size: int, @@ -64,7 +65,6 @@ def __init__( sym_mask: str = "", extract_feats_in_collect_stats: bool = True, ): - assert check_argument_types() super().__init__( vocab_size=vocab_size, diff --git a/espnet2/asr/pit_espnet_model.py b/espnet2/asr/pit_espnet_model.py index cf0b2d94c5a..aa62abbc471 100644 --- a/espnet2/asr/pit_espnet_model.py +++ b/espnet2/asr/pit_espnet_model.py @@ -5,7 +5,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -121,6 +121,7 @@ def permutate(self, perm, *args): class ESPnetASRModel(SingleESPnetASRModel): """CTC-attention hybrid Encoder-Decoder model""" + @typechecked def __init__( self, vocab_size: int, @@ -154,7 +155,6 @@ def __init__( num_inf: int = 1, num_ref: int = 1, ): - assert check_argument_types() assert 0.0 < ctc_weight <= 1.0, ctc_weight assert interctc_weight == 0.0, "interctc is not supported for multispeaker ASR" diff --git a/espnet2/asr/postencoder/hugging_face_transformers_postencoder.py b/espnet2/asr/postencoder/hugging_face_transformers_postencoder.py index b8cd08776f2..75e88a23688 100644 --- a/espnet2/asr/postencoder/hugging_face_transformers_postencoder.py +++ b/espnet2/asr/postencoder/hugging_face_transformers_postencoder.py @@ -9,7 +9,7 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.postencoder.abs_postencoder import AbsPostEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -26,6 +26,7 @@ class HuggingFaceTransformersPostEncoder(AbsPostEncoder): """Hugging Face Transformers PostEncoder.""" + @typechecked def __init__( self, input_size: int, @@ -34,7 +35,6 @@ def __init__( lang_token_id: int = -1, ): """Initialize the module.""" - assert check_argument_types() super().__init__() if not is_transformers_available: diff --git a/espnet2/asr/postencoder/length_adaptor_postencoder.py b/espnet2/asr/postencoder/length_adaptor_postencoder.py index f39289b99f4..40420197c60 100644 --- a/espnet2/asr/postencoder/length_adaptor_postencoder.py +++ b/espnet2/asr/postencoder/length_adaptor_postencoder.py @@ -7,7 +7,7 @@ from typing import Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.postencoder.abs_postencoder import AbsPostEncoder from espnet.nets.pytorch_backend.transformer.subsampling import TooShortUttError @@ -16,6 +16,7 @@ class LengthAdaptorPostEncoder(AbsPostEncoder): """Length Adaptor PostEncoder.""" + @typechecked def __init__( self, input_size: int, @@ -26,7 +27,6 @@ def __init__( return_int_enc: bool = False, ): """Initialize the module.""" - assert check_argument_types() 
super().__init__() if input_layer == "linear": diff --git a/espnet2/asr/preencoder/linear.py b/espnet2/asr/preencoder/linear.py index f24d0a41e9d..94e857bdb12 100644 --- a/espnet2/asr/preencoder/linear.py +++ b/espnet2/asr/preencoder/linear.py @@ -7,7 +7,7 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder @@ -15,9 +15,9 @@ class LinearProjection(AbsPreEncoder): """Linear Projection Preencoder.""" + @typechecked def __init__(self, input_size: int, output_size: int, dropout: float = 0.0): """Initialize the module.""" - assert check_argument_types() super().__init__() self.output_dim = output_size diff --git a/espnet2/asr/preencoder/sinc.py b/espnet2/asr/preencoder/sinc.py index ca8652d94ab..778ccf8acfe 100644 --- a/espnet2/asr/preencoder/sinc.py +++ b/espnet2/asr/preencoder/sinc.py @@ -9,7 +9,7 @@ import humanfriendly import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder from espnet2.layers.sinc_conv import LogCompression, SincConv @@ -38,6 +38,7 @@ class LightweightSincConvs(AbsPreEncoder): Use `plot_sinc_filters.py` to visualize the learned Sinc filters. """ + @typechecked def __init__( self, fs: Union[int, str, float] = 16000, @@ -59,7 +60,6 @@ def __init__( windowing_type: Choice of windowing function. scale_type: Choice of filter-bank initialization scale. """ - assert check_argument_types() super().__init__() if isinstance(fs, str): fs = humanfriendly.parse_size(fs) @@ -256,6 +256,7 @@ class SpatialDropout(torch.nn.Module): Apply dropout to full channels on tensors of input (B, C, D) """ + @typechecked def __init__( self, dropout_probability: float = 0.15, @@ -267,7 +268,6 @@ def __init__( dropout_probability: Dropout probability. shape (tuple, list): Shape of input tensors. """ - assert check_argument_types() super().__init__() if shape is None: shape = (0, 2, 1) diff --git a/espnet2/asr/transducer/rnnt_multi_blank/rnnt.py b/espnet2/asr/transducer/rnnt_multi_blank/rnnt.py index 3b4fb45060e..c007fa5536d 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/rnnt.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/rnnt.py @@ -48,8 +48,7 @@ def rnnt_loss_cpu( clamp: float, num_threads: int, ): - """ - Wrapper method for accessing CPU RNNT loss. + """Wrapper method for accessing CPU RNNT loss. CPU implementation ported from [HawkAaron/warp-transducer] (https://github.com/HawkAaron/warp-transducer). @@ -157,8 +156,7 @@ def rnnt_loss_gpu( clamp: float, num_threads: int, ): - """ - Wrapper method for accessing GPU RNNT loss. + """Wrapper method for accessing GPU RNNT loss. CUDA implementation ported from [HawkAaron/warp-transducer] (https://github.com/HawkAaron/warp-transducer). @@ -272,9 +270,9 @@ def multiblank_rnnt_loss_gpu( num_threads: int, sigma: float, ): - """ - Wrapper method for accessing GPU Multi-blank RNNT loss - (https://arxiv.org/pdf/2211.03541.pdf). + """Wrapper method for accessing GPU Multi-blank RNNT loss + + (https://arxiv.org/pdf/2211.03541.pdf). CUDA implementation ported from [HawkAaron/warp-transducer] (https://github.com/HawkAaron/warp-transducer). 
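Note on the recurring change above: every `assert check_argument_types()` body line is replaced by a `@typechecked` decorator, and implicit-Optional defaults such as `download_dir: str = None` become explicit `Optional[...]`, because `@typechecked` validates the annotations at call time. A minimal sketch of the pattern (hypothetical function, not ESPnet code), assuming typeguard >= 3 where `check_argument_types()` no longer exists:

from typing import Optional

from typeguard import typechecked


@typechecked
def build_frontend(n_fft: int = 512, download_dir: Optional[str] = None) -> dict:
    # Old style: `assert check_argument_types()` as the first statement.
    # With @typechecked, arguments are validated when the function is called,
    # so a default of None must be spelled Optional[str], not `str = None`.
    return {"n_fft": n_fft, "download_dir": download_dir}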
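Likewise, the CI changes gate optional-dependency tests on a one-line import probe such as `python3 -c "from warprnnt_pytorch import RNNTLoss" &> /dev/null`. A minimal sketch of the same check as a Python helper (hypothetical name, not part of the scripts):

def has_warprnnt() -> bool:
    # The shell guard relies on the probe exiting nonzero when the import
    # fails; `if ! python3 -c ...` then skips the transducer tests entirely.
    try:
        from warprnnt_pytorch import RNNTLoss  # noqa: F401
    except ImportError:
        return False
    return True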
diff --git a/espnet2/asr/transducer/rnnt_multi_blank/rnnt_multi_blank.py b/espnet2/asr/transducer/rnnt_multi_blank/rnnt_multi_blank.py index e829173facc..7054f2abaf5 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/rnnt_multi_blank.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/rnnt_multi_blank.py @@ -26,7 +26,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import torch from torch.autograd import Function from torch.nn import Module @@ -50,7 +49,8 @@ def forward( fastemit_lambda, clamp, ): - """ + """RNNTNumba Forward. + log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network labels: 2 dimensional Tensor containing all the targets of @@ -107,8 +107,9 @@ def backward(ctx, grad_output): class _MultiblankRNNTNumba(Function): - """ - Numba class for multi-blank transducer loss (https://arxiv.org/pdf/2211.03541.pdf) + """Numba class for multi-blank transducer loss + + (https://arxiv.org/pdf/2211.03541.pdf) """ @staticmethod @@ -125,7 +126,8 @@ def forward( clamp, sigma, ): - """ + """MultiblankRNNTNumba Forward. + big_blank_durations: list of durations for multi-blank transducer, e.g. [2, 4, 8]. sigma: hyper-parameter for logit under-normalization method for training @@ -207,6 +209,7 @@ def rnnt_loss( clamp: float = 0.0, ): """RNN Transducer Loss (functional form) + Args: acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network @@ -253,9 +256,9 @@ def multiblank_rnnt_loss( fastemit_lambda: float = 0.0, clamp: float = 0.0, ): - """ - Multi-blank RNN Transducer (https://arxiv.org/pdf/2211.03541.pdf) - Loss (functional form) + """Multi-blank RNN Transducer (https://arxiv.org/pdf/2211.03541.pdf) + + Loss (functional form) Args: acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network @@ -306,7 +309,8 @@ def multiblank_rnnt_loss( class RNNTLossNumba(Module): - """ + """RNNT Loss Numba + Parameters: blank (int, optional): blank label. Default: 0. reduction (string, optional): Specifies the reduction to apply to the output: @@ -331,7 +335,8 @@ def __init__( self.loss = _RNNTNumba.apply def forward(self, acts, labels, act_lens, label_lens): - """ + """Forward RNNTLossNumba. + log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network labels: 2 dimensional Tensor containing all the targets of the @@ -369,7 +374,8 @@ def forward(self, acts, labels, act_lens, label_lens): class MultiblankRNNTLossNumba(Module): - """ + """Multiblank RNNT Loss Numba + Parameters: blank (int): standard blank label. big_blank_durations: list of durations for multi-blank transducer, e.g. @@ -408,7 +414,8 @@ def __init__( self.sigma = sigma def forward(self, acts, labels, act_lens, label_lens): - """ + """MultiblankRNNTLossNumba Forward. + log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network labels: 2 dimensional Tensor containing all the targets of diff --git a/espnet2/asr/transducer/rnnt_multi_blank/utils/__init__.py b/espnet2/asr/transducer/rnnt_multi_blank/utils/__init__.py index bc443be41c4..e69de29bb2d 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/utils/__init__.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/utils/__init__.py @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/__init__.py b/espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/__init__.py index 1b4bbd40dff..e69de29bb2d 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/__init__.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/__init__.py @@ -1,27 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/cpu_rnnt.py b/espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/cpu_rnnt.py index e49a36e8cf1..d39fa0a2ce3 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/cpu_rnnt.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/utils/cpu_utils/cpu_rnnt.py @@ -38,9 +38,7 @@ def log_sum_exp(a: torch.Tensor, b: torch.Tensor): - """ - Logsumexp with safety checks for infs. - """ + """Logsumexp with safety checks for infs.""" if torch.isinf(a): return b @@ -57,8 +55,8 @@ class CpuRNNT_index: def __init__( self, U: int, maxU: int, minibatch: int, alphabet_size: int, batch_first: bool ): - """ - A placeholder Index computation class that emits the resolved index in a + """A placeholder Index computation class that emits the resolved index in a + flattened tensor, mimicing pointer indexing in CUDA kernels on the CPU. Args: @@ -101,8 +99,7 @@ def __init__( log_probs: torch.Tensor, idx: CpuRNNT_index, ): - """ - Metadata for CPU based RNNT loss calculation. Holds the working space memory. + """Metadata for CPU based RNNT loss calculation. Holds the working space memory. Args: T: Length of the acoustic sequence (without padding). @@ -191,8 +188,7 @@ def __init__( num_threads: int, batch_first: bool, ): - """ - Helper class to compute the Transducer Loss on CPU. + """Helper class to compute the Transducer Loss on CPU. Args: minibatch: Size of the minibatch b. 
@@ -270,8 +266,7 @@ def cost_and_grad_kernel( def compute_alphas( self, log_probs: torch.Tensor, T: int, U: int, alphas: torch.Tensor ): - """ - Compute the probability of the forward variable alpha. + """Compute the probability of the forward variable alpha. Args: log_probs: Flattened tensor [B, T, U, V+1] @@ -319,8 +314,8 @@ def compute_betas_and_grads( labels: torch.Tensor, logll: torch.Tensor, ): - """ - Compute backward variable beta as well as gradients of the activation + """Compute backward variable beta as well as gradients of the activation + matrix wrt loglikelihood of forward variable. Args: diff --git a/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/__init__.py b/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/__init__.py index 1b4bbd40dff..e69de29bb2d 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/__init__.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/__init__.py @@ -1,27 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/gpu_rnnt.py b/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/gpu_rnnt.py index 4cc309103c4..1073a5b9ccd 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/gpu_rnnt.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/gpu_rnnt.py @@ -27,7 +27,7 @@ # limitations under the License. import multiprocessing -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numba import torch @@ -54,8 +54,7 @@ def __init__( num_threads: int, stream, ): - """ - Helper class to launch the CUDA Kernels to compute the Transducer Loss. + """Helper class to launch the CUDA Kernels to compute the Transducer Loss. Args: minibatch: Int representing the batch size. @@ -97,8 +96,8 @@ def __init__( self.num_threads_ = numba.get_num_threads() def log_softmax(self, acts: torch.Tensor, denom: torch.Tensor): - """ - Computes the log softmax denominator of the input activation tensor + """Computes the log softmax denominator of the input activation tensor + and stores the result in denom. Args: @@ -139,8 +138,7 @@ def compute_cost_and_score( label_lengths: torch.Tensor, input_lengths: torch.Tensor, ) -> global_constants.RNNTStatus: - """ - Compute both the loss and the gradients. + """Compute both the loss and the gradients. 
Args: acts: A flattened tensor of shape [B, T, U, V+1] representing the @@ -301,9 +299,9 @@ def score_forward( ) def _prepare_workspace(self) -> Tuple[int, Tuple[torch.Tensor, ...]]: - """ - Helper method that uses the workspace and constructs slices of it - that can be used. + """Helper method that uses the workspace and constructs slices of it + + that can be used. Returns: An int, representing the offset of the used workspace (practically, the @@ -355,9 +353,9 @@ def __init__( num_threads: int, stream, ): - """ - Helper class to launch the CUDA Kernels to compute Multi-blank Transducer Loss - (https://arxiv.org/pdf/2211.03541). + """Helper class to launch the CUDA Kernels to compute Multi-blank + + Transducer Loss(https://arxiv.org/pdf/2211.03541). Args: sigma: Hyper-parameter related to the logit-normalization method @@ -414,8 +412,7 @@ def compute_cost_and_score( label_lengths: torch.Tensor, input_lengths: torch.Tensor, ) -> global_constants.RNNTStatus: - """ - Compute both the loss and the gradients. + """Compute both the loss and the gradients. Args: acts: A flattened tensor of shape [B, T, U, V+1] representing @@ -585,10 +582,10 @@ def score_forward( acts, None, costs, pad_labels, label_lengths, input_lengths ) - def _prepare_workspace(self) -> (int, Tuple[torch.Tensor]): - """ - Helper method that uses the workspace and constructs slices of it that - can be used. + def _prepare_workspace(self) -> Union[int, Tuple[torch.Tensor]]: + """Helper method that uses the workspace and constructs slices of it that + + can be used. Returns: An int, representing the offset of the used workspace (practically, diff --git a/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/gpu_rnnt_kernel.py b/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/gpu_rnnt_kernel.py index ce66651416c..470b665e457 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/gpu_rnnt_kernel.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/gpu_rnnt_kernel.py @@ -48,8 +48,9 @@ def logp( u: int, v: int, ): - """ - Compute the sum of log probability from the activation tensor and its denominator. + """Compute the sum of log probability from the activation tensor + + and its denominator. Args: denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the @@ -89,8 +90,7 @@ def compute_alphas_kernel( alphabet_size: int, blank_: int, ): - """ - Compute alpha (forward variable) probabilities over the transduction step. + """Compute alpha (forward variable) probabilities over the transduction step. Args: acts: Tensor of shape [B, T, U, V+1] flattened. @@ -200,8 +200,7 @@ def compute_betas_kernel( alphabet_size: int, blank_: int, ): - """ - Compute beta (backward variable) probabilities over the transduction step. + """Compute beta (backward variable) probabilities over the transduction step. Args: acts: Tensor of shape [B, T, U, V+1] flattened. @@ -314,8 +313,7 @@ def compute_grad_kernel( fastemit_lambda: float, clamp: float, ): - """ - Compute gradients over the transduction step. + """Compute gradients over the transduction step. Args: grads: Zero Tensor of shape [B, T, U, V+1]. Is updated by this kernel to @@ -477,9 +475,9 @@ def compute_multiblank_alphas_kernel( big_blank_duration: torch.Tensor, num_big_blanks: int, ): - """ - Compute alpha (forward variable) probabilities for multi-blank transducuer loss - (https://arxiv.org/pdf/2211.03541). + """Compute alpha (forward variable) probabilities for multi-blank transducuer loss + + (https://arxiv.org/pdf/2211.03541). 
Args: acts: Tensor of shape [B, T, U, V + 1 + num_big_blanks] flattened. @@ -693,9 +691,9 @@ def compute_multiblank_betas_kernel( big_blank_duration: torch.Tensor, num_big_blanks: int, ): - """ - Compute beta (backward variable) probabilities for multi-blank transducer loss - (https://arxiv.org/pdf/2211.03541). + """Compute beta (backward variable) probabilities for multi-blank transducer loss + + (https://arxiv.org/pdf/2211.03541). Args: acts: Tensor of shape [B, T, U, V + 1 + num-big-blanks] flattened. @@ -894,9 +892,9 @@ def compute_multiblank_grad_kernel( fastemit_lambda: float, clamp: float, ): - """ - Compute gradients for multi-blank transducer loss - (https://arxiv.org/pdf/2211.03541). + """Compute gradients for multi-blank transducer loss + + (https://arxiv.org/pdf/2211.03541). Args: grads: Zero Tensor of shape [B, T, U, V + 1 + num_big_blanks]. diff --git a/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/reduce.py b/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/reduce.py index 8638f1b4dc4..94307f9abcf 100644 --- a/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/reduce.py +++ b/espnet2/asr/transducer/rnnt_multi_blank/utils/cuda_utils/reduce.py @@ -41,18 +41,14 @@ class I_Op(enum.Enum): - """ - Represents an operation that is performed on the input tensor - """ + """Represents an operation that is performed on the input tensor""" EXPONENTIAL = 0 IDENTITY = 1 class R_Op(enum.Enum): - """ - Represents a reduction operation performed on the input tensor - """ + """Represents a reduction operation performed on the input tensor""" ADD = 0 MAXIMUM = 1 @@ -60,8 +56,7 @@ class R_Op(enum.Enum): @cuda.jit(device=True) def CTAReduce(tid: int, x, storage, count: int, R_opid: int): - """ - CUDA Warp reduction kernel. + """CUDA Warp reduction kernel. It is a device kernel to be called by other kernels. @@ -123,8 +118,7 @@ def CTAReduce(tid: int, x, storage, count: int, R_opid: int): @cuda.jit() def _reduce_rows(I_opid: int, R_opid: int, acts, output, num_rows: int): - """ - CUDA Warp reduction kernel which reduces via the R_Op.Maximum + """CUDA Warp reduction kernel which reduces via the R_Op.Maximum Reduces the input data such that I_Op = Identity and R_op = Maximum. The result is stored in the blockIdx, and is stored as an identity op. @@ -192,8 +186,7 @@ def _reduce_rows(I_opid: int, R_opid: int, acts, output, num_rows: int): @cuda.jit() def _reduce_minus(I_opid: int, R_opid: int, acts, output, num_rows: int): - """ - CUDA Warp reduction kernel which reduces via the R_Op.Add + """CUDA Warp reduction kernel which reduces via the R_Op.Add Reduces the input data such that I_Op = Exponential and R_op = Add. The result is stored in the blockIdx, and is stored as an exp op. @@ -268,8 +261,8 @@ def ReduceHelper( minus: bool, stream, ): - """ - CUDA Warp reduction kernel helper which reduces via the R_Op.Add and writes + """CUDA Warp reduction kernel helper which reduces via the R_Op.Add and writes + the result to `output` according to I_op id. The result is stored in the blockIdx. @@ -314,8 +307,7 @@ def ReduceHelper( def reduce_exp(acts: torch.Tensor, denom, rows: int, cols: int, minus: bool, stream): - """ - Helper method to call the Warp Reduction Kernel to perform `exp` reduction. + """Helper method to call the Warp Reduction Kernel to perform `exp` reduction. Note: Efficient warp occurs at input shapes of 2 ^ K. 
@@ -350,8 +342,7 @@ def reduce_exp(acts: torch.Tensor, denom, rows: int, cols: int, minus: bool, str def reduce_max(acts: torch.Tensor, denom, rows: int, cols: int, minus: bool, stream): - """ - Helper method to call the Warp Reduction Kernel to perform `max` reduction. + """Helper method to call the Warp Reduction Kernel to perform `max` reduction. Note: Efficient warp occurs at input shapes of 2 ^ K. diff --git a/espnet2/asr_transducer/decoder/mega_decoder.py b/espnet2/asr_transducer/decoder/mega_decoder.py index b47c2355ee1..22f4490da3e 100644 --- a/espnet2/asr_transducer/decoder/mega_decoder.py +++ b/espnet2/asr_transducer/decoder/mega_decoder.py @@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr_transducer.activation import get_activation from espnet2.asr_transducer.beam_search_transducer import Hypothesis @@ -46,6 +46,7 @@ class MEGADecoder(AbsDecoder): """ + @typechecked def __init__( self, vocab_size: int, @@ -73,8 +74,6 @@ def __init__( """Construct a MEGADecoder object.""" super().__init__() - assert check_argument_types() - self.embed = torch.nn.Embedding(vocab_size, block_size, padding_idx=embed_pad) self.dropout_embed = torch.nn.Dropout(p=embed_dropout_rate) diff --git a/espnet2/asr_transducer/decoder/modules/rwkv/attention.py b/espnet2/asr_transducer/decoder/modules/rwkv/attention.py index 2436a1fff2f..a43774e3cae 100644 --- a/espnet2/asr_transducer/decoder/modules/rwkv/attention.py +++ b/espnet2/asr_transducer/decoder/modules/rwkv/attention.py @@ -2,14 +2,15 @@ Based/Modified from https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4/src/model.py. -Some variables are renamed according to https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py. +Some variables are renamed according to +https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py. 
-""" # noqa +""" import math from importlib.util import find_spec from pathlib import Path -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch @@ -83,7 +84,7 @@ def backward( """ time_decay, time_first, key, value, output = ctx.saved_tensors - grad_dtype = ctx.input_dtype + grad_dtype = ctx.input_dtype # noqa batch, _, dim = key.size() diff --git a/espnet2/asr_transducer/decoder/rnn_decoder.py b/espnet2/asr_transducer/decoder/rnn_decoder.py index ba96ff94765..4ea358ca854 100644 --- a/espnet2/asr_transducer/decoder/rnn_decoder.py +++ b/espnet2/asr_transducer/decoder/rnn_decoder.py @@ -3,7 +3,7 @@ from typing import List, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr_transducer.beam_search_transducer import Hypothesis from espnet2.asr_transducer.decoder.abs_decoder import AbsDecoder @@ -24,6 +24,7 @@ class RNNDecoder(AbsDecoder): """ + @typechecked def __init__( self, vocab_size: int, @@ -38,8 +39,6 @@ def __init__( """Construct a RNNDecoder object.""" super().__init__() - assert check_argument_types() - if rnn_type not in ("lstm", "gru"): raise ValueError(f"Not supported: rnn_type={rnn_type}") diff --git a/espnet2/asr_transducer/decoder/rwkv_decoder.py b/espnet2/asr_transducer/decoder/rwkv_decoder.py index 82fe7960de5..24e6bd6f5b0 100644 --- a/espnet2/asr_transducer/decoder/rwkv_decoder.py +++ b/espnet2/asr_transducer/decoder/rwkv_decoder.py @@ -1,10 +1,9 @@ """RWKV decoder definition for Transducer models.""" -import math from typing import Dict, List, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr_transducer.beam_search_transducer import Hypothesis from espnet2.asr_transducer.decoder.abs_decoder import AbsDecoder @@ -34,6 +33,7 @@ class RWKVDecoder(AbsDecoder): """ + @typechecked def __init__( self, vocab_size: int, @@ -53,8 +53,6 @@ def __init__( """Construct a RWKVDecoder object.""" super().__init__() - assert check_argument_types() - norm_class, norm_args = get_normalization( normalization_type, **normalization_args ) diff --git a/espnet2/asr_transducer/decoder/stateless_decoder.py b/espnet2/asr_transducer/decoder/stateless_decoder.py index 53521c66ea9..d0aae236811 100644 --- a/espnet2/asr_transducer/decoder/stateless_decoder.py +++ b/espnet2/asr_transducer/decoder/stateless_decoder.py @@ -3,7 +3,7 @@ from typing import Any, List, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr_transducer.beam_search_transducer import Hypothesis from espnet2.asr_transducer.decoder.abs_decoder import AbsDecoder @@ -20,6 +20,7 @@ class StatelessDecoder(AbsDecoder): """ + @typechecked def __init__( self, vocab_size: int, @@ -30,8 +31,6 @@ def __init__( """Construct a StatelessDecoder object.""" super().__init__() - assert check_argument_types() - self.embed = torch.nn.Embedding(vocab_size, embed_size, padding_idx=embed_pad) self.embed_dropout_rate = torch.nn.Dropout(p=embed_dropout_rate) diff --git a/espnet2/asr_transducer/encoder/encoder.py b/espnet2/asr_transducer/encoder/encoder.py index 951bd8bcdbe..c1336955433 100644 --- a/espnet2/asr_transducer/encoder/encoder.py +++ b/espnet2/asr_transducer/encoder/encoder.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr_transducer.encoder.building import ( 
build_body_blocks, @@ -31,6 +31,7 @@ class Encoder(torch.nn.Module): """ + @typechecked def __init__( self, input_size: int, @@ -41,8 +42,6 @@ def __init__( """Construct an Encoder object.""" super().__init__() - assert check_argument_types() - embed_size, output_size = validate_architecture( input_conf, body_conf, input_size ) diff --git a/espnet2/asr_transducer/espnet_transducer_model.py b/espnet2/asr_transducer/espnet_transducer_model.py index ecf6bd073e9..d9b7369995b 100644 --- a/espnet2/asr_transducer/espnet_transducer_model.py +++ b/espnet2/asr_transducer/espnet_transducer_model.py @@ -6,7 +6,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.asr.specaug.abs_specaug import AbsSpecAug @@ -59,6 +59,7 @@ class ESPnetASRTransducerModel(AbsESPnetModel): """ + @typechecked def __init__( self, vocab_size: int, @@ -89,8 +90,6 @@ def __init__( """Construct an ESPnetASRTransducerModel object.""" super().__init__() - assert check_argument_types() - # The following labels ID are reserved: # - 0: Blank symbol. # - 1: Unknown symbol. diff --git a/espnet2/asvspoof/decoder/linear_decoder.py b/espnet2/asvspoof/decoder/linear_decoder.py index 7ee4db91388..b5675a21b02 100644 --- a/espnet2/asvspoof/decoder/linear_decoder.py +++ b/espnet2/asvspoof/decoder/linear_decoder.py @@ -13,17 +13,18 @@ def __init__( encoder_output_size: int, ): super().__init__() - # TODO1 (checkpoint3): initialize a linear projection layer + # TODO(checkpoint3): initialize a linear projection layer def forward(self, input: torch.Tensor, ilens: Optional[torch.Tensor]): """Forward. + Args: input (torch.Tensor): hidden_space [Batch, T, F] ilens (torch.Tensor): input lengths [Batch] """ - # TODO2 (checkpoint3): compute mean over time-domain (dimension 1) + # TODO(checkpoint3): compute mean over time-domain (dimension 1) - # TODO3 (checkpoint3): apply the projection layer + # TODO(checkpoint3): apply the projection layer - # TODO4 (checkpoint3): change the return value + # TODO(checkpoint3): change the return value return None diff --git a/espnet2/asvspoof/espnet_model.py b/espnet2/asvspoof/espnet_model.py index d3eed0bbb6c..44a557a91cc 100644 --- a/espnet2/asvspoof/espnet_model.py +++ b/espnet2/asvspoof/espnet_model.py @@ -1,16 +1,12 @@ # Copyright 2022 Jiatong Shi (Carnegie Mellon University) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -import logging from contextlib import contextmanager -from itertools import permutations from typing import Dict, Optional, Tuple -import numpy as np import torch -import torch.nn.functional as F from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.frontend.abs_frontend import AbsFrontend @@ -21,7 +17,6 @@ from espnet2.layers.abs_normalize import AbsNormalize from espnet2.torch_utils.device_funcs import force_gatherable from espnet2.train.abs_espnet_model import AbsESPnetModel -from espnet.nets.pytorch_backend.nets_utils import to_device if V(torch.__version__) >= V("1.6.0"): from torch.cuda.amp import autocast @@ -34,9 +29,11 @@ def autocast(enabled=True): class ESPnetASVSpoofModel(AbsESPnetModel): """ASV Spoofing model + A simple ASV Spoofing model """ + @typechecked def __init__( self, frontend: Optional[AbsFrontend], @@ -47,7 +44,6 @@ def __init__( decoder: AbsDecoder, losses: Dict[str, 
AbsASVSpoofLoss], ): - assert check_argument_types() super().__init__() @@ -67,6 +63,7 @@ def forward( **kwargs, ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: """Frontend + Encoder + Decoder + Calc loss + Args: speech: (Batch, samples) spk_labels: (Batch, ) @@ -123,6 +120,7 @@ def encode( speech_lengths: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: """Frontend + Encoder + Args: speech: (Batch, Length, ...) speech_lengths: (Batch,) diff --git a/espnet2/asvspoof/loss/abs_loss.py b/espnet2/asvspoof/loss/abs_loss.py index 5d8230914df..426949af446 100644 --- a/espnet2/asvspoof/loss/abs_loss.py +++ b/espnet2/asvspoof/loss/abs_loss.py @@ -27,4 +27,4 @@ def score( self, pred, ) -> torch.Tensor: - raise NotImplemented + raise NotImplementedError diff --git a/espnet2/asvspoof/loss/am_softmax_loss.py b/espnet2/asvspoof/loss/am_softmax_loss.py index a7304aa7afa..83735f992ba 100644 --- a/espnet2/asvspoof/loss/am_softmax_loss.py +++ b/espnet2/asvspoof/loss/am_softmax_loss.py @@ -1,7 +1,6 @@ import torch from espnet2.asvspoof.loss.abs_loss import AbsASVSpoofLoss -from espnet.nets.pytorch_backend.nets_utils import to_device class ASVSpoofAMSoftmaxLoss(AbsASVSpoofLoss): @@ -25,6 +24,7 @@ def __init__( def forward(self, label: torch.Tensor, emb: torch.Tensor, **kwargs): """Forward. + Args: label (torch.Tensor): ground truth label [Batch, 1] emb (torch.Tensor): encoder embedding output [Batch, T, enc_dim] @@ -49,6 +49,7 @@ def forward(self, label: torch.Tensor, emb: torch.Tensor, **kwargs): def score(self, emb: torch.Tensor): """Prediction. + Args: emb (torch.Tensor): encoder embedding output [Batch, T, enc_dim] """ diff --git a/espnet2/asvspoof/loss/binary_loss.py b/espnet2/asvspoof/loss/binary_loss.py index b7f4eda3ca4..b2b920b9c0a 100644 --- a/espnet2/asvspoof/loss/binary_loss.py +++ b/espnet2/asvspoof/loss/binary_loss.py @@ -1,7 +1,6 @@ import torch from espnet2.asvspoof.loss.abs_loss import AbsASVSpoofLoss -from espnet.nets.pytorch_backend.nets_utils import to_device class ASVSpoofBinaryLoss(AbsASVSpoofLoss): @@ -18,6 +17,7 @@ def __init__( def forward(self, pred: torch.Tensor, label: torch.Tensor, **kwargs): """Forward. + Args: pred (torch.Tensor): prediction probability [Batch, 2] label (torch.Tensor): ground truth label [Batch, 2] diff --git a/espnet2/asvspoof/loss/oc_softmax_loss.py b/espnet2/asvspoof/loss/oc_softmax_loss.py index 5fba105afc7..0cbd662759d 100644 --- a/espnet2/asvspoof/loss/oc_softmax_loss.py +++ b/espnet2/asvspoof/loss/oc_softmax_loss.py @@ -1,7 +1,6 @@ import torch from espnet2.asvspoof.loss.abs_loss import AbsASVSpoofLoss -from espnet.nets.pytorch_backend.nets_utils import to_device class ASVSpoofOCSoftmaxLoss(AbsASVSpoofLoss): @@ -27,30 +26,32 @@ def __init__( def forward(self, label: torch.Tensor, emb: torch.Tensor, **kwargs): """Forward. 
+ Args: label (torch.Tensor): ground truth label [Batch, 1] emb (torch.Tensor): encoder embedding output [Batch, T, enc_dim] """ emb = torch.mean(emb, dim=1) - w = torch.nn.functional.normalize(self.center, p=2, dim=1) - x = torch.nn.functional.normalize(emb, p=2, dim=1) + w = torch.nn.functional.normalize(self.center, p=2, dim=1) # noqa + x = torch.nn.functional.normalize(emb, p=2, dim=1) # noqa - # TODO1 (exercise 2): compute scores based on w and x + # TODO(exercise 2): compute scores based on w and x - # TODO2 (exercise 2): calculate the score bias based on m_real and m_fake + # TODO(exercise 2): calculate the score bias based on m_real and m_fake - # TODO3 (exercise 2): apply alpha and softplus + # TODO(exercise 2): apply alpha and softplus - # TODO4 (exercise 2): returnthe final loss + # TODO(exercise 2): returnthe final loss return None def score(self, emb: torch.Tensor): """Prediction. + Args: emb (torch.Tensor): encoder embedding output [Batch, T, enc_dim] """ emb = torch.mean(emb, dim=1) - w = torch.nn.functional.normalize(self.center, p=2, dim=1) - x = torch.nn.functional.normalize(emb, p=2, dim=1) + w = torch.nn.functional.normalize(self.center, p=2, dim=1) # noqa + x = torch.nn.functional.normalize(emb, p=2, dim=1) # noqa - # TODO5 (exercise 2): compute scores + # TODO(exercise 2): compute scores diff --git a/espnet2/bin/asr_align.py b/espnet2/bin/asr_align.py index c66fe3f7971..0ad496d84c2 100755 --- a/espnet2/bin/asr_align.py +++ b/espnet2/bin/asr_align.py @@ -21,7 +21,7 @@ prepare_text, prepare_token_list, ) -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.tasks.asr import ASRTask from espnet2.torch_utils.device_funcs import to_device @@ -173,10 +173,11 @@ class CTCSegmentation: warned_about_misconfiguration = False config = CtcSegmentationParameters() + @typechecked def __init__( self, asr_train_config: Union[Path, str], - asr_model_file: Union[Path, str] = None, + asr_model_file: Union[Path, str, None] = None, fs: int = 16000, ngpu: int = 0, batch_size: int = 1, @@ -217,7 +218,6 @@ def __init__( longer audio files: "auto". **ctc_segmentation_args: Parameters for CTC segmentation. """ - assert check_argument_types() # Basic settings if batch_size > 1: @@ -531,6 +531,7 @@ def prepare_segmentation_task(self, text, lpz, name=None, speech_len=None): return task @staticmethod + @typechecked def get_segments(task: CTCSegmentationTask): """Obtain segments for given utterance texts and CTC log posteriors. @@ -542,7 +543,6 @@ def get_segments(task: CTCSegmentationTask): result: Dictionary with alignments. Combine this with the task object to obtain a human-readable segments representation. """ - assert check_argument_types() assert task.config is not None config = task.config lpz = task.lpz @@ -568,6 +568,7 @@ def get_segments(task: CTCSegmentationTask): } return result + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray], @@ -587,7 +588,6 @@ def __call__( Returns: CTCSegmentationTask object with segments. 
""" - assert check_argument_types() if fs is not None: self.set_config(fs=fs) # Get log CTC posterior probabilities @@ -597,10 +597,10 @@ def __call__( # Apply CTC segmentation segments = self.get_segments(task) task.set(**segments) - assert check_return_type(task) return task +@typechecked def ctc_align( log_level: Union[int, str], asr_train_config: str, @@ -613,7 +613,6 @@ def ctc_align( **kwargs, ): """Provide the scripting interface to align text to audio.""" - assert check_argument_types() logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", diff --git a/espnet2/bin/asr_inference.py b/espnet2/bin/asr_inference.py index d6f4766d0e4..63d93670bd1 100644 --- a/espnet2/bin/asr_inference.py +++ b/espnet2/bin/asr_inference.py @@ -10,7 +10,7 @@ import numpy as np import torch import torch.quantization -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.decoder.hugging_face_transformers_decoder import ( get_hugging_face_model_lm_head, @@ -39,7 +39,6 @@ from espnet.nets.batch_beam_search_online_sim import BatchBeamSearchOnlineSim from espnet.nets.beam_search import BeamSearch, Hypothesis from espnet.nets.beam_search_timesync import BeamSearchTimeSync -from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos from espnet.nets.pytorch_backend.transformer.subsampling import TooShortUttError from espnet.nets.scorer_interface import BatchScorerInterface from espnet.nets.scorers.ctc import CTCPrefixScorer @@ -77,17 +76,18 @@ class Speech2Text: """ + @typechecked def __init__( self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - transducer_conf: dict = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, + asr_train_config: Union[Path, str, None] = None, + asr_model_file: Union[Path, str, None] = None, + transducer_conf: Optional[Dict] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, ngram_scorer: str = "full", - ngram_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + ngram_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, @@ -115,7 +115,6 @@ def __init__( nlp_prompt_token: Optional[str] = None, prompt_token_file: Optional[str] = None, ): - assert check_argument_types() task = ASRTask if not enh_s2t_task else EnhS2TTask @@ -128,8 +127,8 @@ def __init__( "torch version < 1.5.0. Switch to qint8 dtype instead." ) - quantize_modules = set([getattr(torch.nn, q) for q in quantize_modules]) - quantize_dtype = getattr(torch, quantize_dtype) + qconfig_spec = set([getattr(torch.nn, q) for q in quantize_modules]) + quantize_dtype: torch.dtype = getattr(torch, quantize_dtype) # 1. 
Build ASR model scorers = {} @@ -155,7 +154,7 @@ def __init__( logging.info("Use quantized asr model for decoding.") asr_model = torch.quantization.quantize_dynamic( - asr_model, qconfig_spec=quantize_modules, dtype=quantize_dtype + asr_model, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) decoder = asr_model.decoder @@ -178,7 +177,7 @@ def __init__( logging.info("Use quantized lm for decoding.") lm = torch.quantization.quantize_dynamic( - lm, qconfig_spec=quantize_modules, dtype=quantize_dtype + lm, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) scorers["lm"] = lm.lm @@ -460,11 +459,13 @@ def __init__( self.multi_asr = multi_asr @torch.no_grad() + @typechecked def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> Union[ ListOfHypothesis, + List[ListOfHypothesis], Tuple[ ListOfHypothesis, - Optional[Dict[int, List[str]]], + Union[Dict[int, List[str]], None], ], ]: """Inference @@ -475,7 +476,6 @@ def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> Union[ text, token, token_int, hyp """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -512,7 +512,6 @@ def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> Union[ # c. Passed the encoder result and the beam search ret = self._decode_single_sample(enc_spk[0]) - assert check_return_type(ret) results.append(ret) else: @@ -530,14 +529,13 @@ def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> Union[ if intermediate_outs is not None: encoder_interctc_res = self._decode_interctc(intermediate_outs) results = (results, encoder_interctc_res) - assert check_return_type(results) return results + @typechecked def _decode_interctc( self, intermediate_outs: List[Tuple[int, torch.Tensor]] ) -> Dict[int, List[str]]: - assert check_argument_types() exclude_ids = [self.asr_model.blank_id, self.asr_model.sos, self.asr_model.eos] res = {} @@ -552,7 +550,8 @@ def _decode_interctc( return res - def _decode_single_sample(self, enc: torch.Tensor): + @typechecked + def _decode_single_sample(self, enc: torch.Tensor) -> ListOfHypothesis: if self.beam_search_transducer: logging.info("encoder output length: " + str(enc.shape[0])) nbest_hyps = self.beam_search_transducer(enc) @@ -679,6 +678,7 @@ def from_pretrained( return Speech2Text(**kwargs) +@typechecked def inference( output_dir: str, maxlenratio: float, @@ -724,7 +724,6 @@ def inference( nlp_prompt_token: Optional[str], prompt_token_file: Optional[str], ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: @@ -862,7 +861,7 @@ def inference( # Write intermediate predictions to # encoder_interctc_layer.txt - ibest_writer = writer[f"1best_recog"] + ibest_writer = writer["1best_recog"] if encoder_interctc_res is not None: for idx, text in encoder_interctc_res.items(): ibest_writer[f"encoder_interctc_layer{idx}.txt"][key] = ( diff --git a/espnet2/bin/asr_inference_k2.py b/espnet2/bin/asr_inference_k2.py index 830721197b3..4455e22daa2 100755 --- a/espnet2/bin/asr_inference_k2.py +++ b/espnet2/bin/asr_inference_k2.py @@ -10,7 +10,7 @@ import numpy as np import torch import yaml -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.fst.lm_rescore import nbest_am_lm_scores @@ -127,14 +127,15 @@ class k2Speech2Text: """ + @typechecked def __init__( self, asr_train_config: Union[Path, str], - asr_model_file: Union[Path, str] = 
None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + asr_model_file: Union[Path, str, None] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, @@ -163,7 +164,6 @@ def __init__( nbest_batch_size: int = 500, nll_batch_size: int = 100, ): - assert check_argument_types() # 1. Build ASR model asr_model, asr_train_args = ASRTask.build_model_from_file( @@ -229,6 +229,7 @@ def __init__( self.nll_batch_size = nll_batch_size @torch.no_grad() + @typechecked def __call__( self, batch: Dict[str, Union[torch.Tensor, np.ndarray]] ) -> List[Tuple[Optional[str], List[str], List[int], float]]: @@ -240,7 +241,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() if isinstance(batch["speech"], np.ndarray): batch["speech"] = torch.tensor(batch["speech"]) @@ -418,7 +418,6 @@ def __call__( text = self.tokenizer.tokens2text(token) results.append((text, token, token_int, score)) - assert check_return_type(results) return results @staticmethod @@ -452,6 +451,7 @@ def from_pretrained( return k2Speech2Text(**kwargs) +@typechecked def inference( output_dir: str, maxlenratio: float, @@ -488,7 +488,6 @@ def inference( k2_config: Optional[str], ): assert is_ctc_decoding, "Currently, only ctc_decoding graph is supported." - assert check_argument_types() if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") diff --git a/espnet2/bin/asr_inference_maskctc.py b/espnet2/bin/asr_inference_maskctc.py index fb07b3dc3df..07627c3488e 100644 --- a/espnet2/bin/asr_inference_maskctc.py +++ b/espnet2/bin/asr_inference_maskctc.py @@ -7,7 +7,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.maskctc_model import MaskCTCInference from espnet2.fileio.datadir_writer import DatadirWriter @@ -35,19 +35,19 @@ class Speech2Text: """ + @typechecked def __init__( self, asr_train_config: Union[Path, str], - asr_model_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + asr_model_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", batch_size: int = 1, dtype: str = "float32", maskctc_n_iterations: int = 10, maskctc_threshold_probability: float = 0.99, ): - assert check_argument_types() # 1. 
Build ASR model asr_model, asr_train_args = ASRTask.build_model_from_file( @@ -90,6 +90,7 @@ def __init__( self.dtype = dtype @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray] ) -> List[Tuple[Optional[str], List[str], List[int], Hypothesis]]: @@ -101,7 +102,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -141,7 +141,6 @@ def __call__( text = None results = [(text, token, token_int, hyp)] - assert check_return_type(results) return results @staticmethod @@ -175,6 +174,7 @@ def from_pretrained( return Speech2Text(**kwargs) +@typechecked def inference( output_dir: str, batch_size: int, @@ -194,7 +194,6 @@ def inference( maskctc_n_iterations: int, maskctc_threshold_probability: float, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: diff --git a/espnet2/bin/asr_inference_streaming.py b/espnet2/bin/asr_inference_streaming.py index 676cdf34d50..00b0838ef6b 100755 --- a/espnet2/bin/asr_inference_streaming.py +++ b/espnet2/bin/asr_inference_streaming.py @@ -8,7 +8,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.encoder.contextual_block_conformer_encoder import ( # noqa: H301 ContextualBlockConformerEncoder, @@ -49,14 +49,15 @@ class Speech2TextStreaming: """ + @typechecked def __init__( self, asr_train_config: Union[Path, str], - asr_model_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + asr_model_file: Union[Path, str, None] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, @@ -72,7 +73,6 @@ def __init__( decoder_text_length_limit=0, encoded_feat_length_limit=0, ): - assert check_argument_types() # 1. 
Build ASR model scorers = {} @@ -291,6 +291,7 @@ def apply_frontend( return feats, feats_lengths, next_states @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray], is_final: bool = True ) -> List[Tuple[Optional[str], List[str], List[int], Hypothesis]]: @@ -302,7 +303,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -355,10 +355,10 @@ def assemble_hyps(self, hyps): text = None results.append((text, token, token_int, hyp)) - assert check_return_type(results) return results +@typechecked def inference( output_dir: str, maxlenratio: float, @@ -391,7 +391,6 @@ def inference( encoded_feat_length_limit: int, decoder_text_length_limit: int, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: diff --git a/espnet2/bin/asr_transducer_inference.py b/espnet2/bin/asr_transducer_inference.py index 05a4a171088..06610b947bf 100755 --- a/espnet2/bin/asr_transducer_inference.py +++ b/espnet2/bin/asr_transducer_inference.py @@ -13,7 +13,7 @@ import numpy as np import torch from packaging.version import parse as V -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr_transducer.beam_search_transducer import ( BeamSearchTransducer, @@ -58,21 +58,22 @@ class Speech2Text: """ + @typechecked def __init__( self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - beam_search_config: Dict[str, Any] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + asr_train_config: Union[Path, str, None] = None, + asr_model_file: Union[Path, str, None] = None, + beam_search_config: Optional[Dict[str, Any]] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", beam_size: int = 5, dtype: str = "float32", lm_weight: float = 1.0, quantize_asr_model: bool = False, - quantize_modules: List[str] = None, + quantize_modules: Optional[List[str]] = None, quantize_dtype: str = "qint8", nbest: int = 1, streaming: bool = False, @@ -82,8 +83,6 @@ def __init__( """Construct a Speech2Text object.""" super().__init__() - assert check_argument_types() - asr_model, asr_train_args = ASRTransducerTask.build_model_from_file( asr_train_config, asr_model_file, device ) @@ -247,6 +246,7 @@ def streaming_decode( return nbest_hyps @torch.no_grad() + @typechecked def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> List[Hypothesis]: """Speech2Text call. @@ -257,7 +257,6 @@ def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> List[Hypothesis]: nbest_hypothesis: N-best hypothesis. """ - assert check_argument_types() if isinstance(speech, np.ndarray): speech = torch.tensor(speech) @@ -303,8 +302,6 @@ def hypotheses_to_results(self, nbest_hyps: List[Hypothesis]) -> List[Any]: text = None results.append((text, token, token_int, hyp)) - assert check_return_type(results) - return results @staticmethod @@ -337,6 +334,7 @@ def from_pretrained( return Speech2Text(**kwargs) +@typechecked def inference( output_dir: str, batch_size: int, @@ -401,7 +399,6 @@ def inference( display_hypotheses: Whether to display (partial and full) hypotheses. 
""" - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") diff --git a/espnet2/bin/asvspoof_inference.py b/espnet2/bin/asvspoof_inference.py index 2885e4b1e97..4c8260f7866 100644 --- a/espnet2/bin/asvspoof_inference.py +++ b/espnet2/bin/asvspoof_inference.py @@ -2,14 +2,13 @@ import argparse import logging import sys -from distutils.version import LooseVersion from pathlib import Path -from typing import Any, List, Optional, Sequence, Tuple, Union +from typing import Optional, Sequence, Tuple, Union import numpy as np import torch import torch.quantization -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.tasks.asvspoof import ASVSpoofTask @@ -23,6 +22,7 @@ class SpeechAntiSpoof: """SpeechAntiSpoof class + Examples: >>> import soundfile >>> speech_anti_spoof = SpeechAntiSpoof("asvspoof_config.yml", "asvspoof.pth") @@ -31,15 +31,15 @@ class SpeechAntiSpoof: prediction_result (int) """ + @typechecked def __init__( self, - asvspoof_train_config: Union[Path, str] = None, - asvspoof_model_file: Union[Path, str] = None, + asvspoof_train_config: Union[Path, str, None] = None, + asvspoof_model_file: Union[Path, str, None] = None, device: str = "cpu", batch_size: int = 1, dtype: str = "float32", ): - assert check_argument_types() asvspoof_model, asvspoof_train_args = ASVSpoofTask.build_model_from_file( asvspoof_train_config, asvspoof_model_file, device @@ -52,14 +52,15 @@ def __init__( self.dtype = dtype @torch.no_grad() + @typechecked def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> float: """Inference + Args: data: Input speech data Returns: [prediction, scores] """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -75,17 +76,18 @@ def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> float: # To device batch = to_device(batch, device=self.device) - # TODO1 (checkpoint 4): Forward feature extraction and encoder etc. + # TODO(checkpoint 4): Forward feature extraction and encoder etc. if "oc_softmax_loss" in self.asvspoof_model.losses: - pass # TODO1 (exercise2): use loss score function to estimate score + pass # TODO(exercise2): use loss score function to estimate score else: - pass # TODO2 (checkpoint 4): Pass the encoder result to decoder + pass # TODO(checkpoint 4): Pass the encoder result to decoder - # TODO3 (checkpoint 4): return the prediction score + # TODO(checkpoint 4): return the prediction score return None +@typechecked def inference( output_dir: str, batch_size: int, @@ -100,7 +102,6 @@ def inference( asvspoof_model_file: Optional[str], allow_variable_data_keys: bool, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: @@ -168,7 +169,7 @@ def inference( key = keys[0] # Create a directory: outdir/{n}best_recog - result_writer = writer[f"prediction"] + result_writer = writer["prediction"] # Write the result to each file result_writer["score"][key] = str(score) diff --git a/espnet2/bin/asvspoof_train.py b/espnet2/bin/asvspoof_train.py index c6b4b09ab77..8682085114c 100644 --- a/espnet2/bin/asvspoof_train.py +++ b/espnet2/bin/asvspoof_train.py @@ -9,6 +9,7 @@ def get_parser(): def main(cmd=None): r"""ASVSpoof training. 
+ Example: % python asvspoof_train.py asr --print_config --optim adadelta \ > conf/train_asvspoof.yaml diff --git a/espnet2/bin/diar_inference.py b/espnet2/bin/diar_inference.py index 1698b1804d4..85f2156bd34 100755 --- a/espnet2/bin/diar_inference.py +++ b/espnet2/bin/diar_inference.py @@ -11,7 +11,7 @@ import torch import torch.nn.functional as F from tqdm import trange -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.enh.loss.criterions.tf_domain import FrequencyDomainMSE from espnet2.enh.loss.criterions.time_domain import SISNRLoss @@ -45,10 +45,11 @@ class DiarizeSpeech: """ + @typechecked def __init__( self, - train_config: Union[Path, str] = None, - model_file: Union[Path, str] = None, + train_config: Union[Path, str, None] = None, + model_file: Union[Path, str, None] = None, segment_size: Optional[float] = None, hop_size: Optional[float] = None, normalize_segment_scale: bool = False, @@ -60,7 +61,6 @@ def __init__( enh_s2t_task: bool = False, multiply_diar_result: bool = False, ): - assert check_argument_types() task = DiarizationTask if not enh_s2t_task else EnhS2TTask @@ -120,9 +120,10 @@ def __init__( logging.info("Perform direct speaker diarization on the input") @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray], fs: int = 8000 - ) -> List[torch.Tensor]: + ) -> Union[List[torch.Tensor], Tuple]: """Inference Args: @@ -132,7 +133,6 @@ def __call__( [speaker_info1, speaker_info2, ...] """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -462,6 +462,7 @@ def decode(self, encoder_out, encoder_out_lens): return spk_prediction, num_spk +@typechecked def inference( output_dir: str, batch_size: int, @@ -486,7 +487,6 @@ def inference( multiply_diar_result: bool, enh_s2t_task: bool, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: diff --git a/espnet2/bin/enh_inference.py b/espnet2/bin/enh_inference.py index 5ea097d748e..0711e860eaf 100755 --- a/espnet2/bin/enh_inference.py +++ b/espnet2/bin/enh_inference.py @@ -11,7 +11,7 @@ import torch import yaml from tqdm import trange -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.enh.diffusion_enh import ESPnetDiffusionModel from espnet2.enh.loss.criterions.tf_domain import FrequencyDomainMSE @@ -95,11 +95,12 @@ class SeparateSpeech: """ + @typechecked def __init__( self, - train_config: Union[Path, str] = None, - model_file: Union[Path, str] = None, - inference_config: Union[Path, str] = None, + train_config: Union[Path, str, None] = None, + model_file: Union[Path, str, None] = None, + inference_config: Union[Path, str, None] = None, segment_size: Optional[float] = None, hop_size: Optional[float] = None, normalize_segment_scale: bool = False, @@ -110,7 +111,6 @@ def __init__( dtype: str = "float32", enh_s2t_task: bool = False, ): - assert check_argument_types() task = EnhancementTask if not enh_s2t_task else EnhS2TTask @@ -191,9 +191,10 @@ def __init__( logging.info("Perform direct speech %s on the input" % task) @torch.no_grad() + @typechecked def __call__( self, speech_mix: Union[torch.Tensor, np.ndarray], fs: int = 8000, **kwargs - ) -> List[torch.Tensor]: + ) -> List[Union[torch.Tensor, np.array]]: """Inference Args: @@ -203,7 +204,6 @@ def __call__( [separated_audio1, separated_audio2, ...] 
""" - assert check_argument_types() # Input as audio signal if isinstance(speech_mix, np.ndarray): @@ -426,6 +426,7 @@ def humanfriendly_or_none(value: str): return humanfriendly.parse_size(value) +@typechecked def inference( output_dir: str, batch_size: int, @@ -450,7 +451,6 @@ def inference( normalize_output_wav: bool, enh_s2t_task: bool, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: @@ -507,7 +507,7 @@ def inference( ) # 4. Start for-loop - output_dir = Path(output_dir).expanduser().resolve() + output_dir: Path = Path(output_dir).expanduser().resolve() writers = [] for i in range(separate_speech.num_spk): writers.append( diff --git a/espnet2/bin/enh_inference_streaming.py b/espnet2/bin/enh_inference_streaming.py index 26af8b9ab02..85b6d75ba64 100755 --- a/espnet2/bin/enh_inference_streaming.py +++ b/espnet2/bin/enh_inference_streaming.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse import logging -import math import sys from itertools import chain from pathlib import Path @@ -10,9 +9,8 @@ import humanfriendly import numpy as np import torch -import torch_complex import yaml -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.bin.enh_inference import ( build_model_from_args_and_file, @@ -24,7 +22,6 @@ from espnet2.tasks.enh_s2t import EnhS2TTask from espnet2.torch_utils.device_funcs import to_device from espnet2.torch_utils.set_all_random_seed import set_all_random_seed -from espnet2.train.abs_espnet_model import AbsESPnetModel from espnet2.utils import config_argparse from espnet2.utils.types import str2bool, str2triple_str, str_or_none from espnet.utils.cli_utils import get_commandline_args @@ -54,17 +51,17 @@ class SeparateSpeechStreaming: >>> for chunks in output_chunks ] """ + @typechecked def __init__( self, - train_config: Union[Path, str] = None, - model_file: Union[Path, str] = None, - inference_config: Union[Path, str] = None, + train_config: Union[Path, str, None] = None, + model_file: Union[Path, str, None] = None, + inference_config: Union[Path, str, None] = None, ref_channel: Optional[int] = None, device: str = "cpu", dtype: str = "float32", enh_s2t_task: bool = False, ): - assert check_argument_types() task = EnhancementTask if not enh_s2t_task else EnhS2TTask @@ -135,6 +132,7 @@ def reset(self): self.streaming_states = None @torch.no_grad() + @typechecked def __call__( self, speech_mix: Union[torch.Tensor, np.ndarray], fs: int = 8000 ) -> List[torch.Tensor]: @@ -147,7 +145,6 @@ def __call__( [separated_audio1, separated_audio2, ...] 
""" - assert check_argument_types() # Input as audio signal if isinstance(speech_mix, np.ndarray): @@ -218,6 +215,7 @@ def humanfriendly_or_none(value: str): return humanfriendly.parse_size(value) +@typechecked def inference( output_dir: str, batch_size: int, @@ -237,7 +235,6 @@ def inference( ref_channel: Optional[int], enh_s2t_task: bool, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: diff --git a/espnet2/bin/enh_scoring.py b/espnet2/bin/enh_scoring.py index 042fc42d461..fd9f4b4e28d 100755 --- a/espnet2/bin/enh_scoring.py +++ b/espnet2/bin/enh_scoring.py @@ -10,7 +10,7 @@ import torch from mir_eval.separation import bss_eval_sources from pystoi import stoi -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.enh.loss.criterions.time_domain import SISNRLoss from espnet2.fileio.datadir_writer import DatadirWriter @@ -47,6 +47,7 @@ def read_audio(reader, key, audio_format="sound"): raise ValueError(f"Unknown audio format: {audio_format}") +@typechecked def scoring( output_dir: str, dtype: str, @@ -61,7 +62,6 @@ def scoring( dnsmos_args: Dict, use_pesq: bool, ): - assert check_argument_types() logging.basicConfig( level=log_level, diff --git a/espnet2/bin/enh_tse_inference.py b/espnet2/bin/enh_tse_inference.py index eec439e8fb7..8964cf91e20 100755 --- a/espnet2/bin/enh_tse_inference.py +++ b/espnet2/bin/enh_tse_inference.py @@ -11,7 +11,7 @@ import torch import yaml from tqdm import trange -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.enh.loss.criterions.tf_domain import FrequencyDomainMSE from espnet2.enh.loss.criterions.time_domain import SISNRLoss @@ -93,11 +93,12 @@ class SeparateSpeech: """ + @typechecked def __init__( self, - train_config: Union[Path, str] = None, - model_file: Union[Path, str] = None, - inference_config: Union[Path, str] = None, + train_config: Union[Path, str, None] = None, + model_file: Union[Path, str, None] = None, + inference_config: Union[Path, str, None] = None, segment_size: Optional[float] = None, hop_size: Optional[float] = None, normalize_segment_scale: bool = False, @@ -107,7 +108,6 @@ def __init__( device: str = "cpu", dtype: str = "float32", ): - assert check_argument_types() # 1. Build Enh model if inference_config is None: @@ -180,9 +180,10 @@ def __init__( logging.info("Perform direct speech %s on the input" % task) @torch.no_grad() + @typechecked def __call__( self, speech_mix: Union[torch.Tensor, np.ndarray], fs: int = 8000, **kwargs - ) -> List[torch.Tensor]: + ) -> List[Union[torch.Tensor, np.array]]: """Inference Args: @@ -195,7 +196,6 @@ def __call__( [separated_audio1, separated_audio2, ...] """ - assert check_argument_types() enroll_ref = [ # (Batch, samples_aux) @@ -415,6 +415,7 @@ def humanfriendly_or_none(value: str): return humanfriendly.parse_size(value) +@typechecked def inference( output_dir: str, batch_size: int, @@ -438,7 +439,6 @@ def inference( ref_channel: Optional[int], normalize_output_wav: bool, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: @@ -509,7 +509,7 @@ def inference( ) # 4. 
Start for-loop - output_dir = Path(output_dir).expanduser().resolve() + output_dir: Path = Path(output_dir).expanduser().resolve() writers = [] for i in range(separate_speech.num_spk): writers.append( diff --git a/espnet2/bin/hugging_face_export_vocabulary.py b/espnet2/bin/hugging_face_export_vocabulary.py index 84ffeb3bb13..9b09cfeb59a 100755 --- a/espnet2/bin/hugging_face_export_vocabulary.py +++ b/espnet2/bin/hugging_face_export_vocabulary.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import List -from typeguard import check_argument_types +from typeguard import typechecked from espnet.utils.cli_utils import get_commandline_args @@ -17,13 +17,13 @@ is_transformers_available = False +@typechecked def export_vocabulary( output: str, model_name_or_path: str, log_level: str, add_symbol: List[str], ): - assert check_argument_types() if not is_transformers_available: raise ImportError( diff --git a/espnet2/bin/lm_calc_perplexity.py b/espnet2/bin/lm_calc_perplexity.py index be7b99c634a..400f3d03578 100755 --- a/espnet2/bin/lm_calc_perplexity.py +++ b/espnet2/bin/lm_calc_perplexity.py @@ -8,7 +8,7 @@ import numpy as np import torch from torch.nn.parallel import data_parallel -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.tasks.lm import LMTask @@ -20,6 +20,7 @@ from espnet.utils.cli_utils import get_commandline_args +@typechecked def calc_perplexity( output_dir: str, batch_size: int, @@ -35,7 +36,6 @@ def calc_perplexity( log_base: Optional[float], allow_variable_data_keys: bool, ): - assert check_argument_types() logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", diff --git a/espnet2/bin/lm_inference.py b/espnet2/bin/lm_inference.py index 879b84b889c..9fccd7bab67 100644 --- a/espnet2/bin/lm_inference.py +++ b/espnet2/bin/lm_inference.py @@ -3,19 +3,18 @@ import logging import sys from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, List, Optional, Sequence, Tuple, Union import numpy as np import torch import torch.quantization -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.tasks.lm import LMTask from espnet2.text.build_tokenizer import build_tokenizer from espnet2.text.token_id_converter import TokenIDConverter from espnet2.text.whisper_token_id_converter import OpenAIWhisperTokenIDConverter -from espnet2.torch_utils.device_funcs import to_device from espnet2.torch_utils.set_all_random_seed import set_all_random_seed from espnet2.utils import config_argparse from espnet2.utils.types import str2bool, str2triple_str, str_or_none @@ -52,14 +51,15 @@ class GenerateText: """ + @typechecked def __init__( self, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, ngram_scorer: str = "full", - ngram_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + ngram_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlen: int = 100, minlen: int = 0, @@ -73,7 +73,6 @@ def __init__( quantize_modules: List[str] = ["Linear"], quantize_dtype: str = "qint8", ): - assert check_argument_types() # 1. 
Build language model lm, lm_train_args = LMTask.build_model_from_file( @@ -192,6 +191,7 @@ def __init__( self.nbest = nbest @torch.no_grad() + @typechecked def __call__(self, text: Union[str, torch.Tensor, np.ndarray]) -> ListOfHypothesis: """Inference @@ -204,7 +204,6 @@ def __call__(self, text: Union[str, torch.Tensor, np.ndarray]) -> ListOfHypothes List of (text, token, token_int, hyp) """ - assert check_argument_types() if isinstance(text, str): tokens = self.tokenizer.text2tokens(text) @@ -240,12 +239,11 @@ def __call__(self, text: Union[str, torch.Tensor, np.ndarray]) -> ListOfHypothes # Change integer-ids to tokens token = self.converter.ids2tokens(token_int) - text = None + _text = None if self.tokenizer is not None: - text = self.tokenizer.tokens2text(token) - results.append((text, token, token_int, hyp)) + _text = self.tokenizer.tokens2text(token) + results.append((_text, token, token_int, hyp)) - assert check_return_type(results) return results @staticmethod @@ -279,6 +277,7 @@ def from_pretrained( return GenerateText(**kwargs) +@typechecked def inference( output_dir: str, maxlen: int, @@ -308,7 +307,6 @@ def inference( quantize_modules: List[str], quantize_dtype: str, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: diff --git a/espnet2/bin/mt_inference.py b/espnet2/bin/mt_inference.py index ae3f652054f..8f77bb523e6 100755 --- a/espnet2/bin/mt_inference.py +++ b/espnet2/bin/mt_inference.py @@ -7,7 +7,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.tasks.lm import LMTask @@ -37,16 +37,17 @@ class Text2Text: """ + @typechecked def __init__( self, - mt_train_config: Union[Path, str] = None, - mt_model_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, + mt_train_config: Union[Path, str, None] = None, + mt_model_file: Union[Path, str, None] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, ngram_scorer: str = "full", - ngram_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + ngram_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, @@ -60,7 +61,6 @@ def __init__( nbest: int = 1, normalize_length: bool = False, ): - assert check_argument_types() # 1. 
Build MT model scorers = {} @@ -175,6 +175,7 @@ def __init__( self.nbest = nbest @torch.no_grad() + @typechecked def __call__( self, src_text: Union[torch.Tensor, np.ndarray] ) -> List[Tuple[Optional[str], List[str], List[int], Hypothesis]]: @@ -186,7 +187,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() # Input as audio signal if isinstance(src_text, np.ndarray): @@ -237,7 +237,6 @@ def __call__( text = None results.append((text, token, token_int, hyp)) - assert check_return_type(results) return results @staticmethod @@ -270,6 +269,7 @@ def from_pretrained( return Text2Text(**kwargs) +@typechecked def inference( output_dir: str, maxlenratio: float, @@ -301,7 +301,6 @@ def inference( bpemodel: Optional[str], allow_variable_data_keys: bool, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: diff --git a/espnet2/bin/s2st_inference.py b/espnet2/bin/s2st_inference.py index 23bf00162b5..c463761783e 100755 --- a/espnet2/bin/s2st_inference.py +++ b/espnet2/bin/s2st_inference.py @@ -14,7 +14,7 @@ import soundfile as sf import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.fileio.npy_scp import NpyScpWriter @@ -27,7 +27,6 @@ from espnet2.utils.types import str2bool, str2triple_str, str_or_none from espnet.nets.batch_beam_search import BatchBeamSearch from espnet.nets.beam_search import BeamSearch, Hypothesis -from espnet.nets.pytorch_backend.transformer.subsampling import TooShortUttError from espnet.nets.scorer_interface import BatchScorerInterface from espnet.nets.scorers.length_bonus import LengthBonus from espnet.utils.cli_utils import get_commandline_args @@ -36,10 +35,11 @@ class Speech2Speech: """Speech2Speech class.""" + @typechecked def __init__( self, - train_config: Union[Path, str] = None, - model_file: Union[Path, str] = None, + train_config: Union[Path, str, None] = None, + model_file: Union[Path, str, None] = None, threshold: float = 0.5, minlenratio: float = 0.0, maxlenratio: float = 10.0, @@ -56,10 +56,10 @@ def __init__( st_subtask_beam_size: int = 5, st_subtask_penalty: float = 0.0, st_subtask_nbest: int = 1, - st_subtask_token_type: str = None, - st_subtask_bpemodel: str = None, - vocoder_config: Union[Path, str] = None, - vocoder_file: Union[Path, str] = None, + st_subtask_token_type: Optional[str] = None, + st_subtask_bpemodel: Optional[str] = None, + vocoder_config: Union[Path, str, None] = None, + vocoder_file: Union[Path, str, None] = None, dtype: str = "float32", device: str = "cpu", seed: int = 777, @@ -67,7 +67,6 @@ def __init__( prefer_normalized_feats: bool = False, ): """Initialize Speech2Speech module.""" - assert check_argument_types() # setup model model, train_args = S2STTask.build_model_from_file( @@ -214,7 +213,7 @@ def __init__( if st_subtask_token_type is None: st_subtask_token_type = train_args.tgt_token_type elif st_subtask_token_type == "bpe": - if st_subtask_tokenizer is not None: + if st_subtask_bpemodel is not None: self.st_subtask_tokenizer = build_tokenizer( token_type=st_subtask_token_type, bpemodel=st_subtask_bpemodel, @@ -235,19 +234,19 @@ def __init__( ) @torch.no_grad() + @typechecked def __call__( self, src_speech: Union[torch.Tensor, np.ndarray], - src_speech_lengths: Union[torch.Tensor, np.ndarray] = None, - tgt_speech: Union[torch.Tensor, np.ndarray] = None, - 
tgt_speech_lengths: Union[torch.Tensor, np.ndarray] = None, - spembs: Union[torch.Tensor, np.ndarray] = None, - sids: Union[torch.Tensor, np.ndarray] = None, - lids: Union[torch.Tensor, np.ndarray] = None, + src_speech_lengths: Union[torch.Tensor, np.ndarray, None] = None, + tgt_speech: Union[torch.Tensor, np.ndarray, None] = None, + tgt_speech_lengths: Union[torch.Tensor, np.ndarray, None] = None, + spembs: Union[torch.Tensor, np.ndarray, None] = None, + sids: Union[torch.Tensor, np.ndarray, None] = None, + lids: Union[torch.Tensor, np.ndarray, None] = None, decode_conf: Optional[Dict[str, Any]] = None, ) -> Dict[str, torch.Tensor]: """Run speech-to-speech.""" - assert check_argument_types() # check inputs if self.use_speech and tgt_speech is None: @@ -510,6 +509,7 @@ def from_pretrained( return Speech2Speech(**kwargs) +@typechecked def inference( output_dir: str, batch_size: int, @@ -547,7 +547,6 @@ def inference( vocoder_tag: Optional[str], ): """Run text-to-speech inference.""" - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: @@ -613,7 +612,7 @@ def inference( ) # 6. Start for-loop - output_dir = Path(output_dir) + output_dir: Path = Path(output_dir) (output_dir / "norm").mkdir(parents=True, exist_ok=True) (output_dir / "denorm").mkdir(parents=True, exist_ok=True) (output_dir / "speech_shape").mkdir(parents=True, exist_ok=True) @@ -745,12 +744,14 @@ def inference( ) if output_dict.get("st_subtask_token") is not None: - writer["token"][key] = " ".join(output_dict["st_subtask_token"]) - writer["token_int"][key] == " ".join( + st_subtask_wrtier["token"][key] = " ".join( + output_dict["st_subtask_token"] + ) + st_subtask_wrtier["token_int"][key] = " ".join( map(str, output_dict["st_subtask_token_int"]) ) if output_dict.get("st_subtask_text") is not None: - writer["text"][key] = output_dict["st_subtask_text"] + st_subtask_wrtier["text"][key] = output_dict["st_subtask_text"] # remove files if those are not included in output dict if output_dict.get("feat_gen") is None: diff --git a/espnet2/bin/s2t_inference.py b/espnet2/bin/s2t_inference.py index f969172c48b..15734dec5c1 100644 --- a/espnet2/bin/s2t_inference.py +++ b/espnet2/bin/s2t_inference.py @@ -10,7 +10,7 @@ import torch import torch.nn.functional as F import torch.quantization -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.decoder.s4_decoder import S4Decoder from espnet2.fileio.datadir_writer import DatadirWriter @@ -158,16 +158,17 @@ class Speech2Text: """ + @typechecked def __init__( self, - s2t_train_config: Union[Path, str] = None, - s2t_model_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, + s2t_train_config: Union[Path, str, None] = None, + s2t_model_file: Union[Path, str, None] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, ngram_scorer: str = "full", - ngram_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + ngram_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, @@ -189,13 +190,12 @@ def __init__( task_sym: str = "", predict_time: bool = False, ): - assert check_argument_types() if ctc_weight > 0.0 and predict_time: raise ValueError("CTC cannot predict timestamps") - quantize_modules = set([getattr(torch.nn,
q) for q in quantize_modules]) - quantize_dtype = getattr(torch, quantize_dtype) + qconfig_spec = set([getattr(torch.nn, q) for q in quantize_modules]) + quantize_dtype: torch.dtype = getattr(torch, quantize_dtype) # 1. Build S2T model s2t_model, s2t_train_args = S2TTask.build_model_from_file( @@ -207,7 +207,7 @@ def __init__( logging.info("Use quantized s2t model for decoding.") s2t_model = torch.quantization.quantize_dynamic( - s2t_model, qconfig_spec=quantize_modules, dtype=quantize_dtype + s2t_model, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) decoder = s2t_model.decoder @@ -243,7 +243,7 @@ def __init__( logging.info("Use quantized lm for decoding.") lm = torch.quantization.quantize_dynamic( - lm, qconfig_spec=quantize_modules, dtype=quantize_dtype + lm, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) scorers["lm"] = lm.lm @@ -350,10 +350,11 @@ def __init__( self.predict_time = predict_time @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray], - text_prev: Optional[Union[torch.Tensor, np.ndarray, str]] = None, + text_prev: Optional[Union[torch.Tensor, np.ndarray, str, List]] = None, lang_sym: Optional[str] = None, task_sym: Optional[str] = None, predict_time: Optional[bool] = None, @@ -377,7 +378,6 @@ def __call__( n-best list of (text, token, token_int, text_nospecial, hyp) """ - assert check_argument_types() lang_sym = lang_sym if lang_sym is not None else self.lang_sym task_sym = task_sym if task_sym is not None else self.task_sym @@ -457,8 +457,6 @@ def __call__( encoder_interctc_res = self._decode_interctc(intermediate_outs) results = (results, encoder_interctc_res) - assert check_return_type(results) - return results def _decode_single_sample(self, enc: torch.Tensor): @@ -504,10 +502,10 @@ def _decode_single_sample(self, enc: torch.Tensor): return results + @typechecked def _decode_interctc( self, intermediate_outs: List[Tuple[int, torch.Tensor]] ) -> Dict[int, List[str]]: - assert check_argument_types() exclude_ids = [self.s2t_model.blank_id, self.s2t_model.sos, self.s2t_model.eos] res = {} @@ -523,6 +521,7 @@ def _decode_interctc( return res @torch.no_grad() + @typechecked def decode_long( self, speech: Union[torch.Tensor, np.ndarray], @@ -547,8 +546,6 @@ def decode_long( """ - assert check_argument_types() - lang_sym = lang_sym if lang_sym is not None else self.lang_sym task_sym = task_sym if task_sym is not None else self.task_sym segment_len = int( @@ -684,6 +681,7 @@ def from_pretrained( return Speech2Text(**kwargs) +@typechecked def inference( output_dir: str, maxlenratio: float, @@ -722,7 +720,6 @@ def inference( task_sym: str, predict_time: bool, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: @@ -835,7 +832,7 @@ def inference( # Write intermediate predictions to # encoder_interctc_layer.txt - ibest_writer = writer[f"1best_recog"] + ibest_writer = writer["1best_recog"] if encoder_interctc_res is not None: for idx, text in encoder_interctc_res.items(): ibest_writer[f"encoder_interctc_layer{idx}.txt"][key] = " ".join( diff --git a/espnet2/bin/s2t_inference_language.py b/espnet2/bin/s2t_inference_language.py index c9b0296335a..7b68e3c0ba3 100644 --- a/espnet2/bin/s2t_inference_language.py +++ b/espnet2/bin/s2t_inference_language.py @@ -3,13 +3,13 @@ import logging import sys from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, List, Optional, Sequence, 
Tuple, Union import numpy as np import torch import torch.nn.functional as F import torch.quantization -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.tasks.s2t import S2TTask @@ -22,10 +22,11 @@ class Speech2Language: + @typechecked def __init__( self, - s2t_train_config: Union[Path, str] = None, - s2t_model_file: Union[Path, str] = None, + s2t_train_config: Union[Path, str, None] = None, + s2t_model_file: Union[Path, str, None] = None, device: str = "cpu", batch_size: int = 1, dtype: str = "float32", @@ -36,10 +37,9 @@ def __init__( first_lang_sym: str = "", last_lang_sym: str = "", ): - assert check_argument_types() - quantize_modules = set([getattr(torch.nn, q) for q in quantize_modules]) - quantize_dtype = getattr(torch, quantize_dtype) + qconfig_spec = set([getattr(torch.nn, q) for q in quantize_modules]) + quantize_dtype: torch.dtype = getattr(torch, quantize_dtype) s2t_model, s2t_train_args = S2TTask.build_model_from_file( s2t_train_config, s2t_model_file, device @@ -50,7 +50,7 @@ def __init__( logging.info("Use quantized s2t model for decoding.") s2t_model = torch.quantization.quantize_dynamic( - s2t_model, qconfig_spec=quantize_modules, dtype=quantize_dtype + s2t_model, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) logging.info(f"Decoding device={device}, dtype={dtype}") @@ -67,6 +67,7 @@ def __init__( self.last_lang_id = token_list.index(last_lang_sym) @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray], @@ -84,8 +85,6 @@ def __call__( """ - assert check_argument_types() - # Prepare speech if isinstance(speech, np.ndarray): speech = torch.tensor(speech) @@ -136,7 +135,6 @@ def __call__( (self.s2t_model.token_list[idx + self.first_lang_id], val.item()) ) - assert check_return_type(results) return results @staticmethod @@ -170,6 +168,7 @@ def from_pretrained( return Speech2Language(**kwargs) +@typechecked def inference( output_dir: str, batch_size: int, @@ -191,7 +190,6 @@ def inference( first_lang_sym: str, last_lang_sym: str, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: diff --git a/espnet2/bin/slu_inference.py b/espnet2/bin/slu_inference.py index 93da44cffcc..773ef6946f8 100644 --- a/espnet2/bin/slu_inference.py +++ b/espnet2/bin/slu_inference.py @@ -9,7 +9,7 @@ import numpy as np import torch import torch.quantization -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.transducer.beam_search_transducer import BeamSearchTransducer from espnet2.asr.transducer.beam_search_transducer import ( @@ -47,17 +47,18 @@ class Speech2Understand: """ + @typechecked def __init__( self, - slu_train_config: Union[Path, str] = None, - slu_model_file: Union[Path, str] = None, - transducer_conf: dict = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, + slu_train_config: Union[Path, str, None] = None, + slu_model_file: Union[Path, str, None] = None, + transducer_conf: Optional[dict] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, ngram_scorer: str = "full", - ngram_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + ngram_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0,
minlenratio: float = 0.0, @@ -76,7 +77,6 @@ def __init__( quantize_modules: List[str] = ["Linear"], quantize_dtype: str = "qint8", ): - assert check_argument_types() task = SLUTask @@ -89,8 +89,8 @@ def __init__( "torch version < 1.5.0. Switch to qint8 dtype instead." ) - quantize_modules = set([getattr(torch.nn, q) for q in quantize_modules]) - quantize_dtype = getattr(torch, quantize_dtype) + qconfig_spec = set([getattr(torch.nn, q) for q in quantize_modules]) + quantize_dtype: torch.dtype = getattr(torch, quantize_dtype) # 1. Build ASR model scorers = {} @@ -103,7 +103,7 @@ def __init__( logging.info("Use quantized asr model for decoding.") asr_model = torch.quantization.quantize_dynamic( - asr_model, qconfig_spec=quantize_modules, dtype=quantize_dtype + asr_model, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) decoder = asr_model.decoder @@ -126,7 +126,7 @@ def __init__( logging.info("Use quantized lm for decoding.") lm = torch.quantization.quantize_dynamic( - lm, qconfig_spec=quantize_modules, dtype=quantize_dtype + lm, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) scorers["lm"] = lm.lm @@ -240,8 +240,11 @@ def __init__( self.nbest = nbest @torch.no_grad() + @typechecked def __call__( - self, speech: Union[torch.Tensor, np.ndarray], transcript: torch.Tensor = None + self, + speech: Union[torch.Tensor, np.ndarray], + transcript: Optional[torch.Tensor] = None, ) -> List[ Tuple[ Optional[str], @@ -258,7 +261,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -337,7 +339,6 @@ def __call__( text = None results.append((text, token, token_int, hyp)) - assert check_return_type(results) return results @staticmethod @@ -371,6 +372,7 @@ def from_pretrained( return Speech2Understand(**kwargs) +@typechecked def inference( output_dir: str, maxlenratio: float, @@ -408,7 +410,6 @@ def inference( quantize_modules: List[str], quantize_dtype: str, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: diff --git a/espnet2/bin/spk_embed_extract.py b/espnet2/bin/spk_embed_extract.py index 3d7e44f541c..d4571ec3ee3 100755 --- a/espnet2/bin/spk_embed_extract.py +++ b/espnet2/bin/spk_embed_extract.py @@ -9,7 +9,6 @@ import numpy as np import torch from torch.multiprocessing.spawn import ProcessContext -from typeguard import check_argument_types, check_return_type from espnet2.samplers.build_batch_sampler import BATCH_TYPES from espnet2.tasks.spk import SpeakerTask @@ -22,7 +21,7 @@ get_num_nodes, resolve_distributed_mode, ) -from espnet2.train.reporter import Reporter, SubReporter +from espnet2.train.reporter import Reporter from espnet2.utils import config_argparse from espnet2.utils.build_dataclass import build_dataclass from espnet2.utils.nested_dict_action import NestedDictAction diff --git a/espnet2/bin/spk_inference.py b/espnet2/bin/spk_inference.py index a519cffa843..b0cb9784b12 100755 --- a/espnet2/bin/spk_inference.py +++ b/espnet2/bin/spk_inference.py @@ -2,21 +2,19 @@ import argparse import logging import sys -from distutils.version import LooseVersion -from itertools import groupby from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Optional, Sequence, Tuple, Union import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.npy_scp import 
NpyScpWriter from espnet2.tasks.spk import SpeakerTask from espnet2.torch_utils.device_funcs import to_device from espnet2.torch_utils.set_all_random_seed import set_all_random_seed from espnet2.utils import config_argparse -from espnet2.utils.types import str2bool, str2triple_str, str_or_none +from espnet2.utils.types import str2triple_str, str_or_none from espnet.utils.cli_utils import get_commandline_args @@ -31,15 +29,15 @@ class Speech2Embedding: """ + @typechecked def __init__( self, - train_config: Union[Path, str] = None, - model_file: Union[Path, str] = None, + train_config: Union[Path, str, None] = None, + model_file: Union[Path, str, None] = None, device: str = "cpu", dtype: str = "float32", batch_size: int = 1, ): - assert check_argument_types() spk_model, spk_train_args = SpeakerTask.build_model_from_file( train_config, model_file, device @@ -51,6 +49,7 @@ def __init__( self.batch_size = batch_size @torch.no_grad() + @typechecked def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: """Inference @@ -62,8 +61,6 @@ def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: """ - assert check_argument_types() - # Input as audio signal if isinstance(speech, np.ndarray): speech = torch.tensor(speech) @@ -112,6 +109,7 @@ def from_pretrained( return Speech2Embedding(**kwargs) +@typechecked def inference( output_dir: str, batch_size: int, @@ -126,7 +124,6 @@ def inference( model_file: Optional[str], model_tag: Optional[str], ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: @@ -137,10 +134,10 @@ def inference( format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) - if ngpu >= 1: - device = "cuda" - else: - device = "cpu" + # if ngpu >= 1: + # device = "cuda" + # else: + # device = "cpu" # 1. Set random-seed set_all_random_seed(seed) diff --git a/espnet2/bin/spk_train.py b/espnet2/bin/spk_train.py index 1145a7a83d8..aa0c39df42a 100755 --- a/espnet2/bin/spk_train.py +++ b/espnet2/bin/spk_train.py @@ -9,9 +9,11 @@ def get_parser(): def main(cmd=None): - r"""Speaker embedding extractor training. Trained model can be used for - speaker verification, open set speaker identification, and also as - embeddings for various other tasks including speaker diarization. + r"""Speaker embedding extractor training. + + Trained model can be used for + speaker verification, open set speaker identification, and also as + embeddings for various other tasks including speaker diarization. 
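On the inference side, the `Speech2Embedding` wrapper updated just above exposes such a trained model programmatically; a hedged usage sketch (the checkpoint paths are hypothetical):

    import numpy as np

    from espnet2.bin.spk_inference import Speech2Embedding

    # Hypothetical paths; any config/checkpoint pair produced by spk_train works.
    speech2embedding = Speech2Embedding(
        train_config="exp/spk_train/config.yaml",
        model_file="exp/spk_train/valid.eer.best.pth",
    )
    embedding = speech2embedding(np.zeros(16000, dtype=np.float32))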
Example: % python spk_train.py --print_config --optim adadelta \ diff --git a/espnet2/bin/st_inference.py b/espnet2/bin/st_inference.py index 98ca5f30e14..750f7dd13a2 100755 --- a/espnet2/bin/st_inference.py +++ b/espnet2/bin/st_inference.py @@ -7,7 +7,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.transducer.beam_search_transducer import BeamSearchTransducer from espnet2.asr.transducer.beam_search_transducer import Hypothesis as TransHypothesis @@ -50,23 +50,24 @@ class Speech2Text: """ + @typechecked def __init__( self, - st_train_config: Union[Path, str] = None, - st_model_file: Union[Path, str] = None, - transducer_conf: dict = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, + st_train_config: Union[Path, str, None] = None, + st_model_file: Union[Path, str, None] = None, + transducer_conf: Optional[dict] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, ngram_scorer: str = "full", - ngram_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, - src_lm_train_config: Union[Path, str] = None, - src_lm_file: Union[Path, str] = None, + ngram_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, + src_lm_train_config: Union[Path, str, None] = None, + src_lm_file: Union[Path, str, None] = None, src_ngram_scorer: str = "full", - src_ngram_file: Union[Path, str] = None, - src_token_type: str = None, - src_bpemodel: str = None, + src_ngram_file: Union[Path, str, None] = None, + src_token_type: Optional[str] = None, + src_bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, @@ -92,7 +93,6 @@ def __init__( hugging_face_decoder: bool = False, hugging_face_decoder_max_length: int = 256, ): - assert check_argument_types() task = STTask if not enh_s2t_task else EnhS2TTask @@ -462,6 +462,7 @@ def __init__( self.ctc_greedy = ctc_greedy @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray] ) -> List[ @@ -475,7 +476,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -612,7 +612,6 @@ def __call__( if self.st_model.use_multidecoder: return (results, asr_results) - assert check_return_type(results) return results @staticmethod @@ -645,6 +644,7 @@ def from_pretrained( return Speech2Text(**kwargs) +@typechecked def inference( output_dir: str, maxlenratio: float, @@ -696,7 +696,6 @@ def inference( hugging_face_decoder: bool, hugging_face_decoder_max_length: int, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: diff --git a/espnet2/bin/st_inference_streaming.py b/espnet2/bin/st_inference_streaming.py index bc5feac5a75..38471def57c 100644 --- a/espnet2/bin/st_inference_streaming.py +++ b/espnet2/bin/st_inference_streaming.py @@ -8,7 +8,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.encoder.contextual_block_conformer_encoder import ( # noqa: H301 ContextualBlockConformerEncoder, @@ -57,14 +57,15 @@ class Speech2TextStreaming: """ + @typechecked def __init__( self, st_train_config: Union[Path, str], - st_model_file: Union[Path, str] = None, - lm_train_config: 
Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + st_model_file: Union[Path, str, None] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, @@ -83,10 +84,9 @@ def __init__( incremental_decode: bool = False, blank_penalty: float = 1.0, hold_n: int = 0, - transducer_conf: dict = None, + transducer_conf: Optional[dict] = None, hugging_face_decoder: bool = False, ): - assert check_argument_types() # 1. Build ST model scorers = {} @@ -385,6 +385,7 @@ def apply_frontend( return feats, feats_lengths, next_states @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray], is_final: bool = True ) -> List[Tuple[Optional[str], List[str], List[int], Hypothesis]]: @@ -396,7 +397,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -455,10 +455,10 @@ def assemble_hyps(self, hyps): text = None results.append((text, token, token_int, hyp)) - assert check_return_type(results) return results +@typechecked def inference( output_dir: str, maxlenratio: float, @@ -497,7 +497,6 @@ def inference( transducer_conf: Optional[dict], hugging_face_decoder: bool, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: diff --git a/espnet2/bin/svs_inference.py b/espnet2/bin/svs_inference.py index eeac36cde67..48ea801a411 100644 --- a/espnet2/bin/svs_inference.py +++ b/espnet2/bin/svs_inference.py @@ -13,7 +13,8 @@ import numpy as np import soundfile as sf import torch -from typeguard import check_argument_types +from packaging.version import parse as V +from typeguard import typechecked from espnet2.fileio.npy_scp import NpyScpWriter from espnet2.gan_svs.vits import VITS @@ -37,10 +38,11 @@ class SingingGenerate: >>> soundfile.write("out.wav", wav.numpy(), svs.fs, "PCM_16") """ + @typechecked def __init__( self, - train_config: Optional[Union[Path, str]], - model_file: Optional[Union[Path, str]] = None, + train_config: Union[Path, str, None], + model_file: Union[Path, str, None] = None, threshold: float = 0.5, minlenratio: float = 0.0, maxlenratio: float = 10.0, @@ -52,8 +54,8 @@ def __init__( speed_control_alpha: float = 1.0, noise_scale: float = 0.667, noise_scale_dur: float = 0.8, - vocoder_config: Union[Path, str] = None, - vocoder_checkpoint: Union[Path, str] = None, + vocoder_config: Union[Path, str, None] = None, + vocoder_checkpoint: Union[Path, str, None] = None, dtype: str = "float32", device: str = "cpu", seed: int = 777, @@ -61,7 +63,6 @@ def __init__( prefer_normalized_feats: bool = False, ): """Initialize SingingGenerate module.""" - assert check_argument_types() # setup model model, train_args = SVSTask.build_model_from_file( @@ -117,25 +118,25 @@ def __init__( self.decode_conf = decode_conf @torch.no_grad() + @typechecked def __call__( self, text: Union[Dict[str, Tuple], torch.Tensor, np.ndarray], - singing: Union[torch.Tensor, np.ndarray] = None, - label: Union[torch.Tensor, np.ndarray] = None, - midi: Union[torch.Tensor, np.ndarray] = None, - duration_phn: Union[torch.Tensor, np.ndarray] = None, - duration_ruled_phn: Union[torch.Tensor, np.ndarray] = None, - duration_syb: Union[torch.Tensor, np.ndarray] = None, - phn_cnt: 
Union[torch.Tensor, np.ndarray] = None, - slur: Union[torch.Tensor, np.ndarray] = None, - pitch: Union[torch.Tensor, np.ndarray] = None, - energy: Union[torch.Tensor, np.ndarray] = None, - spembs: Union[torch.Tensor, np.ndarray] = None, - sids: Union[torch.Tensor, np.ndarray] = None, - lids: Union[torch.Tensor, np.ndarray] = None, + singing: Union[torch.Tensor, np.ndarray, None] = None, + label: Union[torch.Tensor, np.ndarray, None] = None, + midi: Union[torch.Tensor, np.ndarray, None] = None, + duration_phn: Union[torch.Tensor, np.ndarray, None] = None, + duration_ruled_phn: Union[torch.Tensor, np.ndarray, None] = None, + duration_syb: Union[torch.Tensor, np.ndarray, None] = None, + phn_cnt: Union[torch.Tensor, np.ndarray, None] = None, + slur: Union[torch.Tensor, np.ndarray, None] = None, + pitch: Union[torch.Tensor, np.ndarray, None] = None, + energy: Union[torch.Tensor, np.ndarray, None] = None, + spembs: Union[torch.Tensor, np.ndarray, None] = None, + sids: Union[torch.Tensor, np.ndarray, None] = None, + lids: Union[torch.Tensor, np.ndarray, None] = None, decode_conf: Optional[Dict[str, Any]] = None, ): - assert check_argument_types() # check inputs if self.use_sids and sids is None: @@ -307,6 +308,7 @@ def from_pretrained( return SingingGenerate(**kwargs) +@typechecked def inference( output_dir: str, batch_size: int, @@ -328,7 +330,6 @@ def inference( vocoder_tag: Optional[str] = None, ): """Perform SVS model decoding.""" - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: diff --git a/espnet2/bin/tokenize_text.py b/espnet2/bin/tokenize_text.py index d1117ee457b..38c541f12b9 100644 --- a/espnet2/bin/tokenize_text.py +++ b/espnet2/bin/tokenize_text.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import List, Optional -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.build_tokenizer import build_tokenizer from espnet2.text.cleaner import TextCleaner @@ -60,6 +60,7 @@ def field2slice(field: Optional[str]) -> slice: return slic +@typechecked def tokenize( input: str, output: str, @@ -79,7 +80,6 @@ def tokenize( g2p: Optional[str], add_nonsplit_symbol: List[str], ): - assert check_argument_types() logging.basicConfig( level=log_level, @@ -96,7 +96,7 @@ def tokenize( p.parent.mkdir(parents=True, exist_ok=True) fout = p.open("w", encoding="utf-8") - cleaner = TextCleaner(cleaner) + cleaner: TextCleaner = TextCleaner(cleaner) tokenizer = build_tokenizer( token_type=token_type, bpemodel=bpemodel, @@ -110,7 +110,7 @@ def tokenize( counter = Counter() if field is not None: - field = field2slice(field) + field: slice = field2slice(field) for line in fin: line = line.rstrip() diff --git a/espnet2/bin/tts_inference.py b/espnet2/bin/tts_inference.py index da35ec902c4..3f2d9849bc2 100755 --- a/espnet2/bin/tts_inference.py +++ b/espnet2/bin/tts_inference.py @@ -14,7 +14,7 @@ import soundfile as sf import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.npy_scp import NpyScpWriter from espnet2.gan_tts.vits import VITS @@ -63,10 +63,11 @@ class Text2Speech: """ + @typechecked def __init__( self, - train_config: Union[Path, str] = None, - model_file: Union[Path, str] = None, + train_config: Union[Path, str, None] = None, + model_file: Union[Path, str, None] = None, threshold: float = 0.5, minlenratio: float = 0.0, maxlenratio: float = 10.0, @@ -77,8 +78,8 @@ def __init__( 
speed_control_alpha: float = 1.0, noise_scale: float = 0.667, noise_scale_dur: float = 0.8, - vocoder_config: Union[Path, str] = None, - vocoder_file: Union[Path, str] = None, + vocoder_config: Union[Path, str, None] = None, + vocoder_file: Union[Path, str, None] = None, dtype: str = "float32", device: str = "cpu", seed: int = 777, @@ -86,7 +87,6 @@ def __init__( prefer_normalized_feats: bool = False, ): """Initialize Text2Speech module.""" - assert check_argument_types() # setup model model, train_args = TTSTask.build_model_from_file( @@ -145,18 +145,18 @@ def __init__( self.decode_conf = decode_conf @torch.no_grad() + @typechecked def __call__( self, text: Union[str, torch.Tensor, np.ndarray], - speech: Union[torch.Tensor, np.ndarray] = None, - durations: Union[torch.Tensor, np.ndarray] = None, - spembs: Union[torch.Tensor, np.ndarray] = None, - sids: Union[torch.Tensor, np.ndarray] = None, - lids: Union[torch.Tensor, np.ndarray] = None, + speech: Union[torch.Tensor, np.ndarray, None] = None, + durations: Union[torch.Tensor, np.ndarray, None] = None, + spembs: Union[torch.Tensor, np.ndarray, None] = None, + sids: Union[torch.Tensor, np.ndarray, None] = None, + lids: Union[torch.Tensor, np.ndarray, None] = None, decode_conf: Optional[Dict[str, Any]] = None, ) -> Dict[str, torch.Tensor]: """Run text-to-speech.""" - assert check_argument_types() # check inputs if self.use_speech and speech is None: @@ -306,6 +306,7 @@ def from_pretrained( return Text2Speech(**kwargs) +@typechecked def inference( output_dir: str, batch_size: int, @@ -336,7 +337,6 @@ def inference( vocoder_tag: Optional[str], ): """Run text-to-speech inference.""" - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: diff --git a/espnet2/bin/uasr_extract_feature.py b/espnet2/bin/uasr_extract_feature.py index 2bcbbeaf8b4..f6f1555093b 100644 --- a/espnet2/bin/uasr_extract_feature.py +++ b/espnet2/bin/uasr_extract_feature.py @@ -6,7 +6,7 @@ from typing import Optional, Sequence, Tuple, Union from torch.nn.parallel import data_parallel -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.npy_scp import NpyScpWriter from espnet2.tasks.uasr import UASRTask @@ -93,6 +93,7 @@ def get_parser(): return parser +@typechecked def extract_feature( uasr_train_config: Optional[str], uasr_model_file: Optional[str], @@ -107,7 +108,6 @@ def extract_feature( dset: str, log_level: Union[int, str], ): - assert check_argument_types() logging.basicConfig( level=log_level, diff --git a/espnet2/bin/uasr_inference.py b/espnet2/bin/uasr_inference.py index 88102c8d650..405e45c176b 100644 --- a/espnet2/bin/uasr_inference.py +++ b/espnet2/bin/uasr_inference.py @@ -9,7 +9,7 @@ import numpy as np import torch import torch.quantization -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.tasks.lm import LMTask @@ -42,16 +42,17 @@ class Speech2Text: """ + @typechecked def __init__( self, - uasr_train_config: Union[Path, str] = None, - uasr_model_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, + uasr_train_config: Union[Path, str, None] = None, + uasr_model_file: Union[Path, str, None] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, ngram_scorer: str = "full", - ngram_file: Union[Path, str] = None, - token_type: str = 
None, - bpemodel: str = None, + ngram_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", batch_size: int = 1, dtype: str = "float32", @@ -64,7 +65,6 @@ def __init__( quantize_modules: List[str] = ["Linear"], quantize_dtype: str = "qint8", ): - assert check_argument_types() if quantize_uasr_model or quantize_lm: if quantize_dtype == "float16" and torch.__version__ < LooseVersion( @@ -75,7 +75,7 @@ def __init__( "torch version < 1.5.0. Switch to qint8 dtype instead." ) - quantize_modules = set([getattr(torch.nn, q) for q in quantize_modules]) + qconfig_spec = set([getattr(torch.nn, q) for q in quantize_modules]) quantize_dtype = getattr(torch, quantize_dtype) # 1. Build UASR model @@ -91,7 +91,7 @@ def __init__( logging.info("Use quantized uasr model for decoding.") uasr_model = torch.quantization.quantize_dynamic( - uasr_model, qconfig_spec=quantize_modules, dtype=quantize_dtype + uasr_model, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) decoder = UASRPrefixScorer(eos=uasr_model.eos) @@ -109,7 +109,7 @@ def __init__( logging.info("Use quantized lm for decoding.") lm = torch.quantization.quantize_dynamic( - lm, qconfig_spec=quantize_modules, dtype=quantize_dtype + lm, qconfig_spec=qconfig_spec, dtype=quantize_dtype ) scorers["lm"] = lm.lm @@ -200,6 +200,7 @@ def __init__( self.nbest = nbest @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray] ) -> List[Tuple[Optional[str], List[str], List[int], Union[Hypothesis]]]: @@ -211,7 +212,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() # Input as audio signal if isinstance(speech, np.ndarray): @@ -259,7 +259,6 @@ def __call__( text = None results.append((text, token, token_int, hyp)) - assert check_return_type(results) return results @staticmethod @@ -293,6 +292,7 @@ def from_pretrained( return Speech2Text(**kwargs) +@typechecked def inference( output_dir: str, batch_size: int, @@ -323,7 +323,6 @@ def inference( quantize_modules: List[str], quantize_dtype: str, ): - assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: diff --git a/espnet2/bin/uasr_inference_k2.py b/espnet2/bin/uasr_inference_k2.py index 4afc528729c..32a8ca44e6e 100755 --- a/espnet2/bin/uasr_inference_k2.py +++ b/espnet2/bin/uasr_inference_k2.py @@ -9,7 +9,7 @@ import numpy as np import torch import yaml -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.tasks.lm import LMTask @@ -63,15 +63,16 @@ class k2Speech2Text: """ + @typechecked def __init__( self, uasr_train_config: Union[Path, str], decoding_graph: str, - uasr_model_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, + uasr_model_file: Union[Path, str, None] = None, + lm_train_config: Union[Path, str, None] = None, + lm_file: Union[Path, str, None] = None, + token_type: Optional[str] = None, + bpemodel: Optional[str] = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, @@ -101,7 +102,6 @@ def __init__( nbest_batch_size: int = 500, nll_batch_size: int = 100, ): - assert check_argument_types() # 1. 
Build UASR model uasr_model, uasr_train_args = UASRTask.build_model_from_file( @@ -162,6 +162,7 @@ def __init__( self.uasr_model_ignore_id = 0 @torch.no_grad() + @typechecked def __call__( self, speech: Union[torch.Tensor, np.ndarray] ) -> List[Tuple[Optional[str], List[str], List[int], float]]: @@ -173,7 +174,6 @@ def __call__( text, token, token_int, hyp """ - assert check_argument_types() if isinstance(speech, np.ndarray): speech = torch.tensor(speech) @@ -272,7 +272,6 @@ def __call__( text = self.tokenizer.tokens2text(token) results.append((text, token, token_int, score)) - assert check_return_type(results) return results @staticmethod @@ -306,6 +305,7 @@ def from_pretrained( return k2Speech2Text(**kwargs) +@typechecked def inference( output_dir: str, decoding_graph: str, @@ -344,7 +344,6 @@ def inference( k2_config: Optional[str], ): assert is_ctc_decoding, "Currently, only ctc_decoding graph is supported." - assert check_argument_types() if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") diff --git a/espnet2/bin/whisper_export_vocabulary.py b/espnet2/bin/whisper_export_vocabulary.py index f9b5e798313..baa880523fd 100644 --- a/espnet2/bin/whisper_export_vocabulary.py +++ b/espnet2/bin/whisper_export_vocabulary.py @@ -4,8 +4,9 @@ import os import sys from pathlib import Path +from typing import Optional -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.whisper_tokenizer import LANGUAGES_CODE_MAPPING from espnet2.utils.types import str2bool @@ -14,10 +15,11 @@ dirname = os.path.dirname(__file__) +@typechecked def export_vocabulary( output: str, whisper_model: str, - whisper_language: str = "en", + whisper_language: Optional[str] = "en", whisper_task: str = "transcribe", log_level: str = "INFO", add_token_file_name: str = "none", @@ -34,8 +36,6 @@ def export_vocabulary( ) raise e - assert check_argument_types() - logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", diff --git a/espnet2/diar/espnet_model.py b/espnet2/diar/espnet_model.py index 10d9f82bc79..252b11e3a8b 100644 --- a/espnet2/diar/espnet_model.py +++ b/espnet2/diar/espnet_model.py @@ -9,7 +9,7 @@ import torch import torch.nn.functional as F from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.frontend.abs_frontend import AbsFrontend @@ -40,6 +40,7 @@ class ESPnetDiarizationModel(AbsESPnetModel): EEND-EDA: https://arxiv.org/pdf/2005.09921.pdf, https://arxiv.org/pdf/2106.10654.pdf """ + @typechecked def __init__( self, frontend: Optional[AbsFrontend], @@ -52,7 +53,6 @@ def __init__( diar_weight: float = 1.0, attractor_weight: float = 1.0, ): - assert check_argument_types() super().__init__() diff --git a/espnet2/enh/decoder/abs_decoder.py b/espnet2/enh/decoder/abs_decoder.py index 9cb21f6e642..c75e235db9b 100644 --- a/espnet2/enh/decoder/abs_decoder.py +++ b/espnet2/enh/decoder/abs_decoder.py @@ -18,7 +18,9 @@ def forward_streaming(self, input_frame: torch.Tensor): raise NotImplementedError def streaming_merge(self, chunks: torch.Tensor, ilens: torch.tensor = None): - """streaming_merge. It merges the frame-level processed audio chunks + """Stream merge. + + It merges the frame-level processed audio chunks in the streaming *simulation*. It is noted that, in real applications, the processed audio should be sent to the output channel frame by frame. 
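A simplified overlap-add sketch of what such a merge computes (standalone and hypothetical; the real method also takes `ilens` and derives the frame geometry from the decoder):

    import torch


    def merge_streamed_chunks(chunks, frame_size: int, hop_size: int) -> torch.Tensor:
        # Overlap-add frame-level outputs back into one continuous waveform.
        merged = torch.zeros((len(chunks) - 1) * hop_size + frame_size)
        for i, chunk in enumerate(chunks):
            merged[i * hop_size : i * hop_size + frame_size] += chunk
        return merged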
You may refer to this function to manage your streaming output buffer. diff --git a/espnet2/enh/decoder/conv_decoder.py b/espnet2/enh/decoder/conv_decoder.py index 4da83601ec2..2dff150e92a 100755 --- a/espnet2/enh/decoder/conv_decoder.py +++ b/espnet2/enh/decoder/conv_decoder.py @@ -1,5 +1,3 @@ -import math - import torch from espnet2.enh.decoder.abs_decoder import AbsDecoder @@ -41,7 +39,9 @@ def forward_streaming(self, input_frame: torch.Tensor): return self.forward(input_frame, ilens=torch.LongTensor([self.kernel_size]))[0] def streaming_merge(self, chunks: torch.Tensor, ilens: torch.tensor = None): - """streaming_merge. It merges the frame-level processed audio chunks + """Stream merge. + + It merges the frame-level processed audio chunks in the streaming *simulation*. It is noted that, in real applications, the processed audio should be sent to the output channel frame by frame. You may refer to this function to manage your streaming output buffer. diff --git a/espnet2/enh/decoder/stft_decoder.py b/espnet2/enh/decoder/stft_decoder.py index fd652f40cf5..d488c7ba99f 100755 --- a/espnet2/enh/decoder/stft_decoder.py +++ b/espnet2/enh/decoder/stft_decoder.py @@ -110,11 +110,12 @@ def _reset_config(self): def _reconfig_for_fs(self, fs): """Reconfigure iSTFT window and hop lengths for a new sampling rate + while keeping their duration fixed. Args: fs (int): new sampling rate - """ # noqa: H405 + """ assert fs % self.default_fs == 0 or self.default_fs % fs == 0 self.stft.n_fft = self.n_fft * fs // self.default_fs self.stft.win_length = self.win_length * fs // self.default_fs @@ -124,7 +125,7 @@ def _get_window_func(self): window_func = getattr(torch, f"{self.window}_window") window = window_func(self.win_length) n_pad_left = (self.n_fft - window.shape[0]) // 2 - n_pad_right = self.n_fft - window.shape[0] - n_pad_left + n_pad_right = self.n_fft - window.shape[0] - n_pad_left # noqa return window def spec_back(self, spec): diff --git a/espnet2/enh/diffusion/abs_diffusion.py b/espnet2/enh/diffusion/abs_diffusion.py index 43c162e4009..73d9a7553fc 100644 --- a/espnet2/enh/diffusion/abs_diffusion.py +++ b/espnet2/enh/diffusion/abs_diffusion.py @@ -1,6 +1,4 @@ from abc import ABC, abstractmethod -from collections import OrderedDict -from typing import Dict, Optional, Tuple import torch diff --git a/espnet2/enh/diffusion/score_based_diffusion.py b/espnet2/enh/diffusion/score_based_diffusion.py index 98fecc033ee..4ad9901bdb2 100644 --- a/espnet2/enh/diffusion/score_based_diffusion.py +++ b/espnet2/enh/diffusion/score_based_diffusion.py @@ -4,9 +4,6 @@ import math -from abc import ABC, abstractmethod -from collections import OrderedDict -from typing import Dict, Optional, Tuple import torch @@ -39,7 +36,7 @@ class ScoreModel(AbsDiffusion): def __init__(self, **kwargs): super().__init__() - score_model = kwargs["score_model"] + score_model = kwargs["score_model"] # noqa score_model_class = score_choices.get_class(kwargs["score_model"]) self.dnn = score_model_class(**kwargs["score_model_conf"]) self.sde = sde_choices.get_class(kwargs["sde"])(**kwargs["sde_conf"]) diff --git a/espnet2/enh/diffusion/sdes.py b/espnet2/enh/diffusion/sdes.py index e72c2a5406a..037e4fb8190 100644 --- a/espnet2/enh/diffusion/sdes.py +++ b/espnet2/enh/diffusion/sdes.py @@ -1,5 +1,4 @@ -""" -Abstract SDE classes, Reverse SDE, and VE/VP SDEs. +"""Abstract SDE classes, Reverse SDE, and VE/VP SDEs.
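For orientation, these classes implement the standard score-SDE pair from the repository credited below: a forward process and a reverse process whose extra drift term is the score supplied by the trained network,

    $$\mathrm{d}x = f(x, t)\,\mathrm{d}t + g(t)\,\mathrm{d}w, \qquad \mathrm{d}x = \big[f(x, t) - g(t)^2\,\nabla_x \log p_t(x)\big]\,\mathrm{d}t + g(t)\,\mathrm{d}\bar{w},$$

with $\bar{w}$ a reverse-time Wiener process; the `discretize` rule further down (`rev_f = f - G**2 * score_model(...)`) is the discretized form of this reverse equation.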
Taken and adapted from https://github.com/yang-song/score_sde_pytorch @@ -39,13 +38,17 @@ def sde(self, x, t, *args): @abc.abstractmethod def marginal_prob(self, x, t, *args): """Parameters to determine the marginal distribution of - the SDE, $p_t(x|args)$.""" + + the SDE, $p_t(x|args)$. + """ pass @abc.abstractmethod def prior_sampling(self, shape, *args): """Generate one sample from the prior distribution, - $p_T(x|args)$ with shape `shape`.""" + + $p_T(x|args)$ with shape `shape`. + """ pass @abc.abstractmethod @@ -137,7 +140,9 @@ def rsde_parts(self, x, t, *args): def discretize(self, x, t, *args): """Create discretized iteration rules for the reverse - diffusion sampler.""" + + diffusion sampler. + """ f, G = discretize_fn(x, t, *args) rev_f = f - G[:, None, None, None] ** 2 * score_model(x, t, *args) * ( 0.5 if self.probability_flow else 1.0 @@ -238,7 +243,8 @@ def prior_logp(self, z): class OUVPSDE(SDE): def __init__(self, beta_min, beta_max, stiffness=1, N=1000, **ignored_kwargs): - """ + """OUVPSDE class. + !!! SGMSE authors observed instabilities around t=0.2. !!! Construct an Ornstein-Uhlenbeck Variance Preserving SDE: @@ -310,7 +316,9 @@ def prior_logp(self, z): def batch_broadcast(a, x): """Broadcasts a over all dimensions of x, except the batch dimension, - which must match.""" + + which must match. + """ if len(a.shape) != 1: a = a.squeeze() diff --git a/espnet2/enh/diffusion_enh.py b/espnet2/enh/diffusion_enh.py index 9ce9ddbd456..8732d75d178 100644 --- a/espnet2/enh/diffusion_enh.py +++ b/espnet2/enh/diffusion_enh.py @@ -1,21 +1,20 @@ """Enhancement model module.""" -import contextlib -from typing import Dict, List, Optional, OrderedDict, Tuple +from typing import Dict, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.enh.decoder.abs_decoder import AbsDecoder from espnet2.enh.diffusion.abs_diffusion import AbsDiffusion from espnet2.enh.encoder.abs_encoder import AbsEncoder from espnet2.enh.espnet_model import ESPnetEnhancementModel -from espnet2.enh.extractor.abs_extractor import AbsExtractor -from espnet2.enh.loss.criterions.tf_domain import FrequencyDomainLoss -from espnet2.enh.loss.criterions.time_domain import TimeDomainLoss -from espnet2.enh.loss.wrappers.abs_wrapper import AbsLossWrapper +from espnet2.enh.extractor.abs_extractor import AbsExtractor # noqa +from espnet2.enh.loss.criterions.tf_domain import FrequencyDomainLoss # noqa +from espnet2.enh.loss.criterions.time_domain import TimeDomainLoss # noqa +from espnet2.enh.loss.wrappers.abs_wrapper import AbsLossWrapper # noqa from espnet2.torch_utils.device_funcs import force_gatherable -from espnet2.train.abs_espnet_model import AbsESPnetModel +from espnet2.train.abs_espnet_model import AbsESPnetModel # noqa EPS = torch.finfo(torch.get_default_dtype()).eps @@ -23,6 +22,7 @@ class ESPnetDiffusionModel(ESPnetEnhancementModel): """Target Speaker Extraction Frontend model""" + @typechecked def __init__( self, encoder: AbsEncoder, @@ -33,7 +33,6 @@ def __init__( normalize: bool = False, **kwargs, ): - assert check_argument_types() super().__init__( encoder=encoder, @@ -48,7 +47,7 @@ def __init__( self.diffusion = diffusion self.decoder = decoder - # TODO: Extending the model to separation tasks. + # TODO(gituser): Extending the model to separation tasks. 
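        # The diffusion branch models a single target signal, so
        # separation-style configurations are rejected by the check below.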
assert ( num_spk == 1 ), "only enhancement models are supported now, num_spk must be 1" diff --git a/espnet2/enh/encoder/abs_encoder.py b/espnet2/enh/encoder/abs_encoder.py index baa959763f1..0777be82c5d 100644 --- a/espnet2/enh/encoder/abs_encoder.py +++ b/espnet2/enh/encoder/abs_encoder.py @@ -23,7 +23,9 @@ def forward_streaming(self, input: torch.Tensor): raise NotImplementedError def streaming_frame(self, audio: torch.Tensor): - """streaming_frame. It splits the continuous audio into frame-level + """Stream frame. + + It splits the continuous audio into frame-level audio chunks in the streaming *simulation*. It is noted that this function takes the entire long audio as input for a streaming simulation. You may refer to this function to manage your streaming input @@ -34,4 +36,4 @@ def streaming_frame(self, audio: torch.Tensor): Returns: chunked: List [(B, frame_size),] """ - NotImplementedError + raise NotImplementedError diff --git a/espnet2/enh/encoder/conv_encoder.py b/espnet2/enh/encoder/conv_encoder.py index e3e3679653c..45f42432f75 100644 --- a/espnet2/enh/encoder/conv_encoder.py +++ b/espnet2/enh/encoder/conv_encoder.py @@ -1,5 +1,3 @@ -import math - import torch from espnet2.enh.encoder.abs_encoder import AbsEncoder @@ -56,7 +54,9 @@ def forward_streaming(self, input: torch.Tensor): return output def streaming_frame(self, audio: torch.Tensor): - """streaming_frame. It splits the continuous audio into frame-level + """Stream frame. + + It splits the continuous audio into frame-level audio chunks in the streaming *simulation*. It is noted that this function takes the entire long audio as input for a streaming simulation. You may refer to this function to manage your streaming input diff --git a/espnet2/enh/espnet_enh_s2t_model.py b/espnet2/enh/espnet_enh_s2t_model.py index bf26bccc47b..b969347f896 100644 --- a/espnet2/enh/espnet_enh_s2t_model.py +++ b/espnet2/enh/espnet_enh_s2t_model.py @@ -8,7 +8,7 @@ import torch.nn.functional as F from packaging.version import parse as V from scipy.optimize import linear_sum_assignment -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.espnet_model import ESPnetASRModel from espnet2.diar.espnet_model import ESPnetDiarizationModel @@ -29,6 +29,7 @@ def autocast(enabled=True): class ESPnetEnhS2TModel(AbsESPnetModel): """Joint model Enhancement and Speech to Text.""" + @typechecked def __init__( self, enh_model: ESPnetEnhancementModel, @@ -36,7 +37,6 @@ def __init__( calc_enh_loss: bool = True, bypass_enh_prob: float = 0, # 0 means do not bypass enhancement for all data ): - assert check_argument_types() super().__init__() self.enh_model = enh_model @@ -509,12 +509,12 @@ def permutation_invariant_training(self, losses: torch.Tensor): return hyp_perm, torch.stack(min_perm_loss) + @typechecked def inherite_attributes( self, inherite_enh_attrs: List[str] = [], inherite_s2t_attrs: List[str] = [], ): - assert check_argument_types() if len(inherite_enh_attrs) > 0: for attr in inherite_enh_attrs: diff --git a/espnet2/enh/espnet_model.py b/espnet2/enh/espnet_model.py index f42aa2be8b0..61deb4b6e38 100644 --- a/espnet2/enh/espnet_model.py +++ b/espnet2/enh/espnet_model.py @@ -6,7 +6,7 @@ import numpy as np import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.diar.layers.abs_mask import AbsMask from espnet2.enh.decoder.abs_decoder import AbsDecoder @@ -28,6 +28,7 @@ class ESPnetEnhancementModel(AbsESPnetModel): """Speech 
enhancement or separation Frontend model""" + @typechecked def __init__( self, encoder: AbsEncoder, @@ -89,7 +90,6 @@ def __init__( category_weights: list of weights for each category. Used to set loss weights for batches of different categories. """ - assert check_argument_types() super().__init__() diff --git a/espnet2/enh/espnet_model_tse.py b/espnet2/enh/espnet_model_tse.py index d578a01e099..34dedd5bad2 100644 --- a/espnet2/enh/espnet_model_tse.py +++ b/espnet2/enh/espnet_model_tse.py @@ -4,7 +4,7 @@ from typing import Dict, List, Optional, OrderedDict, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.enh.decoder.abs_decoder import AbsDecoder from espnet2.enh.encoder.abs_encoder import AbsEncoder @@ -21,6 +21,7 @@ class ESPnetExtractionModel(AbsESPnetModel): """Target Speaker Extraction Frontend model""" + @typechecked def __init__( self, encoder: AbsEncoder, @@ -32,7 +33,6 @@ def __init__( share_encoder: bool = True, extract_feats_in_collect_stats: bool = False, ): - assert check_argument_types() super().__init__() diff --git a/espnet2/enh/layers/dcunet.py b/espnet2/enh/layers/dcunet.py index af02c787fd3..6bbc8f77b00 100644 --- a/espnet2/enh/layers/dcunet.py +++ b/espnet2/enh/layers/dcunet.py @@ -70,7 +70,9 @@ def forward(self, t): class ComplexLinear(nn.Module): """A potentially complex-valued linear layer. Reduces to a regular linear - layer if `complex_valued=False`.""" + + layer if `complex_valued=False`. + """ def __init__(self, input_dim, output_dim, complex_valued): super().__init__() @@ -108,7 +110,8 @@ def torch_complex_from_reim(re, im): class ArgsComplexMultiplicationWrapper(nn.Module): """Adapted from `asteroid`'s `complex_nn.py`, allowing - args/kwargs to be passed through forward(). + + args/kwargs to be passed through forward(). Make a complex-valued module `F` from a real-valued module `f` by applying complex multiplication rules: @@ -178,6 +181,7 @@ def forward(self, x): def unet_decoder_args(encoders, *, skip_connections): """Get list of decoder arguments for upsampling (right) side of a symmetric u-net, + given the arguments used to construct the encoder. Args: encoders (tuple of length `N` of tuples of @@ -422,8 +426,8 @@ def __init__( raise NotImplementedError( "sorry, mask bounding not implemented at the moment" ) - # TODO we can't use nn.Sequential since the ComplexConvTranspose2d needs a - # second `output_size` argument + # TODO(gituser) we can't use nn.Sequential since the ComplexConvTranspose2d + # needs a second `output_size` argument # operations = (output_layer, complex_nn.BoundComplexMask(self.mask_bound)) # output_layer = nn.Sequential(*[x for x in operations if x is not None]) @@ -433,8 +437,8 @@ def __init__( self.output_layer = output_layer or nn.Identity() def forward(self, spec, t) -> Tensor: - """ - Input shape is expected to be $(batch, nfreqs, time)$, with $nfreqs - 1$ + """Input shape is expected to be $(batch, nfreqs, time)$, with $nfreqs - 1$ + divisible by $f_0 * f_1 * ... * f_N$ where $f_k$ are the frequency strides of the encoders, and $time - 1$ is divisible by $t_0 * t_1 * ... * t_N$ where $t_N$ are the time strides of the encoders. 
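The change repeated across these files is mechanical: typeguard's imperative assert check_argument_types() inside each constructor is replaced by the @typechecked decorator, which validates the call against the function's annotations automatically and keeps working on typeguard 3+, where check_argument_types was removed. A minimal sketch of the before/after pattern, using a hypothetical ExampleFrontend class that is not part of this patch:

    from typeguard import typechecked

    class ExampleFrontend:
        # Before (typeguard 2.x style), the body started with
        #     assert check_argument_types()
        # which inspected the caller's frame to find the annotations.
        # After, the decorator wraps the function and runs the same check
        # against its annotated parameters on every call.
        @typechecked
        def __init__(self, size: int, dropout: float = 0.0):
            self.size = size
            self.dropout = dropout

    ExampleFrontend(size=256)        # passes
    # ExampleFrontend(size="256")    # raises a type-check error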
diff --git a/espnet2/enh/layers/ncsnpp.py b/espnet2/enh/layers/ncsnpp.py index 2bc91c184d7..ef6395ad0b6 100644 --- a/espnet2/enh/layers/ncsnpp.py +++ b/espnet2/enh/layers/ncsnpp.py @@ -16,8 +16,6 @@ # ncsnpp.py and ncsnpp_utils are taken from # https://github.com/sp-uhh/sgmse/ -# pylint: skip-file - import functools import numpy as np @@ -38,7 +36,9 @@ class NCSNpp(nn.Module): """NCSN++ model, adapted from https://github.com/yang-song/score_sde and - https://github.com/sp-uhh/sgmse repository""" + + https://github.com/sp-uhh/sgmse repository + """ def __init__( self, diff --git a/espnet2/enh/layers/ncsnpp_utils/layers.py b/espnet2/enh/layers/ncsnpp_utils/layers.py index 4cad02a7338..25bf6ed3485 100644 --- a/espnet2/enh/layers/ncsnpp_utils/layers.py +++ b/espnet2/enh/layers/ncsnpp_utils/layers.py @@ -13,10 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -# pylint: skip-file -"""Common layers for defining score networks. -""" -import math +"""Common layers for defining score networks.""" + import string from functools import partial diff --git a/espnet2/enh/layers/ncsnpp_utils/up_or_down_sampling.py b/espnet2/enh/layers/ncsnpp_utils/up_or_down_sampling.py index 83f95c7b767..ba19b097195 100644 --- a/espnet2/enh/layers/ncsnpp_utils/up_or_down_sampling.py +++ b/espnet2/enh/layers/ncsnpp_utils/up_or_down_sampling.py @@ -103,10 +103,7 @@ def upsample_conv_2d(x, w, k=None, factor=2, gain=1): # Check weight shape. assert len(w.shape) == 4 - convH = w.shape[2] - convW = w.shape[3] - inC = w.shape[1] - outC = w.shape[0] + _, inC, convH, convW = w.shape assert convW == convH diff --git a/espnet2/enh/separator/tfgridnetv2_separator.py b/espnet2/enh/separator/tfgridnetv2_separator.py index f1b936518a0..ed94a005c78 100644 --- a/espnet2/enh/separator/tfgridnetv2_separator.py +++ b/espnet2/enh/separator/tfgridnetv2_separator.py @@ -17,8 +17,9 @@ class TFGridNetV2(AbsSeparator): """Offline TFGridNetV2. Compared with TFGridNet, TFGridNetV2 speeds up the code - by vectorizing multiple heads in self-attention, and better dealing with - Deconv1D in each intra- and inter-block when emb_ks == emb_hs. + + by vectorizing multiple heads in self-attention, and better dealing with + Deconv1D in each intra- and inter-block when emb_ks == emb_hs. Reference: [1] Z.-Q. Wang, S. Cornell, S. Choi, Y. Lee, B.-Y. Kim, and S. 
Watanabe, diff --git a/espnet2/fileio/datadir_writer.py b/espnet2/fileio/datadir_writer.py index 625c73dbed7..434bc4e90d1 100644 --- a/espnet2/fileio/datadir_writer.py +++ b/espnet2/fileio/datadir_writer.py @@ -2,7 +2,7 @@ from pathlib import Path from typing import Union -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked class DatadirWriter: @@ -18,8 +18,8 @@ class DatadirWriter: """ + @typechecked def __init__(self, p: Union[Path, str]): - assert check_argument_types() self.path = Path(p) self.chilidren = {} self.fd = None @@ -29,8 +29,8 @@ def __init__(self, p: Union[Path, str]): def __enter__(self): return self + @typechecked def __getitem__(self, key: str) -> "DatadirWriter": - assert check_argument_types() if self.fd is not None: raise RuntimeError("This writer points out a file") @@ -40,11 +40,10 @@ def __getitem__(self, key: str) -> "DatadirWriter": self.has_children = True retval = self.chilidren[key] - assert check_return_type(retval) return retval + @typechecked def __setitem__(self, key: str, value: str): - assert check_argument_types() if self.has_children: raise RuntimeError("This writer points out a directory") if key in self.keys: diff --git a/espnet2/fileio/multi_sound_scp.py b/espnet2/fileio/multi_sound_scp.py index a315d1fed50..07bac9f8b91 100644 --- a/espnet2/fileio/multi_sound_scp.py +++ b/espnet2/fileio/multi_sound_scp.py @@ -2,7 +2,7 @@ from typing import Tuple import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import read_multi_columns_text from espnet2.fileio.sound_scp import soundfile_read @@ -32,10 +32,10 @@ class MultiSoundScpReader(collections.abc.Mapping): to the same length. """ + @typechecked def __init__( self, fname, dtype=None, always_2d: bool = False, stack_axis=0, pad=np.nan ): - assert check_argument_types() self.fname = fname self.dtype = dtype self.always_2d = always_2d diff --git a/espnet2/fileio/npy_scp.py b/espnet2/fileio/npy_scp.py index f483076a99f..9ac23ce86c2 100644 --- a/espnet2/fileio/npy_scp.py +++ b/espnet2/fileio/npy_scp.py @@ -3,7 +3,7 @@ from typing import Union import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import read_2columns_text @@ -24,8 +24,8 @@ class NpyScpWriter: """ + @typechecked def __init__(self, outdir: Union[Path, str], scpfile: Union[Path, str]): - assert check_argument_types() self.dir = Path(outdir) self.dir.mkdir(parents=True, exist_ok=True) scpfile = Path(scpfile) @@ -72,8 +72,8 @@ class NpyScpReader(collections.abc.Mapping): """ + @typechecked def __init__(self, fname: Union[Path, str]): - assert check_argument_types() self.fname = Path(fname) self.data = read_2columns_text(fname) diff --git a/espnet2/fileio/rand_gen_dataset.py b/espnet2/fileio/rand_gen_dataset.py index bb92336a6fe..1845ae03d79 100644 --- a/espnet2/fileio/rand_gen_dataset.py +++ b/espnet2/fileio/rand_gen_dataset.py @@ -3,7 +3,7 @@ from typing import Union import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import load_num_sequence_text @@ -23,13 +23,13 @@ class FloatRandomGenerateDataset(collections.abc.Mapping): """ + @typechecked def __init__( self, shape_file: Union[Path, str], dtype: Union[str, np.dtype] = "float32", loader_type: str = "csv_int", ): - assert check_argument_types() shape_file = Path(shape_file) self.utt2shape = load_num_sequence_text(shape_file, 
loader_type) self.dtype = np.dtype(dtype) @@ -60,6 +60,7 @@ class IntRandomGenerateDataset(collections.abc.Mapping): """ + @typechecked def __init__( self, shape_file: Union[Path, str], @@ -68,7 +69,6 @@ def __init__( dtype: Union[str, np.dtype] = "int64", loader_type: str = "csv_int", ): - assert check_argument_types() shape_file = Path(shape_file) self.utt2shape = load_num_sequence_text(shape_file, loader_type) self.dtype = np.dtype(dtype) diff --git a/espnet2/fileio/read_text.py b/espnet2/fileio/read_text.py index 26c32f1afd1..ad49db9869d 100644 --- a/espnet2/fileio/read_text.py +++ b/espnet2/fileio/read_text.py @@ -5,9 +5,10 @@ from random import randint from typing import Dict, List, Optional, Tuple, Union -from typeguard import check_argument_types +from typeguard import typechecked +@typechecked def read_2columns_text(path: Union[Path, str]) -> Dict[str, str]: """Read a text file having 2 columns as dict object. @@ -20,7 +21,6 @@ def read_2columns_text(path: Union[Path, str]) -> Dict[str, str]: {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'} """ - assert check_argument_types() data = {} with Path(path).open("r", encoding="utf-8") as f: @@ -37,6 +37,7 @@ def read_2columns_text(path: Union[Path, str]) -> Dict[str, str]: return data +@typechecked def read_multi_columns_text( path: Union[Path, str], return_unsplit: bool = False ) -> Tuple[Dict[str, List[str]], Optional[Dict[str, str]]]: @@ -55,7 +56,6 @@ def read_multi_columns_text( 'key3': ['/some/path/c1.wav']} """ - assert check_argument_types() data = {} @@ -82,6 +82,7 @@ def read_multi_columns_text( return data, unsplit_data +@typechecked def load_num_sequence_text( path: Union[Path, str], loader_type: str = "csv_int" ) -> Dict[str, List[Union[float, int]]]: @@ -94,7 +95,6 @@ def load_num_sequence_text( >>> d = load_num_sequence_text('text') >>> np.testing.assert_array_equal(d["key1"], np.array([1, 2, 3])) """ - assert check_argument_types() if loader_type == "text_int": delimiter = " " dtype = int @@ -128,7 +128,8 @@ def load_num_sequence_text( return retval -def read_label(path: Union[Path, str]) -> Dict[str, List[Union[float, int]]]: +@typechecked +def read_label(path: Union[Path, str]) -> Dict[str, List[List[Union[str, float, int]]]]: """Read a text file indicating sequences of number Examples: @@ -138,7 +139,6 @@ def read_label(path: Union[Path, str]) -> Dict[str, List[Union[float, int]]]: >>> d = load_num_sequence_text('label') >>> np.testing.assert_array_equal(d["key1"], [0.1, 0.2, "啊"])) """ - assert check_argument_types() label = open(path, "r", encoding="utf-8") retval = {} @@ -180,11 +180,11 @@ class RandomTextReader(collections.abc.Mapping): (text start at bytes 21 and end at bytes 30 (including "\n")) """ + @typechecked def __init__( self, text_and_scp: str, ): - assert check_argument_types() super().__init__() text, text_scp = text_and_scp.split("-") diff --git a/espnet2/fileio/rttm.py b/espnet2/fileio/rttm.py index feec3a82f60..c230e7bd042 100644 --- a/espnet2/fileio/rttm.py +++ b/espnet2/fileio/rttm.py @@ -4,16 +4,16 @@ from typing import Dict, List, Tuple, Union import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked +@typechecked def load_rttm_text(path: Union[Path, str]) -> Dict[str, List[Tuple[str, float, float]]]: """Read a RTTM file Note: only support speaker information now """ - assert check_argument_types() data = {} with Path(path).open("r", encoding="utf-8") as f: for linenum, line in enumerate(f, 1): @@ -65,11 +65,11 @@ class 
RttmReader(collections.abc.Mapping): """ + @typechecked def __init__( self, fname: str, ): - assert check_argument_types() super().__init__() self.fname = fname diff --git a/espnet2/fileio/score_scp.py b/espnet2/fileio/score_scp.py index 4756db2f2c7..3ec255a7e5b 100644 --- a/espnet2/fileio/score_scp.py +++ b/espnet2/fileio/score_scp.py @@ -4,7 +4,7 @@ from typing import Union import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import read_2columns_text @@ -40,12 +40,12 @@ class XMLReader(collections.abc.Mapping): >>> tempo, note_list = reader['key1'] """ + @typechecked def __init__( self, fname, dtype=np.int16, ): - assert check_argument_types() assert m21 is not None, ( "Cannot load music21 package. ", "Please install Muskit modules via ", @@ -141,12 +141,12 @@ class XMLWriter: """ + @typechecked def __init__( self, outdir: Union[Path, str], scpfile: Union[Path, str], ): - assert check_argument_types() self.dir = Path(outdir) self.dir.mkdir(parents=True, exist_ok=True) scpfile = Path(scpfile) @@ -212,13 +212,13 @@ class MIDReader(collections.abc.Mapping): >>> tempo, note_list = reader['key1'] """ + @typechecked def __init__( self, fname, add_rest=True, dtype=np.int16, ): - assert check_argument_types() assert miditoolkit is not None, ( "Cannot load miditoolkit package. ", "Please install Muskit modules via ", @@ -284,12 +284,12 @@ class SingingScoreReader(collections.abc.Mapping): """ + @typechecked def __init__( self, fname, dtype=np.int16, ): - assert check_argument_types() self.fname = fname self.dtype = dtype self.data = read_2columns_text(fname) @@ -331,12 +331,12 @@ class SingingScoreWriter: """ + @typechecked def __init__( self, outdir: Union[Path, str], scpfile: Union[Path, str], ): - assert check_argument_types() self.dir = Path(outdir) self.dir.mkdir(parents=True, exist_ok=True) scpfile = Path(scpfile) diff --git a/espnet2/fileio/sound_scp.py b/espnet2/fileio/sound_scp.py index ff9f06edd59..03335e46383 100644 --- a/espnet2/fileio/sound_scp.py +++ b/espnet2/fileio/sound_scp.py @@ -1,10 +1,10 @@ import collections.abc from pathlib import Path -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np import soundfile -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import read_2columns_text, read_multi_columns_text @@ -115,6 +115,7 @@ class SoundScpReader(collections.abc.Mapping): but it increases the required amount of memory. 
""" + @typechecked def __init__( self, fname, @@ -123,7 +124,6 @@ def __init__( multi_columns: bool = False, concat_axis=1, ): - assert check_argument_types() self.fname = fname self.dtype = dtype self.always_2d = always_2d @@ -197,6 +197,7 @@ class SoundScpWriter: """ + @typechecked def __init__( self, outdir: Union[Path, str], @@ -205,9 +206,8 @@ def __init__( multi_columns: bool = False, output_name_format: str = "{key}.{audio_format}", output_name_format_multi_columns: str = "{key}-CH{channel}.{audio_format}", - subtype: str = None, + subtype: Optional[str] = None, ): - assert check_argument_types() self.dir = Path(outdir) self.dir.mkdir(parents=True, exist_ok=True) scpfile = Path(scpfile) diff --git a/espnet2/fileio/vad_scp.py b/espnet2/fileio/vad_scp.py index 0725bba5ba4..ffb4e1d17bf 100644 --- a/espnet2/fileio/vad_scp.py +++ b/espnet2/fileio/vad_scp.py @@ -3,7 +3,7 @@ from typing import List, Union import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import read_2columns_text @@ -25,12 +25,12 @@ class VADScpReader(collections.abc.Mapping): """ + @typechecked def __init__( self, fname, dtype=np.float32, ): - assert check_argument_types() self.fname = fname self.dtype = dtype self.data = read_2columns_text(fname) @@ -71,12 +71,12 @@ class VADScpWriter: """ + @typechecked def __init__( self, scpfile: Union[Path, str], dtype=None, ): - assert check_argument_types() scpfile = Path(scpfile) scpfile.parent.mkdir(parents=True, exist_ok=True) self.fscp = scpfile.open("w", encoding="utf-8") diff --git a/espnet2/gan_svs/avocodo/avocodo.py b/espnet2/gan_svs/avocodo/avocodo.py index b0836782de5..c54f2214dda 100644 --- a/espnet2/gan_svs/avocodo/avocodo.py +++ b/espnet2/gan_svs/avocodo/avocodo.py @@ -264,8 +264,7 @@ def __init__( ) def forward(self, x): - """ - Forward pass through the CoMBD block. + """Forward pass through the CoMBD block. Args: x (Tensor): Input tensor of shape (B, C_in, T_in). @@ -286,7 +285,9 @@ def forward(self, x): class CoMBD(torch.nn.Module): """CoMBD (Collaborative Multi-band Discriminator) module - from from https://arxiv.org/abs/2206.13404""" + + from from https://arxiv.org/abs/2206.13404 + """ def __init__(self, h, pqmf_list=None, use_spectral_norm=False): super(CoMBD, self).__init__() @@ -366,7 +367,8 @@ def _pqmf_forward(self, ys, ys_hat): return outs_real, outs_fake, f_maps_real, f_maps_fake def forward(self, ys, ys_hat): - """ + """Forward CoMBD. + Args: ys (List[Tensor]): List of ground truth signals of shape (B, 1, T). ys_hat (List[Tensor]): List of predicted signals of shape (B, 1, T). 
diff --git a/espnet2/gan_svs/espnet_model.py b/espnet2/gan_svs/espnet_model.py index 6004ca3687b..850261f1640 100644 --- a/espnet2/gan_svs/espnet_model.py +++ b/espnet2/gan_svs/espnet_model.py @@ -9,7 +9,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.gan_svs.abs_gan_svs import AbsGANSVS from espnet2.layers.abs_normalize import AbsNormalize @@ -34,6 +34,7 @@ def autocast(enabled=True): # NOQA class ESPnetGANSVSModel(AbsGANESPnetModel): """ESPnet model for GAN-based singing voice synthesis task.""" + @typechecked def __init__( self, text_extract: Optional[AbsFeatsExtract], @@ -50,7 +51,6 @@ def __init__( svs: AbsGANSVS, ): """Initialize ESPnetGANSVSModel module.""" - assert check_argument_types() super().__init__() self.text_extract = text_extract self.feats_extract = feats_extract diff --git a/espnet2/gan_svs/joint/joint_score2wav.py b/espnet2/gan_svs/joint/joint_score2wav.py index 097c2da1939..0302e0917c4 100644 --- a/espnet2/gan_svs/joint/joint_score2wav.py +++ b/espnet2/gan_svs/joint/joint_score2wav.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.gan_svs.abs_gan_svs import AbsGANSVS from espnet2.gan_tts.hifigan import ( @@ -61,6 +61,7 @@ class JointScore2Wav(AbsGANSVS): """General class to jointly train score2mel and vocoder parts.""" + @typechecked def __init__( self, # generator (score2mel + vocoder) related @@ -260,7 +261,6 @@ def __init__( cache_generator_outputs (bool): Whether to cache generator outputs. """ - assert check_argument_types() super().__init__() self.segment_size = segment_size self.use_pqmf = use_pqmf diff --git a/espnet2/gan_svs/pits/ying_decoder.py b/espnet2/gan_svs/pits/ying_decoder.py index 5858eba0082..f6b5cdf0485 100644 --- a/espnet2/gan_svs/pits/ying_decoder.py +++ b/espnet2/gan_svs/pits/ying_decoder.py @@ -4,7 +4,7 @@ import espnet2.gan_svs.pits.modules as modules -# TODO (Yifeng): This comment is generated by ChatGPT, which may not be accurate. +# TODO(Yifeng): This comment is generated by ChatGPT, which may not be accurate. class YingDecoder(nn.Module): """Ying decoder module.""" diff --git a/espnet2/gan_svs/uhifigan/sine_generator.py b/espnet2/gan_svs/uhifigan/sine_generator.py index e85d52b508e..29b80d88242 100644 --- a/espnet2/gan_svs/uhifigan/sine_generator.py +++ b/espnet2/gan_svs/uhifigan/sine_generator.py @@ -4,6 +4,7 @@ class SineGen(torch.nn.Module): """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, sine_amp = 0.1, noise_std = 0.003, voiced_threshold = 0, @@ -45,7 +46,9 @@ def _f02uv(self, f0): return uv def _f02sine(self, f0_values): - """f0_values: (batchsize, length, dim) + """F02 sine. + + f0_values: (batchsize, length, dim) where dim indicates fundamental tone and overtones """ # convert to F0 in rad. The interger part n can be ignored @@ -106,7 +109,9 @@ def _f02sine(self, f0_values): return sines def forward(self, f0): - """sine_tensor, uv = forward(f0) + """Forward SineGen. 
+ + sine_tensor, uv = forward(f0) input F0: tensor(batchsize=1, length, dim=1) f0 for unvoiced steps should be 0 output sine_tensor: tensor(batchsize=1, length, dim) diff --git a/espnet2/gan_svs/uhifigan/uhifigan.py b/espnet2/gan_svs/uhifigan/uhifigan.py index 41bf5d4395f..c2ba4db7531 100644 --- a/espnet2/gan_svs/uhifigan/uhifigan.py +++ b/espnet2/gan_svs/uhifigan/uhifigan.py @@ -13,6 +13,7 @@ import numpy as np import torch import torch.nn.functional as F +from typeguard import typechecked try: from parallel_wavegan.layers import CausalConv1d, CausalConvTranspose1d @@ -27,6 +28,7 @@ class UHiFiGANGenerator(torch.nn.Module): """UHiFiGAN generator module.""" + @typechecked def __init__( self, in_channels=80, @@ -132,7 +134,7 @@ def __init__( getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), torch.nn.Dropout(dropout), ) - hidden_channels = channels + for i in range(len(downsample_scales)): for j in range(len(resblock_kernel_sizes)): self.downsamples_mrf += [ diff --git a/espnet2/gan_svs/utils/__init__.py b/espnet2/gan_svs/utils/__init__.py index 76fb6a1194b..16899207066 100644 --- a/espnet2/gan_svs/utils/__init__.py +++ b/espnet2/gan_svs/utils/__init__.py @@ -1 +1 @@ -from espnet2.gan_svs.utils.expand_f0 import expand_f0 +from espnet2.gan_svs.utils.expand_f0 import expand_f0 # noqa diff --git a/espnet2/gan_svs/utils/expand_f0.py b/espnet2/gan_svs/utils/expand_f0.py index dc6e5ed4871..a5905b0f614 100644 --- a/espnet2/gan_svs/utils/expand_f0.py +++ b/espnet2/gan_svs/utils/expand_f0.py @@ -3,9 +3,6 @@ """Function to get random segments.""" -from typing import Optional, Tuple - -import torch import torch.nn.functional as F diff --git a/espnet2/gan_svs/visinger2/ddsp.py b/espnet2/gan_svs/visinger2/ddsp.py index 3b3b3cbb6b1..72e410deb40 100644 --- a/espnet2/gan_svs/visinger2/ddsp.py +++ b/espnet2/gan_svs/visinger2/ddsp.py @@ -97,11 +97,11 @@ def extract_loudness(signal, sampling_rate, block_size, n_fft=2048): return S -# TODO (Yifeng): Some functions are not used here such as crepe, +# TODO(Yifeng): Some functions are not used here such as crepe, # maybe we can remove them later or only import used functions. def extract_pitch(signal, sampling_rate, block_size): length = signal.shape[-1] // block_size - f0 = crepe.predict( + f0 = crepe.predict( # noqa signal, sampling_rate, step_size=int(1000 * block_size / sampling_rate), diff --git a/espnet2/gan_svs/visinger2/visinger2_vocoder.py b/espnet2/gan_svs/visinger2/visinger2_vocoder.py index 75f19cb0ffa..251b2613477 100644 --- a/espnet2/gan_svs/visinger2/visinger2_vocoder.py +++ b/espnet2/gan_svs/visinger2/visinger2_vocoder.py @@ -14,6 +14,7 @@ import numpy as np import torch import torch.nn.functional as F +from typeguard import typechecked from espnet2.gan_svs.visinger2.ddsp import ( remove_above_nyquist, @@ -23,14 +24,13 @@ from espnet2.gan_tts.hifigan import ( HiFiGANMultiPeriodDiscriminator, HiFiGANMultiScaleDiscriminator, - HiFiGANMultiScaleMultiPeriodDiscriminator, - HiFiGANPeriodDiscriminator, - HiFiGANScaleDiscriminator, ) from espnet2.gan_tts.hifigan.residual_block import ResidualBlock class VISinger2VocoderGenerator(torch.nn.Module): + + @typechecked def __init__( self, in_channels: int = 80, @@ -413,7 +413,8 @@ def __init__( self.window = torch.hann_window(self.win_size) def forward(self, x, mask): - """ + """Forward Generator Noise. + Args: x (Tensor): Input tensor (B, hidden_channels, T). mask (Tensor): Mask tensor (B, 1, T). 
@@ -462,8 +463,7 @@ def __init__( divisors=[32, 16, 8, 4, 2, 1, 1], strides=[1, 2, 1, 2, 1, 2, 1], ): - """ - Initialize Multi-Frequency Discriminator module. + """Initialize Multi-Frequency Discriminator module. Args: hop_lengths (list): List of hop lengths. @@ -478,7 +478,7 @@ def __init__( super().__init__() - # TODO (Yifeng): Maybe use LogMelFbank instead of TorchSTFT + # TODO(Yifeng): Maybe use LogMelFbank instead of TorchSTFT self.stfts = torch.nn.ModuleList( [ TorchSTFT( @@ -511,8 +511,7 @@ def __init__( ) def forward(self, x): - """ - Forward pass of Multi-Frequency Discriminator module. + """Forward pass of Multi-Frequency Discriminator module. Args: x (Tensor): Input tensor (B, 1, T * hop_size). @@ -542,7 +541,8 @@ def __init__( divisors=[32, 16, 8, 4, 2, 1, 1], strides=[1, 2, 1, 2, 1, 2, 1], ): - """ + """Base Frequency Discriminator + Args: in_channels (int): Number of input channels. hidden_channels (int, optional): Number of channels in hidden layers. @@ -653,8 +653,7 @@ def __init__( "strides": [1, 2, 1, 2, 1, 2, 1], }, ): - """ - Discriminator module for VISinger2, including MSD, MPD, and MFD. + """Discriminator module for VISinger2, including MSD, MPD, and MFD. Args: scales (int): Number of scales to be used in the multi-scale discriminator. @@ -874,6 +873,7 @@ def complex(self, x): class MelScale(torch.nn.Module): """Turn a normal STFT into a mel frequency STFT, using a conversion + matrix. This uses triangular filter banks. User can control which device the filter bank (fb) is (e.g. fb.to(spec_f.device)). Args: @@ -918,7 +918,8 @@ def __init__( self.register_buffer("fb", fb) def forward(self, specgram: torch.Tensor) -> torch.Tensor: - """ + """Forward MelScale + Args: specgram (Tensor): A spectrogram STFT of dimension (..., freq, time). Returns: @@ -956,6 +957,7 @@ def create_fb_matrix( norm: Optional[str] = None, ) -> torch.Tensor: """Create a frequency bin conversion matrix. 
+ Args: n_freqs (int): Number of frequencies to highlight/apply f_min (float): Minimum frequency (Hz) diff --git a/espnet2/gan_svs/vits/generator.py b/espnet2/gan_svs/vits/generator.py index afaba2dd76b..979beabfdd9 100644 --- a/espnet2/gan_svs/vits/generator.py +++ b/espnet2/gan_svs/vits/generator.py @@ -20,6 +20,7 @@ import numpy as np import torch import torch.nn.functional as F +from typeguard import typechecked from espnet2.gan_svs.avocodo import AvocodoGenerator from espnet2.gan_svs.uhifigan import UHiFiGANGenerator @@ -46,6 +47,7 @@ class VISingerGenerator(torch.nn.Module): """Generator module in VISinger.""" + @typechecked def __init__( self, vocabs: int, @@ -103,7 +105,7 @@ def __init__( vocoder_generator_type: str = "hifigan", fs: int = 22050, hop_length: int = 256, - win_length: int = 1024, + win_length: Optional[int] = 1024, n_fft: int = 1024, use_phoneme_predictor: bool = False, expand_f0_method: str = "repeat", @@ -552,8 +554,10 @@ def forward( predict_dur = predict_dur * self.sample_rate / self.hop_length # LR - decoder_input, mel_len = self.lr(x, gt_dur, use_state_info=True) - decoder_input_pitch, mel_len = self.lr(x_pitch, gt_dur, use_state_info=True) + decoder_input, mel_len = self.lr(x, gt_dur, use_state_info=True) # noqa + decoder_input_pitch, mel_len = self.lr( # noqa + x_pitch, gt_dur, use_state_info=True + ) # noqa LF0 = 2595.0 * torch.log10(1.0 + pitch / 700.0) LF0 = LF0 / 500 @@ -644,7 +648,7 @@ def forward( -1, pitch_segments_expended.shape[-1], 1 ) - sine_waves, uv, noise = self.sine_generator(pitch_segments_expended) + sine_waves, uv, noise = self.sine_generator(pitch_segments_expended) # noqa sine_waves = sine_waves.transpose(1, 2) @@ -666,7 +670,6 @@ def forward( decoder_condition = self.sin_prenet(sin) # dsp based HiFiGAN vocoder - F0_slice = get_segments(pitch, z_start_idxs, self.segment_size) dsp_slice = get_segments( dsp_o, z_start_idxs * self.hop_length, @@ -780,7 +783,9 @@ def inference( if use_teacher_forcing: # forward posterior encoder - z, m_q, logs_q, y_mask = self.posterior_encoder(feats, feats_lengths, g=g) + z, m_q, logs_q, y_mask = self.posterior_encoder( # noqa + feats, feats_lengths, g=g + ) # noqa # forward flow if self.use_flow: @@ -796,7 +801,7 @@ def inference( pitch_segments_expended = pitch_segments_expended.reshape( -1, pitch_segments_expended.shape[-1], 1 ) - sine_waves, uv, noise = self.sine_generator(pitch_segments_expended) + sine_waves, _, _ = self.sine_generator(pitch_segments_expended) sine_waves = sine_waves.transpose(1, 2) wav = self.decoder( (z * y_mask)[:, :, :max_len], excitation=sine_waves, g=g @@ -813,7 +818,7 @@ def inference( harm_x = self.dec_harm(pitch, z, y_mask) # dsp waveform - dsp_o = torch.cat([harm_x, noise_x], axis=1) + dsp_o = torch.cat([harm_x, noise_x], axis=1) # noqa # decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1) decoder_condition = self.sin_prenet(sin) @@ -837,18 +842,20 @@ def inference( y_lengths = torch.clamp_min(torch.sum(predict_dur, [1]), 1).long() # LR - decoder_input, mel_len = self.lr(x, predict_dur, use_state_info=True) - decoder_input_pitch, mel_len = self.lr( + decoder_input, mel_len = self.lr( + x, predict_dur, use_state_info=True + ) # noqa + decoder_input_pitch, mel_len = self.lr( # noqa x_pitch, predict_dur, use_state_info=True - ) + ) # noqa # aam - predict_lf0, predict_bn_mask = self.f0_decoder( + predict_lf0, predict_bn_mask = self.f0_decoder( # noqa decoder_input + decoder_input_pitch, y_lengths, g=g - ) + ) # noqa if self.generator_type == "visinger2": - predict_mel, 
predict_bn_mask = self.mel_decoder( + predict_mel, predict_bn_mask = self.mel_decoder( # noqa decoder_input + self.f0_prenet(predict_lf0), y_lengths, g=g, @@ -911,7 +918,7 @@ def inference( harm_x = self.dec_harm(F0, z, y_mask) # dsp waveform - dsp_o = torch.cat([harm_x, noise_x], axis=1) + dsp_o = torch.cat([harm_x, noise_x], axis=1) # noqa # decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1) decoder_condition = self.sin_prenet(sin) diff --git a/espnet2/gan_svs/vits/phoneme_predictor.py b/espnet2/gan_svs/vits/phoneme_predictor.py index 31d61f8672a..300986480a2 100644 --- a/espnet2/gan_svs/vits/phoneme_predictor.py +++ b/espnet2/gan_svs/vits/phoneme_predictor.py @@ -2,15 +2,15 @@ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) import torch +from typeguard import typechecked from espnet.nets.pytorch_backend.conformer.encoder import Encoder class PhonemePredictor(torch.nn.Module): - """ - Phoneme Predictor module in VISinger. - """ + """Phoneme Predictor module in VISinger.""" + @typechecked def __init__( self, vocabs: int, @@ -32,8 +32,7 @@ def __init__( positional_dropout_rate: float = 0.0, attention_dropout_rate: float = 0.0, ): - """ - Initialize PhonemePredictor module. + """Initialize PhonemePredictor module. Args: vocabs (int): The number of vocabulary. @@ -82,8 +81,7 @@ def __init__( self.linear1 = torch.nn.Linear(hidden_channels, vocabs) def forward(self, x, x_mask): - """ - Perform forward propagation. + """Perform forward propagation. Args: x (Tensor): The input tensor of shape (B, dim, length). diff --git a/espnet2/gan_svs/vits/pitch_predictor.py b/espnet2/gan_svs/vits/pitch_predictor.py index ee7ec10a4cd..e1dcba36575 100644 --- a/espnet2/gan_svs/vits/pitch_predictor.py +++ b/espnet2/gan_svs/vits/pitch_predictor.py @@ -2,6 +2,7 @@ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) import torch +from typeguard import typechecked from espnet.nets.pytorch_backend.conformer.encoder import Encoder from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask @@ -10,6 +11,7 @@ class Decoder(torch.nn.Module): """Pitch or Mel decoder module in VISinger 2.""" + @typechecked def __init__( self, out_channels: int = 192, @@ -31,7 +33,8 @@ def __init__( attention_dropout_rate: float = 0.0, global_channels: int = -1, ): - """ + """Initialize Decoder in VISinger 2. + Args: out_channels (int): The output dimension of the module. attention_dim (int): The dimension of the attention mechanism. @@ -85,8 +88,7 @@ def __init__( self.global_conv = torch.nn.Conv1d(global_channels, attention_dim, 1) def forward(self, x, x_lengths, g=None): - """ - Forward pass of the Decoder. + """Forward pass of the Decoder. Args: x (Tensor): Input tensor (B, 2 + attention_dim, T). diff --git a/espnet2/gan_svs/vits/prior_decoder.py b/espnet2/gan_svs/vits/prior_decoder.py index b6b11421449..d2885da1b81 100644 --- a/espnet2/gan_svs/vits/prior_decoder.py +++ b/espnet2/gan_svs/vits/prior_decoder.py @@ -2,12 +2,14 @@ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) import torch +from typeguard import typechecked from espnet.nets.pytorch_backend.conformer.encoder import Encoder from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask class PriorDecoder(torch.nn.Module): + @typechecked def __init__( self, out_channels: int = 192 * 2, @@ -29,8 +31,7 @@ def __init__( attention_dropout_rate: float = 0.0, global_channels: int = 0, ): - """ - Initialize prior decoder module. + """Initialize prior decoder module. Args: out_channels (int): Output channels of the prior decoder. 
Defaults to 384. @@ -89,8 +90,7 @@ def __init__( self.conv = torch.nn.Conv1d(global_channels, attention_dim, 1) def forward(self, x, x_lengths, g=None): - """ - Forward pass of the PriorDecoder module. + """Forward pass of the PriorDecoder module. Args: x (Tensor): Input tensor (B, attention_dim + 2, T). diff --git a/espnet2/gan_svs/vits/vits.py b/espnet2/gan_svs/vits/vits.py index fb9f4b45991..063f1726337 100644 --- a/espnet2/gan_svs/vits/vits.py +++ b/espnet2/gan_svs/vits/vits.py @@ -10,7 +10,7 @@ import torch from torch.nn import functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.gan_svs.abs_gan_svs import AbsGANSVS from espnet2.gan_svs.avocodo.avocodo import ( @@ -77,6 +77,7 @@ class VITS(AbsGANSVS): """ + @typechecked def __init__( self, # generator related @@ -314,7 +315,6 @@ def __init__( cache_generator_outputs (bool): Whether to cache generator outputs. """ - assert check_argument_types() super().__init__() # define modules diff --git a/espnet2/gan_tts/espnet_model.py b/espnet2/gan_tts/espnet_model.py index 5a339aace4d..5ced908adcc 100644 --- a/espnet2/gan_tts/espnet_model.py +++ b/espnet2/gan_tts/espnet_model.py @@ -8,7 +8,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.gan_tts.abs_gan_tts import AbsGANTTS from espnet2.layers.abs_normalize import AbsNormalize @@ -28,6 +28,7 @@ def autocast(enabled=True): # NOQA class ESPnetGANTTSModel(AbsGANESPnetModel): """ESPnet model for GAN-based text-to-speech task.""" + @typechecked def __init__( self, feats_extract: Optional[AbsFeatsExtract], @@ -39,7 +40,6 @@ def __init__( tts: AbsGANTTS, ): """Initialize ESPnetGANTTSModel module.""" - assert check_argument_types() super().__init__() self.feats_extract = feats_extract self.normalize = normalize diff --git a/espnet2/gan_tts/jets/jets.py b/espnet2/gan_tts/jets/jets.py index e59490aad30..55940b9a815 100644 --- a/espnet2/gan_tts/jets/jets.py +++ b/espnet2/gan_tts/jets/jets.py @@ -6,7 +6,7 @@ from typing import Any, Dict, Optional import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.gan_tts.abs_gan_tts import AbsGANTTS from espnet2.gan_tts.hifigan import ( @@ -50,6 +50,7 @@ class JETS(AbsGANTTS): """ + @typechecked def __init__( self, # generator related @@ -243,7 +244,6 @@ def __init__( plot_pred_mos (bool): Whether to plot predicted MOS during the training. mos_pred_tool (str): MOS prediction tool name. """ - assert check_argument_types() super().__init__() # define modules diff --git a/espnet2/gan_tts/jets/loss.py b/espnet2/gan_tts/jets/loss.py index 74bec9d1710..4be10c9db2a 100644 --- a/espnet2/gan_tts/jets/loss.py +++ b/espnet2/gan_tts/jets/loss.py @@ -8,7 +8,7 @@ import numpy as np import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet.nets.pytorch_backend.fastspeech.duration_predictor import ( # noqa: H301 DurationPredictorLoss, @@ -17,6 +17,7 @@ class VarianceLoss(torch.nn.Module): + @typechecked def __init__(self, use_masking: bool = True, use_weighted_masking: bool = False): """Initialize JETS variance loss module. @@ -27,7 +28,6 @@ def __init__(self, use_masking: bool = True, use_weighted_masking: bool = False) calculation. 
""" - assert check_argument_types() super().__init__() assert (use_masking != use_weighted_masking) or not use_masking diff --git a/espnet2/gan_tts/joint/joint_text2wav.py b/espnet2/gan_tts/joint/joint_text2wav.py index b1bc0c07ca3..947f61547c2 100644 --- a/espnet2/gan_tts/joint/joint_text2wav.py +++ b/espnet2/gan_tts/joint/joint_text2wav.py @@ -6,7 +6,7 @@ from typing import Any, Dict import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.gan_tts.abs_gan_tts import AbsGANTTS from espnet2.gan_tts.hifigan import ( @@ -64,6 +64,7 @@ class JointText2Wav(AbsGANTTS): """General class to jointly train text2mel and vocoder parts.""" + @typechecked def __init__( self, # generator (text2mel + vocoder) related @@ -275,7 +276,6 @@ def __init__( cache_generator_outputs (bool): Whether to cache generator outputs. """ - assert check_argument_types() super().__init__() self.segment_size = segment_size self.use_pqmf = use_pqmf diff --git a/espnet2/gan_tts/melgan/pqmf.py b/espnet2/gan_tts/melgan/pqmf.py index 7e504b7dc71..df91b742d1c 100644 --- a/espnet2/gan_tts/melgan/pqmf.py +++ b/espnet2/gan_tts/melgan/pqmf.py @@ -10,7 +10,11 @@ import numpy as np import torch import torch.nn.functional as F -from scipy.signal import kaiser + +try: + from scipy.signal import kaiser +except ImportError: + from scipy.signal.windows import kaiser def design_prototype_filter( diff --git a/espnet2/gan_tts/utils/get_random_segments.py b/espnet2/gan_tts/utils/get_random_segments.py index 9834bf2401a..48af2136159 100644 --- a/espnet2/gan_tts/utils/get_random_segments.py +++ b/espnet2/gan_tts/utils/get_random_segments.py @@ -3,7 +3,7 @@ """Function to get random segments.""" -from typing import Optional, Tuple +from typing import Tuple import torch @@ -25,10 +25,10 @@ def get_random_segments( Tensor: Start index tensor (B,). """ - b, c, t = x.size() + batches = x.shape[0] max_start_idx = x_lengths - segment_size max_start_idx[max_start_idx < 0] = 0 - start_idxs = (torch.rand([b]).to(x.device) * max_start_idx).to( + start_idxs = (torch.rand([batches]).to(x.device) * max_start_idx).to( dtype=torch.long, ) segments = get_segments(x, start_idxs, segment_size) @@ -52,7 +52,7 @@ def get_segments( Tensor: Segmented tensor (B, C, segment_size). """ - b, c, t = x.size() + b, c, _ = x.size() segments = x.new_zeros(b, c, segment_size) for i, start_idx in enumerate(start_idxs): segments[i] = x[i, :, start_idx : start_idx + segment_size] diff --git a/espnet2/gan_tts/vits/vits.py b/espnet2/gan_tts/vits/vits.py index 2c9fa4d444d..f85dc2aabcd 100644 --- a/espnet2/gan_tts/vits/vits.py +++ b/espnet2/gan_tts/vits/vits.py @@ -8,7 +8,7 @@ from typing import Any, Dict, Optional import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.gan_tts.abs_gan_tts import AbsGANTTS from espnet2.gan_tts.hifigan import ( @@ -60,6 +60,7 @@ class VITS(AbsGANTTS): """ + @typechecked def __init__( self, # generator related @@ -217,7 +218,6 @@ def __init__( mos_pred_tool (str): MOS prediction tool name. 
""" - assert check_argument_types() super().__init__() # define modules diff --git a/espnet2/hubert/espnet_model.py b/espnet2/hubert/espnet_model.py index cde1dd4cbb5..de9f1746a5e 100644 --- a/espnet2/hubert/espnet_model.py +++ b/espnet2/hubert/espnet_model.py @@ -12,7 +12,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.frontend.abs_frontend import AbsFrontend @@ -36,6 +36,7 @@ def autocast(enabled=True): class TorchAudioHubertPretrainModel(AbsESPnetModel): """TorchAudio Hubert Pretrain model""" + @typechecked def __init__( self, vocab_size: int, @@ -47,7 +48,6 @@ def __init__( encoder: AbsEncoder, ignore_id: int = -1, ): - assert check_argument_types() super().__init__() self.vocab_size = vocab_size @@ -266,6 +266,7 @@ def _calc_hubert_loss( class HubertPretrainModel(AbsESPnetModel): """Hubert Pretrain model""" + @typechecked def __init__( self, vocab_size: int, @@ -286,7 +287,6 @@ def __init__( pred_nomask_weight: float = 0.0, loss_weights: float = 0.0, ): - assert check_argument_types() super().__init__() # note that eos is the same as sos (equivalent ID) diff --git a/espnet2/iterators/category_iter_factory.py b/espnet2/iterators/category_iter_factory.py index 9f76fd2923f..1649fe61f9a 100644 --- a/espnet2/iterators/category_iter_factory.py +++ b/espnet2/iterators/category_iter_factory.py @@ -1,5 +1,3 @@ -import itertools -import logging import random from functools import partial from typing import Any, Sequence, Union @@ -7,7 +5,7 @@ import numpy as np import torch from torch.utils.data import DataLoader -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.iterators.abs_iter_factory import AbsIterFactory from espnet2.samplers.abs_sampler import AbsSampler @@ -46,6 +44,7 @@ class CategoryIterFactory(AbsIterFactory): """ + @typechecked def __init__( self, dataset, @@ -58,7 +57,6 @@ def __init__( collate_fn=None, pin_memory: bool = False, ): - assert check_argument_types() if not isinstance(batches, AbsSampler): self.sampler = RawSampler(batches) @@ -88,8 +86,6 @@ def build_iter(self, epoch: int, shuffle: bool = None) -> DataLoader: if self.sampler_args["num_batches"] is not None: batches = batches[: self.sampler_args.num_batches] - bs_list = [len(batch) for batch in batches] - if self.sampler_args["distributed"]: world_size = torch.distributed.get_world_size() rank = torch.distributed.get_rank() diff --git a/espnet2/iterators/chunk_iter_factory.py b/espnet2/iterators/chunk_iter_factory.py index 40effae3aa0..c9efefe0eff 100644 --- a/espnet2/iterators/chunk_iter_factory.py +++ b/espnet2/iterators/chunk_iter_factory.py @@ -1,11 +1,12 @@ import logging import re from collections import defaultdict +from copy import deepcopy from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union import numpy as np import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.iterators.abs_iter_factory import AbsIterFactory from espnet2.iterators.sequence_iter_factory import SequenceIterFactory @@ -34,6 +35,7 @@ class ChunkIterFactory(AbsIterFactory): """ + @typechecked def __init__( self, dataset, @@ -51,7 +53,6 @@ def __init__( excluded_key_prefixes: Optional[List[str]] = None, default_fs: Optional[int] = None, ): - assert check_argument_types() assert all(len(x) == 1 for x in batches), "batch-size must be 1" self.per_sample_iter_factory = 
SequenceIterFactory( @@ -102,13 +103,14 @@ def __init__( # - exactly match one of the prefixes in `excluded_key_prefixes` # - have one of the prefixes in `excluded_key_prefixes` and end with numbers if excluded_key_prefixes is None: - excluded_key_prefixes = DEFAULT_EXCLUDED_KEY_PREFIXES + _excluded_key_prefixes = DEFAULT_EXCLUDED_KEY_PREFIXES else: + _excluded_key_prefixes = deepcopy(excluded_key_prefixes) for k in DEFAULT_EXCLUDED_KEY_PREFIXES: - if k not in excluded_key_prefixes: - excluded_key_prefixes.append(k) + if k not in _excluded_key_prefixes: + _excluded_key_prefixes.append(k) self.excluded_key_pattern = ( - "(" + "[0-9]*)|(".join(excluded_key_prefixes) + "[0-9]*)" + "(" + "[0-9]*)|(".join(_excluded_key_prefixes) + "[0-9]*)" ) if self.excluded_key_pattern: logging.info( diff --git a/espnet2/iterators/multiple_iter_factory.py b/espnet2/iterators/multiple_iter_factory.py index 29f174df9b8..8c6466fb72c 100644 --- a/espnet2/iterators/multiple_iter_factory.py +++ b/espnet2/iterators/multiple_iter_factory.py @@ -2,19 +2,19 @@ from typing import Callable, Collection, Iterator import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.iterators.abs_iter_factory import AbsIterFactory class MultipleIterFactory(AbsIterFactory): + @typechecked def __init__( self, build_funcs: Collection[Callable[[], AbsIterFactory]], seed: int = 0, shuffle: bool = False, ): - assert check_argument_types() self.build_funcs = list(build_funcs) self.seed = seed self.shuffle = shuffle diff --git a/espnet2/iterators/sequence_iter_factory.py b/espnet2/iterators/sequence_iter_factory.py index bd186a332a9..14fbcfc415b 100644 --- a/espnet2/iterators/sequence_iter_factory.py +++ b/espnet2/iterators/sequence_iter_factory.py @@ -1,11 +1,11 @@ import itertools import random from functools import partial -from typing import Any, Sequence, Union +from typing import Any, Optional, Sequence, Union import numpy as np from torch.utils.data import DataLoader -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.iterators.abs_iter_factory import AbsIterFactory from espnet2.samplers.abs_sampler import AbsSampler @@ -43,11 +43,12 @@ class SequenceIterFactory(AbsIterFactory): """ + @typechecked def __init__( self, dataset, batches: Union[AbsSampler, Sequence[Sequence[Any]]], - num_iters_per_epoch: int = None, + num_iters_per_epoch: Optional[int] = None, seed: int = 0, shuffle: bool = False, shuffle_within_batch: bool = False, @@ -55,7 +56,6 @@ def __init__( collate_fn=None, pin_memory: bool = False, ): - assert check_argument_types() if not isinstance(batches, AbsSampler): self.sampler = RawSampler(batches) diff --git a/espnet2/layers/create_adapter.py b/espnet2/layers/create_adapter.py index d1fbc7a0f52..a5b892c18e1 100644 --- a/espnet2/layers/create_adapter.py +++ b/espnet2/layers/create_adapter.py @@ -8,13 +8,10 @@ """ -from typing import List - import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.create_adapter_fn import create_houlsby_adapter, create_lora_adapter -from espnet2.train.class_choices import ClassChoices create_adapter_fn_table = { "lora": create_lora_adapter, @@ -22,6 +19,7 @@ } +@typechecked def create_adapter( model: torch.nn.Module, adapter: str, @@ -37,7 +35,6 @@ def create_adapter( e.g. {"rank": 8, "alpha": 8, ...} for lora """ - assert check_argument_types() assert adapter in create_adapter_fn_table, f"Adapter {adapter} is not supported." 
create_adapter_fn = create_adapter_fn_table[adapter] create_adapter_fn(model=model, **adapter_conf) diff --git a/espnet2/layers/create_adapter_fn.py b/espnet2/layers/create_adapter_fn.py index e75bf6754d8..165f4853bd8 100644 --- a/espnet2/layers/create_adapter_fn.py +++ b/espnet2/layers/create_adapter_fn.py @@ -1,7 +1,7 @@ -from typing import List +from typing import List, Optional import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.s3prl import S3prlFrontend from espnet2.layers.create_adapter_utils import ( @@ -24,7 +24,7 @@ is_transformers_available = False try: - import s3prl + import s3prl # noqa from s3prl.upstream.wav2vec2.wav2vec2_model import TransformerSentenceEncoderLayer is_s3prl_available = True @@ -39,6 +39,7 @@ is_lora_available = False +@typechecked def create_houlsby_adapter( model: torch.nn.Module, bottleneck: int = 32, @@ -55,7 +56,6 @@ def create_houlsby_adapter( "Error: S3PRL is not properly installed." "Please install S3PRL: cd ${MAIN_ROOT}/tools && make s3prl.done" ) - assert check_argument_types() assert hasattr(model, "frontend") and isinstance( model.frontend, S3prlFrontend ), "Only support S3PRL frontend now !!" @@ -82,13 +82,14 @@ def create_houlsby_adapter( raise ValueError(f"Target layers {target_layers} not found in the base model.") +@typechecked def create_lora_adapter( model: torch.nn.Module, rank: int = 8, alpha: int = 8, dropout_rate: float = 0.0, target_modules: List[str] = ["query"], - bias_type: str = "none", + bias_type: Optional[str] = "none", ): """Create LoRA adapter for the base model. @@ -111,7 +112,6 @@ def create_lora_adapter( """ - assert check_argument_types() if not is_lora_available: raise ImportError( "Requiring loralib. Install loralib following: " @@ -125,7 +125,7 @@ def create_lora_adapter( if not check_target_module_exists(key, target_modules): continue - # TODO is this a good way to check the target module? + # TODO(gituser) is this a good way to check the target module? # check_target_module_exists needs only one of the target modules # to be in the key, but what if one key exists and another doesn't? # Should this case raise an error? @@ -151,13 +151,14 @@ def create_lora_adapter( model.eval() +@typechecked def create_new_houlsby_module(target_module: torch.nn.Module, bottleneck: int): - """Create a new houlsby adapter module for the given target module\n. + """Create a new houlsby adapter module for the given target module. 
+ Currently, only support: Wav2Vec2EncoderLayerStableLayerNorm & TransformerSentenceEncoderLayer """ - assert check_argument_types() if isinstance(target_module, Wav2Vec2EncoderLayerStableLayerNorm): input_size = target_module.layer_norm.normalized_shape[0] @@ -219,11 +220,11 @@ def create_new_houlsby_module(target_module: torch.nn.Module, bottleneck: int): return adapter_added_layer +@typechecked def create_new_lora_module( target_module: torch.nn.Module, rank: int, alpha: int, dropout_rate: float ): """Create a new lora module for the given target module.""" - assert check_argument_types() bias = hasattr(target_module, "bias") and target_module.bias is not None if isinstance(target_module, torch.nn.Embedding): diff --git a/espnet2/layers/create_adapter_utils.py b/espnet2/layers/create_adapter_utils.py index 71c31db741a..22b929a4b81 100644 --- a/espnet2/layers/create_adapter_utils.py +++ b/espnet2/layers/create_adapter_utils.py @@ -1,9 +1,10 @@ from typing import List import torch -from typeguard import check_argument_types +from typeguard import typechecked +@typechecked def replace_module( parent_module: torch.nn.Module, child_name: str, @@ -11,8 +12,7 @@ def replace_module( new_module: torch.nn.Module, ): """Replace the target module with the new module.""" - assert check_argument_types() - # TODO add hook and whether requires_grad to them + # TODO(gituser) add hook and whether requires_grad to them device = old_module.weight.device setattr(parent_module, child_name, new_module) @@ -25,15 +25,15 @@ def replace_module( new_module.to(device) +@typechecked def check_target_module_exists(key: str, target_modules: List[str]): """Check if the target_modules matchs the given key.""" - assert check_argument_types() return any([key.endswith(target_key) for target_key in target_modules]) +@typechecked def get_submodules(model: torch.nn.Module, key: str): """Return the submodules of the given key.""" - assert check_argument_types() parent_module = model.get_submodule(".".join(key.split(".")[:-1])) target_name = key.split(".")[-1] target_module = model.get_submodule(key) diff --git a/espnet2/layers/global_mvn.py b/espnet2/layers/global_mvn.py index c77b7b557a1..27fe77f36ee 100644 --- a/espnet2/layers/global_mvn.py +++ b/espnet2/layers/global_mvn.py @@ -3,7 +3,7 @@ import numpy as np import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.abs_normalize import AbsNormalize from espnet2.layers.inversible_interface import InversibleInterface @@ -22,6 +22,7 @@ class GlobalMVN(AbsNormalize, InversibleInterface): eps: """ + @typechecked def __init__( self, stats_file: Union[Path, str], @@ -29,7 +30,6 @@ def __init__( norm_vars: bool = True, eps: float = 1.0e-20, ): - assert check_argument_types() super().__init__() self.norm_means = norm_means self.norm_vars = norm_vars diff --git a/espnet2/layers/houlsby_adapter_layer.py b/espnet2/layers/houlsby_adapter_layer.py index c9231051d63..b02a6c44c85 100644 --- a/espnet2/layers/houlsby_adapter_layer.py +++ b/espnet2/layers/houlsby_adapter_layer.py @@ -2,7 +2,7 @@ import torch.nn as nn try: - import s3prl + import s3prl # noqa from s3prl.upstream.wav2vec2.wav2vec2_model import TransformerSentenceEncoderLayer is_s3prl_available = True @@ -33,8 +33,8 @@ def forward(self, x): else: class HoulsbyTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer): - """ - Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained + """Implements a Transformer Encoder Layer used in BERT/XLM style 
pre-trained + models. """ @@ -59,8 +59,8 @@ def forward( need_weights: bool = False, att_args=None, ): - """ - LayerNorm is applied either before or after the self-attention/ffn + """LayerNorm is applied either before or after the self-attention/ffn + modules similar to the original Transformer imlementation. """ residual = x diff --git a/espnet2/layers/label_aggregation.py b/espnet2/layers/label_aggregation.py index 402b33af145..7d44ae2e2e5 100644 --- a/espnet2/layers/label_aggregation.py +++ b/espnet2/layers/label_aggregation.py @@ -1,19 +1,19 @@ from typing import Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet.nets.pytorch_backend.nets_utils import make_pad_mask class LabelAggregate(torch.nn.Module): + @typechecked def __init__( self, win_length: int = 512, hop_length: int = 128, center: bool = True, ): - assert check_argument_types() super().__init__() self.win_length = win_length diff --git a/espnet2/layers/mask_along_axis.py b/espnet2/layers/mask_along_axis.py index 96bd269113d..297b0b1e155 100644 --- a/espnet2/layers/mask_along_axis.py +++ b/espnet2/layers/mask_along_axis.py @@ -2,7 +2,7 @@ from typing import Sequence, Union import torch -from typeguard import check_argument_types +from typeguard import typechecked def mask_along_axis( @@ -69,6 +69,7 @@ def mask_along_axis( class MaskAlongAxis(torch.nn.Module): + @typechecked def __init__( self, mask_width_range: Union[int, Sequence[int]] = (0, 30), @@ -76,7 +77,6 @@ def __init__( dim: Union[int, str] = "time", replace_with_zero: bool = True, ): - assert check_argument_types() if isinstance(mask_width_range, int): mask_width_range = (0, mask_width_range) if len(mask_width_range) != 2: @@ -136,6 +136,7 @@ class MaskAlongAxisVariableMaxWidth(torch.nn.Module): max_width = max_width_ratio * seq_len """ + @typechecked def __init__( self, mask_width_ratio_range: Union[float, Sequence[float]] = (0.0, 0.05), @@ -143,7 +144,6 @@ def __init__( dim: Union[int, str] = "time", replace_with_zero: bool = True, ): - assert check_argument_types() if isinstance(mask_width_ratio_range, float): mask_width_ratio_range = (0.0, mask_width_ratio_range) if len(mask_width_ratio_range) != 2: diff --git a/espnet2/layers/sinc_conv.py b/espnet2/layers/sinc_conv.py index a31683474b4..195e6029d56 100644 --- a/espnet2/layers/sinc_conv.py +++ b/espnet2/layers/sinc_conv.py @@ -7,7 +7,7 @@ from typing import Union import torch -from typeguard import check_argument_types +from typeguard import typechecked class LogCompression(torch.nn.Module): @@ -48,6 +48,7 @@ class SincConv(torch.nn.Module): and not on the input values, which is different to traditional ASR. """ + @typechecked def __init__( self, in_channels: int, @@ -72,7 +73,6 @@ def __init__( window_func: Window function on the filter, one of ["hamming", "none"]. fs (str, int, float): Sample rate of the input data """ - assert check_argument_types() super().__init__() window_funcs = { "none": self.none_window, @@ -198,6 +198,7 @@ def invert(x): return 700.0 * (torch.exp(torch.div(x, 1125.0)) - 1.0) @classmethod + @typechecked def bank(cls, channels: int, fs: float) -> torch.Tensor: """Obtain initialization values for the mel scale. @@ -209,7 +210,6 @@ def bank(cls, channels: int, fs: float) -> torch.Tensor: torch.Tensor: Filter start frequencíes. torch.Tensor: Filter stop frequencies. 
""" - assert check_argument_types() # min and max bandpass edge frequencies min_frequency = torch.tensor(30.0) max_frequency = torch.tensor(fs * 0.5) @@ -247,6 +247,7 @@ def invert(x): return f * 1000.0 @classmethod + @typechecked def bank(cls, channels: int, fs: float) -> torch.Tensor: """Obtain initialization values for the Bark scale. @@ -258,7 +259,6 @@ def bank(cls, channels: int, fs: float) -> torch.Tensor: torch.Tensor: Filter start frequencíes. torch.Tensor: Filter stop frequencíes. """ - assert check_argument_types() # min and max BARK center frequencies by approximation min_center_frequency = torch.tensor(70.0) max_center_frequency = torch.tensor(fs * 0.45) diff --git a/espnet2/layers/stft.py b/espnet2/layers/stft.py index ed7de887c59..869c96dd29c 100644 --- a/espnet2/layers/stft.py +++ b/espnet2/layers/stft.py @@ -5,7 +5,7 @@ import torch from packaging.version import parse as V from torch_complex.tensor import ComplexTensor -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.enh.layers.complex_utils import to_complex from espnet2.layers.inversible_interface import InversibleInterface @@ -15,17 +15,17 @@ class Stft(torch.nn.Module, InversibleInterface): + @typechecked def __init__( self, n_fft: int = 512, - win_length: int = None, + win_length: Optional[int] = None, hop_length: int = 128, window: Optional[str] = "hann", center: bool = True, normalized: bool = False, onesided: bool = True, ): - assert check_argument_types() super().__init__() self.n_fft = n_fft if win_length is None: diff --git a/espnet2/layers/utterance_mvn.py b/espnet2/layers/utterance_mvn.py index b1d50b7aea6..b8a932947a2 100644 --- a/espnet2/layers/utterance_mvn.py +++ b/espnet2/layers/utterance_mvn.py @@ -1,20 +1,20 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.abs_normalize import AbsNormalize from espnet.nets.pytorch_backend.nets_utils import make_pad_mask class UtteranceMVN(AbsNormalize): + @typechecked def __init__( self, norm_means: bool = True, norm_vars: bool = False, eps: float = 1.0e-20, ): - assert check_argument_types() super().__init__() self.norm_means = norm_means self.norm_vars = norm_vars diff --git a/espnet2/lm/espnet_model.py b/espnet2/lm/espnet_model.py index bbaecb8d8ee..54f100bf30b 100644 --- a/espnet2/lm/espnet_model.py +++ b/espnet2/lm/espnet_model.py @@ -2,7 +2,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.lm.abs_model import AbsLM from espnet2.torch_utils.device_funcs import force_gatherable @@ -11,8 +11,8 @@ class ESPnetLanguageModel(AbsESPnetModel): + @typechecked def __init__(self, lm: AbsLM, vocab_size: int, ignore_id: int = 0): - assert check_argument_types() super().__init__() self.lm = lm self.sos = vocab_size - 1 diff --git a/espnet2/lm/espnet_model_multitask.py b/espnet2/lm/espnet_model_multitask.py index aa964075796..a703cbda093 100644 --- a/espnet2/lm/espnet_model_multitask.py +++ b/espnet2/lm/espnet_model_multitask.py @@ -2,7 +2,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.lm.abs_model import AbsLM from espnet2.torch_utils.device_funcs import force_gatherable @@ -14,6 +14,7 @@ class ESPnetMultitaskLanguageModel(AbsESPnetModel): + @typechecked def __init__( self, lm: AbsLM, @@ -25,7 +26,6 @@ def __init__( sos_syms: List[str] = ["", ""], eos_sym: str = 
"", ): - assert check_argument_types() super().__init__() self.lm = lm self.sos_ids = [token_list.index(t) for t in sos_syms] @@ -50,6 +50,7 @@ def nll( max_length: Optional[int] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """Compute negative log likelihood (nll) + NOTE(yifan): We only use nll to calculate perplexity, so there is no condition in each sentence. diff --git a/espnet2/lm/huggingface_pretrained_opt_lm.py b/espnet2/lm/huggingface_pretrained_opt_lm.py index ba8f301bcc0..5453aa124ff 100644 --- a/espnet2/lm/huggingface_pretrained_opt_lm.py +++ b/espnet2/lm/huggingface_pretrained_opt_lm.py @@ -1,23 +1,22 @@ import copy import logging -import re from typing import Any, List, Tuple import torch import torch.nn as nn -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.lm.abs_model import AbsLM from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask class HuggingfaceOPTModel(AbsLM): + @typechecked def __init__( self, vocab_size: int, opt_name: str, ): - assert check_argument_types() super().__init__() try: from transformers import OPTModel @@ -129,10 +128,8 @@ def batch_score( n_batch = len(ys) n_layers = len(self.decoder.decoder.layers) if states[0] is None: - batch_state = None _use_cache = True else: - batch_state = None _use_cache = False # batch decoding diff --git a/espnet2/lm/seq_rnn_lm.py b/espnet2/lm/seq_rnn_lm.py index 5569248015c..4b378b9abe0 100644 --- a/espnet2/lm/seq_rnn_lm.py +++ b/espnet2/lm/seq_rnn_lm.py @@ -1,10 +1,10 @@ """Sequential implementation of Recurrent Neural Network Language Model.""" -from typing import Tuple, Union +from typing import Optional, Tuple, Union import torch import torch.nn as nn -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.lm.abs_model import AbsLM @@ -17,18 +17,18 @@ class SequentialRNNLM(AbsLM): """ + @typechecked def __init__( self, vocab_size: int, unit: int = 650, - nhid: int = None, + nhid: Optional[int] = None, nlayers: int = 2, dropout_rate: float = 0.0, tie_weights: bool = False, rnn_type: str = "lstm", ignore_id: int = 0, ): - assert check_argument_types() super().__init__() ninp = unit diff --git a/espnet2/main_funcs/average_nbest_models.py b/espnet2/main_funcs/average_nbest_models.py index 18ebb8b296e..a9ce1af941d 100644 --- a/espnet2/main_funcs/average_nbest_models.py +++ b/espnet2/main_funcs/average_nbest_models.py @@ -4,12 +4,13 @@ from typing import Collection, Optional, Sequence, Union import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.train.reporter import Reporter @torch.no_grad() +@typechecked def average_nbest_models( output_dir: Path, reporter: Reporter, @@ -27,7 +28,6 @@ def average_nbest_models( nbest: Number of best model files to be averaged suffix: A suffix added to the averaged model file name """ - assert check_argument_types() if isinstance(nbest, int): nbests = [nbest] else: diff --git a/espnet2/main_funcs/collect_stats.py b/espnet2/main_funcs/collect_stats.py index 0725f4e9a49..567f07f3430 100644 --- a/espnet2/main_funcs/collect_stats.py +++ b/espnet2/main_funcs/collect_stats.py @@ -7,7 +7,7 @@ import torch from torch.nn.parallel import data_parallel from torch.utils.data import DataLoader -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.datadir_writer import DatadirWriter from espnet2.fileio.npy_scp import NpyScpWriter @@ -17,6 +17,7 @@ @torch.no_grad() +@typechecked def collect_stats( model: 
Union[AbsESPnetModel, None], train_iter: DataLoader and Iterable[Tuple[List[str], Dict[str, torch.Tensor]]], @@ -33,7 +34,6 @@ def collect_stats( This method is used before executing train(). """ - assert check_argument_types() npy_scp_writers = {} for itr, mode in zip([train_iter, valid_iter], ["train", "valid"]): diff --git a/espnet2/mt/espnet_model.py b/espnet2/mt/espnet_model.py index 0501ed3bf55..dd5684fe661 100644 --- a/espnet2/mt/espnet_model.py +++ b/espnet2/mt/espnet_model.py @@ -4,7 +4,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet2.asr.encoder.abs_encoder import AbsEncoder @@ -32,6 +32,7 @@ def autocast(enabled=True): class ESPnetMTModel(AbsESPnetModel): """Encoder-Decoder model""" + @typechecked def __init__( self, vocab_size: int, @@ -53,7 +54,6 @@ def __init__( share_decoder_input_output_embed: bool = False, share_encoder_decoder_input_embed: bool = False, ): - assert check_argument_types() super().__init__() # note that eos is the same as sos (equivalent ID) diff --git a/espnet2/mt/frontend/embedding.py b/espnet2/mt/frontend/embedding.py index cdcfa549811..fb905e7d1e2 100644 --- a/espnet2/mt/frontend/embedding.py +++ b/espnet2/mt/frontend/embedding.py @@ -7,7 +7,7 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding @@ -16,6 +16,7 @@ class Embedding(AbsFrontend): """Embedding Frontend for text based inputs.""" + @typechecked def __init__( self, input_size: int = 400, @@ -31,7 +32,6 @@ def __init__( pos_enc_class: PositionalEncoding or ScaledPositionalEncoding positional_dropout_rate: dropout rate after adding positional encoding """ - assert check_argument_types() super().__init__() self.embed_dim = embed_dim # TODO(sdalmia): check for padding idx diff --git a/espnet2/optimizers/sgd.py b/espnet2/optimizers/sgd.py index 3f0d3d1c906..2d62585ef55 100644 --- a/espnet2/optimizers/sgd.py +++ b/espnet2/optimizers/sgd.py @@ -1,5 +1,5 @@ import torch -from typeguard import check_argument_types +from typeguard import typechecked class SGD(torch.optim.SGD): @@ -12,6 +12,7 @@ class SGD(torch.optim.SGD): I can't understand why only SGD.lr doesn't have the default value. """ + @typechecked def __init__( self, params, @@ -21,7 +22,6 @@ def __init__( weight_decay: float = 0.0, nesterov: bool = False, ): - assert check_argument_types() super().__init__( params, lr=lr, diff --git a/espnet2/s2st/aux_attention/abs_aux_attention.py b/espnet2/s2st/aux_attention/abs_aux_attention.py index edf81338c5b..066305ca1ae 100644 --- a/espnet2/s2st/aux_attention/abs_aux_attention.py +++ b/espnet2/s2st/aux_attention/abs_aux_attention.py @@ -7,7 +7,9 @@ class AbsS2STAuxAttention(torch.nn.Module, ABC): """Base class for all S2ST auxiliary attention modules. 
- Refer to https://arxiv.org/abs/2107.08661""" + + Refer to https://arxiv.org/abs/2107.08661 + """ # the name will be the key that appears in the reporter @property diff --git a/espnet2/s2st/aux_attention/multihead.py b/espnet2/s2st/aux_attention/multihead.py index b0a500d4e3b..f26a7bc7d9d 100644 --- a/espnet2/s2st/aux_attention/multihead.py +++ b/espnet2/s2st/aux_attention/multihead.py @@ -1,15 +1,14 @@ import torch -import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.s2st.aux_attention.abs_aux_attention import AbsS2STAuxAttention -from espnet2.utils.types import str2bool from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention class MultiHeadAttention(AbsS2STAuxAttention): """Multihead Attention for S2ST.""" + @typechecked def __init__( self, n_head: int = 4, @@ -17,7 +16,6 @@ def __init__( dropout_rate: float = 0.0, ): super().__init__() - assert check_argument_types() self.attn = MultiHeadedAttention( n_head=n_head, n_feat=n_feat, @@ -32,6 +30,7 @@ def forward( mask: torch.Tensor, ): """Forward. + Args: query (torch.Tensor): Query tensor (#batch, time1, size). key (torch.Tensor): Key tensor (#batch, time2, size). diff --git a/espnet2/s2st/espnet_model.py b/espnet2/s2st/espnet_model.py index 1e6943a8c69..94370ccd181 100644 --- a/espnet2/s2st/espnet_model.py +++ b/espnet2/s2st/espnet_model.py @@ -4,7 +4,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -39,6 +39,7 @@ def autocast(enabled=True): class ESPnetS2STModel(AbsESPnetModel): """ESPnet speech-to-speech translation model""" + @typechecked def __init__( self, s2st_type: str, @@ -72,7 +73,6 @@ def __init__( sym_blank: str = "", extract_feats_in_collect_stats: bool = True, ): - assert check_argument_types() super().__init__() self.sos = tgt_vocab_size - 1 if tgt_vocab_size else None @@ -653,6 +653,7 @@ def forward( loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device) return loss, stats, weight + @typechecked def inference( self, src_speech: torch.Tensor, @@ -670,7 +671,6 @@ def inference( forward_window: int = 3, use_teacher_forcing: bool = False, ) -> Dict[str, torch.Tensor]: - assert check_argument_types() # 0. 
Target feature extract # NOTE(jiatong): only for teaching-forcing in spectrogram @@ -1010,7 +1010,7 @@ def _calc_ctc_loss( ctc = self.st_ctc else: raise RuntimeError( - "Cannot recognize the ctc-type (need 'src'/'tgt', but found ".format( + "Cannot recognize the ctc-type: need 'src'/'tgt', but found {}".format( ctc_type ) ) diff --git a/espnet2/s2st/losses/attention_loss.py b/espnet2/s2st/losses/attention_loss.py index 4d617b1ad34..c52c205173c 100644 --- a/espnet2/s2st/losses/attention_loss.py +++ b/espnet2/s2st/losses/attention_loss.py @@ -1,6 +1,5 @@ import torch -import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.s2st.losses.abs_loss import AbsS2STLoss from espnet2.utils.types import str2bool @@ -12,6 +11,7 @@ class S2STAttentionLoss(AbsS2STLoss): """attention-based label smoothing loss for S2ST.""" + @typechecked def __init__( self, vocab_size: int, @@ -22,7 +22,6 @@ def __init__( criterion: torch.nn.Module = torch.nn.KLDivLoss(reduction="none"), ): super().__init__() - assert check_argument_types() self.weight = weight self.loss = LabelSmoothingLoss( size=vocab_size, @@ -38,6 +37,7 @@ def forward( token_y: torch.Tensor, ): """Forward. + Args: """ if self.weight > 0: diff --git a/espnet2/s2st/losses/ctc_loss.py b/espnet2/s2st/losses/ctc_loss.py index e4f7a0867e9..a01fee2cc6c 100644 --- a/espnet2/s2st/losses/ctc_loss.py +++ b/espnet2/s2st/losses/ctc_loss.py @@ -1,6 +1,4 @@ -import torch -import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.s2st.losses.abs_loss import AbsS2STLoss @@ -8,6 +6,7 @@ class S2STCTCLoss(AbsS2STLoss): """CTC-based loss for S2ST.""" + @typechecked def __init__( self, weight: float = 1.0, @@ -15,7 +14,6 @@ def __init__( # Note(Jiatong): dummy CTC loss, only providing weight # for final loss calculation super().__init__() - assert check_argument_types() self.weight = weight def forward(loss): diff --git a/espnet2/s2st/losses/guided_attention_loss.py b/espnet2/s2st/losses/guided_attention_loss.py index b05016883c5..a4992ed1488 100644 --- a/espnet2/s2st/losses/guided_attention_loss.py +++ b/espnet2/s2st/losses/guided_attention_loss.py @@ -1,16 +1,14 @@ import torch -import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.s2st.losses.abs_loss import AbsS2STLoss -from espnet2.utils.types import str2bool from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import GuidedAttentionLoss -from espnet.nets.pytorch_backend.nets_utils import to_device class S2STGuidedAttentionLoss(AbsS2STLoss): """Tacotron-based loss for S2ST.""" + @typechecked def __init__( self, weight: float = 1.0, @@ -18,7 +16,6 @@ def __init__( alpha: float = 1.0, ): super().__init__() - assert check_argument_types() self.weight = weight self.loss = GuidedAttentionLoss( sigma=sigma, diff --git a/espnet2/s2st/losses/tacotron_loss.py b/espnet2/s2st/losses/tacotron_loss.py index 9ab1bda1714..f7cf6b57ca9 100644 --- a/espnet2/s2st/losses/tacotron_loss.py +++ b/espnet2/s2st/losses/tacotron_loss.py @@ -1,16 +1,15 @@ import torch -import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.s2st.losses.abs_loss import AbsS2STLoss from espnet2.utils.types import str2bool from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import Tacotron2Loss -from espnet.nets.pytorch_backend.nets_utils import to_device class S2STTacotron2Loss(AbsS2STLoss): 
"""Tacotron-based loss for S2ST.""" + @typechecked def __init__( self, weight: float = 1.0, @@ -20,7 +19,6 @@ def __init__( bce_pos_weight: float = 20.0, ): super().__init__() - assert check_argument_types() self.weight = weight self.loss_type = loss_type self.loss = Tacotron2Loss( diff --git a/espnet2/s2st/synthesizer/discrete_synthesizer.py b/espnet2/s2st/synthesizer/discrete_synthesizer.py index 8f9399bc849..66b2d5c9459 100644 --- a/espnet2/s2st/synthesizer/discrete_synthesizer.py +++ b/espnet2/s2st/synthesizer/discrete_synthesizer.py @@ -4,16 +4,14 @@ """Translatotron Synthesizer related modules for ESPnet2.""" -import logging -from typing import Any, Dict, List, Optional, Sequence, Tuple +from typing import Any, List, Optional, Tuple import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.transformer_decoder import TransformerDecoder from espnet2.s2st.synthesizer.abs_synthesizer import AbsSynthesizer -from espnet2.torch_utils.device_funcs import force_gatherable from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask from espnet.nets.scorer_interface import BatchScorerInterface @@ -32,6 +30,7 @@ class TransformerDiscreteSynthesizer(AbsSynthesizer, BatchScorerInterface): """ + @typechecked def __init__( self, # decoder related @@ -83,17 +82,16 @@ def __init__( assume that spembs will be provided as the input. spk_embed_integration_type (str): How to integrate speaker embedding. """ - assert check_argument_types() super().__init__() self.spks = None if spks is not None and spks > 1: self.spks = spks - self.sid_emb = torch.nn.Embedding(spks, encoder_output_size) + self.sid_emb = torch.nn.Embedding(spks, idim) self.langs = None if langs is not None and langs > 1: self.langs = langs - self.lid_emb = torch.nn.Embedding(langs, encoder_output_size) + self.lid_emb = torch.nn.Embedding(langs, idim) self.spk_embed_dim = None if spk_embed_dim is not None and spk_embed_dim > 0: @@ -105,7 +103,7 @@ def __init__( dec_idim = idim + spk_embed_dim elif self.spk_embed_integration_type == "add": dec_idim = idim - self.projection = torch.nn.Linear(self.spk_embed_dim, encoder_output_size) + self.projection = torch.nn.Linear(self.spk_embed_dim, dec_idim) else: raise ValueError(f"{spk_embed_integration_type} is not supported.") @@ -237,8 +235,8 @@ def forward_one_step( tgt: torch.Tensor, tgt_mask: torch.Tensor, memory: torch.Tensor, - *, cache: List[torch.Tensor] = None, + **kwargs, ) -> Tuple[torch.Tensor, List[torch.Tensor]]: """Forward one step. 
@@ -256,13 +254,13 @@ def forward_one_step( # FIXME(jiatong): the spk/lang embedding may be execute too many times # consider add before the search if self.spks is not None: - sid_embs = self.sid_emb(sids.view(-1)) + sid_embs = self.sid_emb(self.spks.view(-1)) memory = memory + sid_embs.unsqueeze(1) if self.langs is not None: - lid_embs = self.lid_emb(lids.view(-1)) + lid_embs = self.lid_emb(self.langs.view(-1)) memory = memory + lid_embs.unsqueeze(1) if self.spk_embed_dim is not None: - memory = self._integrate_with_spk_embed(memory, spembs) + memory = self._integrate_with_spk_embed(memory, self.spk_embed_dim) return self.decoder.forward_one_step(tgt, tgt_mask, memory, cache=cache) diff --git a/espnet2/s2st/synthesizer/translatotron.py b/espnet2/s2st/synthesizer/translatotron.py index b9d8fb0f06d..de755ae20de 100644 --- a/espnet2/s2st/synthesizer/translatotron.py +++ b/espnet2/s2st/synthesizer/translatotron.py @@ -5,19 +5,13 @@ """Translatotron Synthesizer related modules for ESPnet2.""" import logging -from typing import Dict, Optional, Sequence, Tuple +from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.s2st.synthesizer.abs_synthesizer import AbsSynthesizer -from espnet2.torch_utils.device_funcs import force_gatherable -from espnet2.tts.gst.style_encoder import StyleEncoder -from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import ( - GuidedAttentionLoss, - Tacotron2Loss, -) from espnet.nets.pytorch_backend.nets_utils import make_pad_mask from espnet.nets.pytorch_backend.rnn.attentions import ( AttForward, @@ -26,7 +20,6 @@ AttMultiHeadAdd, ) from espnet.nets.pytorch_backend.tacotron2.decoder import Decoder -from espnet.nets.pytorch_backend.tacotron2.encoder import Encoder class Translatotron(AbsSynthesizer): @@ -41,6 +34,7 @@ class Translatotron(AbsSynthesizer): """ + @typechecked def __init__( self, # network structure related @@ -60,7 +54,7 @@ def __init__( postnet_layers: int = 5, postnet_chans: int = 512, postnet_filts: int = 5, - output_activation: str = None, + output_activation: Optional[str] = None, use_batch_norm: bool = True, use_concate: bool = True, use_residual: bool = False, @@ -106,7 +100,6 @@ def __init__( dropout_rate (float): Dropout rate. zoneout_rate (float): Zoneout rate. 
""" - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/s2st/synthesizer/translatotron2.py b/espnet2/s2st/synthesizer/translatotron2.py index bac2170fa49..633a4c6264a 100644 --- a/espnet2/s2st/synthesizer/translatotron2.py +++ b/espnet2/s2st/synthesizer/translatotron2.py @@ -3,30 +3,17 @@ """Translatotron2 related modules for ESPnet2.""" -import logging -from typing import Dict, Optional, Sequence, Tuple +from typing import Optional +import numpy as np import torch import torch.nn.functional as F -from typeguard import check_argument_types +from torch import nn +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from espnet2.s2st.synthesizer.abs_synthesizer import AbsSynthesizer -from espnet2.torch_utils.device_funcs import force_gatherable -from espnet2.torch_utils.initialize import initialize -from espnet2.tts.fastspeech2.loss import FastSpeech2Loss -from espnet2.tts.fastspeech2.variance_predictor import VariancePredictor -from espnet2.tts.gst.style_encoder import StyleEncoder -from espnet.nets.pytorch_backend.conformer.encoder import Encoder as ConformerEncoder -from espnet.nets.pytorch_backend.fastspeech.duration_predictor import DurationPredictor -from espnet.nets.pytorch_backend.fastspeech.length_regulator import LengthRegulator -from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask, make_pad_mask -from espnet.nets.pytorch_backend.tacotron2.decoder import Postnet -from espnet.nets.pytorch_backend.transformer.embedding import ( - PositionalEncoding, - ScaledPositionalEncoding, -) -from espnet.nets.pytorch_backend.transformer.encoder import ( - Encoder as TransformerEncoder, +from espnet.nets.pytorch_backend.fastspeech.duration_predictor import ( + DurationPredictor as FastDurationPredictor, ) @@ -104,7 +91,7 @@ def __init__(self, idim, units=128, num_layers=2, dropout=0.5): ] ) - self.dropout = nn.Dropout(p=dropout_p) + self.dropout = nn.Dropout(p=dropout) self.activation = nn.ReLU() def forward(self, x): @@ -117,21 +104,22 @@ class DurationPredictor(nn.Module): """Non-Attentive Tacotron (NAT) Duration Predictor module.""" def __init__(self, cfg): - super(DurationPredictor, self).__init__() + super(FastDurationPredictor, self).__init__() self.lstm = nn.LSTM( - units, + cfg.units, int(cfg.duration_lstm_dim / 2), 2, batch_first=True, bidirectional=True, ) - self.proj = LinearNorm(cfg.duration_lstm_dim, 1) + self.proj = nn.LinearNorm(cfg.duration_lstm_dim, 1) self.relu = nn.ReLU() def forward(self, encoder_outputs, input_lengths=None): - """ + """Forward Duration Predictor + :param encoder_outputs: [batch_size, hidden_length, encoder_lstm_dim] :param input_lengths: [batch_size, hidden_length] :return: [batch_size, hidden_length] @@ -158,7 +146,8 @@ def forward(self, encoder_outputs, input_lengths=None): class GaussianUpsampling(nn.Module): - """ + """Gaussian Upsample. 
+ Non-attention Tacotron: - https://arxiv.org/abs/2010.04301 this source code is implemenation of the ExpressiveTacotron from BridgetteSong diff --git a/espnet2/s2st/synthesizer/unity_synthesizer.py b/espnet2/s2st/synthesizer/unity_synthesizer.py index 9b45975d69e..8515844b5d6 100644 --- a/espnet2/s2st/synthesizer/unity_synthesizer.py +++ b/espnet2/s2st/synthesizer/unity_synthesizer.py @@ -4,16 +4,15 @@ """Translatotron Synthesizer related modules for ESPnet2.""" -import logging -from typing import Dict, Optional, Sequence, Tuple +from typing import Optional, Tuple import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.transformer_decoder import TransformerDecoder from espnet2.s2st.synthesizer.abs_synthesizer import AbsSynthesizer -from espnet2.torch_utils.device_funcs import force_gatherable +from espnet.nets.pytorch_backend.nets_utils import make_pad_mask from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding @@ -27,6 +26,7 @@ class UnitYSynthesizer(AbsSynthesizer): """ + @typechecked def __init__( self, # decoder related @@ -78,7 +78,6 @@ def __init__( assume that spembs will be provided as the input. spk_embed_integration_type (str): How to integrate speaker embedding. """ - assert check_argument_types() super().__init__() self.spks = None diff --git a/espnet2/s2st/tgt_feats_extract/linear_spectrogram.py b/espnet2/s2st/tgt_feats_extract/linear_spectrogram.py index d2cab1fbbfd..415dfe3d7d2 100644 --- a/espnet2/s2st/tgt_feats_extract/linear_spectrogram.py +++ b/espnet2/s2st/tgt_feats_extract/linear_spectrogram.py @@ -1,7 +1,7 @@ from typing import Any, Dict, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.stft import Stft from espnet2.s2st.tgt_feats_extract.abs_tgt_feats_extract import AbsTgtFeatsExtract @@ -13,6 +13,7 @@ class LinearSpectrogram(AbsTgtFeatsExtract): Stft -> amplitude-spec """ + @typechecked def __init__( self, n_fft: int = 1024, @@ -23,7 +24,6 @@ def __init__( normalized: bool = False, onesided: bool = True, ): - assert check_argument_types() super().__init__() self.n_fft = n_fft self.hop_length = hop_length diff --git a/espnet2/s2st/tgt_feats_extract/log_mel_fbank.py b/espnet2/s2st/tgt_feats_extract/log_mel_fbank.py index 133d2316d17..80fbacd2c4b 100644 --- a/espnet2/s2st/tgt_feats_extract/log_mel_fbank.py +++ b/espnet2/s2st/tgt_feats_extract/log_mel_fbank.py @@ -2,7 +2,7 @@ import humanfriendly import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.log_mel import LogMel from espnet2.layers.stft import Stft @@ -15,6 +15,7 @@ class LogMelFbank(AbsTgtFeatsExtract): Stft -> amplitude-spec -> Log-Mel-Fbank """ + @typechecked def __init__( self, fs: Union[int, str] = 16000, @@ -31,7 +32,6 @@ def __init__( htk: bool = False, log_base: Optional[float] = 10.0, ): - assert check_argument_types() super().__init__() if isinstance(fs, str): fs = humanfriendly.parse_size(fs) diff --git a/espnet2/s2st/tgt_feats_extract/log_spectrogram.py b/espnet2/s2st/tgt_feats_extract/log_spectrogram.py index cd5dfbe9137..4666516c18f 100644 --- a/espnet2/s2st/tgt_feats_extract/log_spectrogram.py +++ b/espnet2/s2st/tgt_feats_extract/log_spectrogram.py @@ -1,7 +1,7 @@ from typing import Any, Dict, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.stft import Stft from 
espnet2.s2st.tgt_feats_extract.abs_tgt_feats_extract import AbsTgtFeatsExtract @@ -13,6 +13,7 @@ class LogSpectrogram(AbsTgtFeatsExtract): Stft -> log-amplitude-spec """ + @typechecked def __init__( self, n_fft: int = 1024, @@ -23,7 +24,6 @@ def __init__( normalized: bool = False, onesided: bool = True, ): - assert check_argument_types() super().__init__() self.n_fft = n_fft self.hop_length = hop_length diff --git a/espnet2/s2t/espnet_model.py b/espnet2/s2t/espnet_model.py index 59bc5c4586a..f8e938a2b45 100644 --- a/espnet2/s2t/espnet_model.py +++ b/espnet2/s2t/espnet_model.py @@ -3,7 +3,7 @@ import torch from torch.cuda.amp import autocast -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -25,6 +25,7 @@ class ESPnetS2TModel(AbsESPnetModel): """CTC-attention hybrid Encoder-Decoder model""" + @typechecked def __init__( self, vocab_size: int, @@ -52,7 +53,6 @@ def __init__( sym_na: str = "", # not available extract_feats_in_collect_stats: bool = True, ): - assert check_argument_types() assert 0.0 <= ctc_weight <= 1.0, ctc_weight assert 0.0 <= interctc_weight < 1.0, interctc_weight diff --git a/espnet2/samplers/build_batch_sampler.py b/espnet2/samplers/build_batch_sampler.py index 62910929ccd..86cbc889381 100644 --- a/espnet2/samplers/build_batch_sampler.py +++ b/espnet2/samplers/build_batch_sampler.py @@ -1,6 +1,6 @@ -from typing import List, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.samplers.abs_sampler import AbsSampler from espnet2.samplers.folded_batch_sampler import FoldedBatchSampler @@ -69,6 +69,7 @@ ) +@typechecked def build_batch_sampler( type: str, batch_size: int, @@ -80,7 +81,7 @@ def build_batch_sampler( min_batch_size: int = 1, fold_lengths: Sequence[int] = (), padding: bool = True, - utt2category_file: str = None, + utt2category_file: Optional[str] = None, ) -> AbsSampler: """Helper function to instantiate BatchSampler. @@ -100,7 +101,6 @@ def build_batch_sampler( padding: Whether sequences are input as a padded tensor or not. 
used for "numel" mode """ - assert check_argument_types() if len(shape_files) == 0: raise ValueError("No shape file are given") @@ -160,5 +160,4 @@ def build_batch_sampler( else: raise ValueError(f"Not supported: {type}") - assert check_return_type(retval) return retval diff --git a/espnet2/samplers/category_balanced_sampler.py b/espnet2/samplers/category_balanced_sampler.py index 869bc273f58..b7b4398b482 100644 --- a/espnet2/samplers/category_balanced_sampler.py +++ b/espnet2/samplers/category_balanced_sampler.py @@ -15,11 +15,11 @@ # utterance_id_c 512,80\n", import random from collections import Counter -from typing import Iterator, List, Sequence, Tuple, Union +from typing import Iterator, Optional, Tuple -from typeguard import check_argument_types +from typeguard import typechecked -from espnet2.fileio.read_text import load_num_sequence_text, read_2columns_text +from espnet2.fileio.read_text import read_2columns_text from espnet2.samplers.abs_sampler import AbsSampler @@ -28,16 +28,16 @@ def round_down(num, divisor): class CategoryBalancedSampler(AbsSampler): + @typechecked def __init__( self, batch_size: int, min_batch_size: int = 1, drop_last: bool = False, - category2utt_file: str = None, + category2utt_file: Optional[str] = None, epoch: int = 1, **kwargs, ): - assert check_argument_types() assert batch_size > 0 random.seed(epoch) diff --git a/espnet2/samplers/folded_batch_sampler.py b/espnet2/samplers/folded_batch_sampler.py index 554caa7c74c..7edf9da4755 100644 --- a/espnet2/samplers/folded_batch_sampler.py +++ b/espnet2/samplers/folded_batch_sampler.py @@ -1,12 +1,13 @@ -from typing import Iterator, List, Sequence, Tuple, Union +from typing import Iterator, List, Optional, Sequence, Tuple, Union -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import load_num_sequence_text, read_2columns_text from espnet2.samplers.abs_sampler import AbsSampler class FoldedBatchSampler(AbsSampler): + @typechecked def __init__( self, batch_size: int, @@ -16,9 +17,8 @@ def __init__( sort_in_batch: str = "descending", sort_batch: str = "ascending", drop_last: bool = False, - utt2category_file: str = None, + utt2category_file: Optional[str] = None, ): - assert check_argument_types() assert batch_size > 0 if sort_batch != "ascending" and sort_batch != "descending": raise ValueError( diff --git a/espnet2/samplers/length_batch_sampler.py b/espnet2/samplers/length_batch_sampler.py index 5e1cf6e3e6d..7d532db2e3c 100644 --- a/espnet2/samplers/length_batch_sampler.py +++ b/espnet2/samplers/length_batch_sampler.py @@ -1,12 +1,13 @@ from typing import Iterator, List, Tuple, Union -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import load_num_sequence_text from espnet2.samplers.abs_sampler import AbsSampler class LengthBatchSampler(AbsSampler): + @typechecked def __init__( self, batch_bins: int, @@ -17,7 +18,6 @@ def __init__( drop_last: bool = False, padding: bool = True, ): - assert check_argument_types() assert batch_bins > 0 if sort_batch != "ascending" and sort_batch != "descending": raise ValueError( diff --git a/espnet2/samplers/num_elements_batch_sampler.py b/espnet2/samplers/num_elements_batch_sampler.py index 31569e2e81f..540942a8304 100644 --- a/espnet2/samplers/num_elements_batch_sampler.py +++ b/espnet2/samplers/num_elements_batch_sampler.py @@ -1,13 +1,14 @@ from typing import Iterator, List, Tuple, Union import numpy as np -from typeguard import check_argument_types +from 
typeguard import typechecked from espnet2.fileio.read_text import load_num_sequence_text from espnet2.samplers.abs_sampler import AbsSampler class NumElementsBatchSampler(AbsSampler): + @typechecked def __init__( self, batch_bins: int, @@ -18,7 +19,6 @@ def __init__( drop_last: bool = False, padding: bool = True, ): - assert check_argument_types() assert batch_bins > 0 if sort_batch != "ascending" and sort_batch != "descending": raise ValueError( diff --git a/espnet2/samplers/sorted_batch_sampler.py b/espnet2/samplers/sorted_batch_sampler.py index be26aa56010..b30a527e570 100644 --- a/espnet2/samplers/sorted_batch_sampler.py +++ b/espnet2/samplers/sorted_batch_sampler.py @@ -1,7 +1,7 @@ import logging from typing import Iterator, Tuple -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import load_num_sequence_text from espnet2.samplers.abs_sampler import AbsSampler @@ -17,6 +17,7 @@ class SortedBatchSampler(AbsSampler): sort_batch: """ + @typechecked def __init__( self, batch_size: int, @@ -25,7 +26,6 @@ def __init__( sort_batch: str = "ascending", drop_last: bool = False, ): - assert check_argument_types() assert batch_size > 0 self.batch_size = batch_size self.shape_file = shape_file diff --git a/espnet2/samplers/unsorted_batch_sampler.py b/espnet2/samplers/unsorted_batch_sampler.py index ed8add71282..023883b5cb7 100644 --- a/espnet2/samplers/unsorted_batch_sampler.py +++ b/espnet2/samplers/unsorted_batch_sampler.py @@ -1,7 +1,7 @@ import logging -from typing import Iterator, Tuple +from typing import Iterator, Optional, Tuple -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.fileio.read_text import read_2columns_text from espnet2.samplers.abs_sampler import AbsSampler @@ -20,14 +20,14 @@ class UnsortedBatchSampler(AbsSampler): key_file: """ + @typechecked def __init__( self, batch_size: int, key_file: str, drop_last: bool = False, - utt2category_file: str = None, + utt2category_file: Optional[str] = None, ): - assert check_argument_types() assert batch_size > 0 self.batch_size = batch_size self.key_file = key_file diff --git a/espnet2/schedulers/cosine_anneal_warmup_restart.py b/espnet2/schedulers/cosine_anneal_warmup_restart.py index d5af5c9bb71..56346cfe9f0 100644 --- a/espnet2/schedulers/cosine_anneal_warmup_restart.py +++ b/espnet2/schedulers/cosine_anneal_warmup_restart.py @@ -13,7 +13,8 @@ class CosineAnnealingWarmupRestarts(_LRScheduler, AbsBatchStepScheduler): - """ + """Cosine Annealing Warmup Restart. + optimizer (Optimizer): Wrapped optimizer. first_cycle_steps (int): First cycle step size. cycle_mult(float): Cycle steps magnification. Default: -1. 
diff --git a/espnet2/schedulers/noam_lr.py b/espnet2/schedulers/noam_lr.py index 44888dbd7aa..1645c4c3810 100644 --- a/espnet2/schedulers/noam_lr.py +++ b/espnet2/schedulers/noam_lr.py @@ -5,7 +5,7 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.schedulers.abs_scheduler import AbsBatchStepScheduler @@ -25,6 +25,7 @@ class NoamLR(_LRScheduler, AbsBatchStepScheduler): """ + @typechecked def __init__( self, optimizer: torch.optim.Optimizer, @@ -32,7 +33,6 @@ def __init__( warmup_steps: Union[int, float] = 25000, last_epoch: int = -1, ): - assert check_argument_types() self.model_size = model_size self.warmup_steps = warmup_steps diff --git a/espnet2/schedulers/piecewise_linear_warmup_lr.py b/espnet2/schedulers/piecewise_linear_warmup_lr.py index 8b17fbe9493..527a43f94db 100644 --- a/espnet2/schedulers/piecewise_linear_warmup_lr.py +++ b/espnet2/schedulers/piecewise_linear_warmup_lr.py @@ -5,7 +5,7 @@ import numpy as np import torch from torch.optim.lr_scheduler import _LRScheduler -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.schedulers.abs_scheduler import AbsBatchStepScheduler @@ -18,6 +18,7 @@ class PiecewiseLinearWarmupLR(_LRScheduler, AbsBatchStepScheduler): """ + @typechecked def __init__( self, optimizer: torch.optim.Optimizer, @@ -25,7 +26,6 @@ def __init__( warmup_lr_list: List[float] = [0.0, 0.001], last_epoch: int = -1, ): - assert check_argument_types() self.warmup_steps_list = warmup_steps_list self.warmup_lr_list = warmup_lr_list diff --git a/espnet2/schedulers/warmup_lr.py b/espnet2/schedulers/warmup_lr.py index 904aaff5ca9..40f9e65522a 100644 --- a/espnet2/schedulers/warmup_lr.py +++ b/espnet2/schedulers/warmup_lr.py @@ -4,7 +4,7 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.schedulers.abs_scheduler import AbsBatchStepScheduler @@ -25,13 +25,13 @@ class WarmupLR(_LRScheduler, AbsBatchStepScheduler): """ + @typechecked def __init__( self, optimizer: torch.optim.Optimizer, warmup_steps: Union[int, float] = 25000, last_epoch: int = -1, ): - assert check_argument_types() self.warmup_steps = warmup_steps # __init__() must be invoked before setting field diff --git a/espnet2/schedulers/warmup_reducelronplateau.py b/espnet2/schedulers/warmup_reducelronplateau.py index 720bbd20d85..0c1ef80da9f 100644 --- a/espnet2/schedulers/warmup_reducelronplateau.py +++ b/espnet2/schedulers/warmup_reducelronplateau.py @@ -4,7 +4,7 @@ import torch from torch import inf -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.schedulers.abs_scheduler import ( AbsBatchStepScheduler, @@ -35,6 +35,7 @@ class WarmupReduceLROnPlateau(AbsBatchStepScheduler, AbsValEpochStepScheduler): """ + @typechecked def __init__( self, optimizer: torch.optim.Optimizer, @@ -51,7 +52,6 @@ def __init__( eps=1e-8, verbose=False, ): - assert check_argument_types() self.warmup_steps = warmup_steps self.step_num = 0 self.lr_scale = warmup_steps**-1 diff --git a/espnet2/schedulers/warmup_step_lr.py b/espnet2/schedulers/warmup_step_lr.py index e2874b6d7d4..3096e9b0da5 100644 --- a/espnet2/schedulers/warmup_step_lr.py +++ b/espnet2/schedulers/warmup_step_lr.py @@ -4,7 +4,7 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from typeguard import check_argument_types +from typeguard import typechecked from 
espnet2.schedulers.abs_scheduler import AbsBatchStepScheduler @@ -28,6 +28,7 @@ class WarmupStepLR(_LRScheduler, AbsBatchStepScheduler): """ + @typechecked def __init__( self, optimizer: torch.optim.Optimizer, @@ -39,7 +40,6 @@ def __init__( gamma: float = 0.1, last_epoch: int = -1, ): - assert check_argument_types() self.warmup_steps = warmup_steps self.step_num = 0 diff --git a/espnet2/slu/espnet_model.py b/espnet2/slu/espnet_model.py index 1ef20103648..e76d79225c0 100644 --- a/espnet2/slu/espnet_model.py +++ b/espnet2/slu/espnet_model.py @@ -3,7 +3,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -35,6 +35,7 @@ def autocast(enabled=True): class ESPnetSLUModel(ESPnetASRModel): """CTC-attention hybrid Encoder-Decoder model""" + @typechecked def __init__( self, vocab_size: int, @@ -50,7 +51,7 @@ def __init__( joint_network: Optional[torch.nn.Module], postdecoder: Optional[AbsPostDecoder] = None, deliberationencoder: Optional[AbsPostEncoder] = None, - transcript_token_list: Union[Tuple[str, ...], List[str]] = None, + transcript_token_list: Union[Tuple[str, ...], List[str], None] = None, ctc_weight: float = 0.5, interctc_weight: float = 0.0, ignore_id: int = -1, @@ -64,7 +65,6 @@ def __init__( two_pass: bool = False, pre_postencoder_norm: bool = False, ): - assert check_argument_types() assert 0.0 <= ctc_weight <= 1.0, ctc_weight assert 0.0 <= interctc_weight < 1.0, interctc_weight diff --git a/espnet2/slu/postdecoder/hugging_face_transformers_postdecoder.py b/espnet2/slu/postdecoder/hugging_face_transformers_postdecoder.py index bf49cbda3ae..5bf035fff47 100644 --- a/espnet2/slu/postdecoder/hugging_face_transformers_postdecoder.py +++ b/espnet2/slu/postdecoder/hugging_face_transformers_postdecoder.py @@ -15,19 +15,19 @@ import logging import torch -from typeguard import check_argument_types +from typeguard import typechecked class HuggingFaceTransformersPostDecoder(AbsPostDecoder): """Hugging Face Transformers PostEncoder.""" + @typechecked def __init__( self, model_name_or_path: str, output_size=256, ): """Initialize the module.""" - assert check_argument_types() super().__init__() if not is_transformers_available: raise ImportError( diff --git a/espnet2/slu/postencoder/conformer_postencoder.py b/espnet2/slu/postencoder/conformer_postencoder.py index 0d771801454..53edbd84d37 100644 --- a/espnet2/slu/postencoder/conformer_postencoder.py +++ b/espnet2/slu/postencoder/conformer_postencoder.py @@ -7,7 +7,7 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.postencoder.abs_postencoder import AbsPostEncoder from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule @@ -72,6 +72,7 @@ class ConformerPostEncoder(AbsPostEncoder): """ + @typechecked def __init__( self, input_size: int, @@ -97,7 +98,6 @@ def __init__( cnn_module_kernel: int = 31, padding_idx: int = -1, ): - assert check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/slu/postencoder/transformer_postencoder.py b/espnet2/slu/postencoder/transformer_postencoder.py index 72bd8e34d90..861e2d131ab 100644 --- a/espnet2/slu/postencoder/transformer_postencoder.py +++ b/espnet2/slu/postencoder/transformer_postencoder.py @@ -5,7 +5,7 @@ from typing import Optional, Tuple import torch -from typeguard import 
check_argument_types +from typeguard import typechecked from espnet2.asr.postencoder.abs_postencoder import AbsPostEncoder from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -48,6 +48,7 @@ class TransformerPostEncoder(AbsPostEncoder): padding_idx: padding_idx for input_layer=embed """ + @typechecked def __init__( self, input_size: int, @@ -66,7 +67,6 @@ def __init__( positionwise_conv_kernel_size: int = 1, padding_idx: int = -1, ): - assert check_argument_types() super().__init__() self._output_size = output_size diff --git a/espnet2/spk/encoder/conformer_encoder.py b/espnet2/spk/encoder/conformer_encoder.py index 383fdf7c20f..75df5e96c02 100644 --- a/espnet2/spk/encoder/conformer_encoder.py +++ b/espnet2/spk/encoder/conformer_encoder.py @@ -7,7 +7,7 @@ from typing import List, Optional, Tuple, Union import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule @@ -39,14 +39,12 @@ Conv2dSubsampling2, Conv2dSubsampling6, Conv2dSubsampling8, - TooShortUttError, - check_short_utt, ) class MfaConformerEncoder(AbsEncoder): - """ - Conformer encoder module for MFA-Conformer. + """Conformer encoder module for MFA-Conformer. + Paper: Y. Zhang et al., ``Mfa-conformer: Multi-scale feature aggregation conformer for automatic speaker verification,'' in Proc. INTERSPEECH, 2022. @@ -78,6 +76,7 @@ class MfaConformerEncoder(AbsEncoder): """ + @typechecked def __init__( self, input_size: int, @@ -103,8 +102,8 @@ def __init__( stochastic_depth_rate: Union[float, List[float]] = 0.0, layer_drop_rate: float = 0.0, max_pos_emb_len: int = 5000, + padding_idx: Optional[int] = None, ): - assert check_argument_types() super().__init__() self._output_size = output_size * num_blocks @@ -308,7 +307,7 @@ def forward( xs_pad, _ = self.embed(x, masks) else: raise NotImplementedError( - f"Supposed to be one of the Conv" f"subsampling layers" + "Supposed to be one of the Conv subsampling layers" ) intermediate_outs = [] diff --git a/espnet2/spk/encoder/ecapa_tdnn_encoder.py b/espnet2/spk/encoder/ecapa_tdnn_encoder.py index 892d9aa9ebf..ad9c48bcdcf 100644 --- a/espnet2/spk/encoder/ecapa_tdnn_encoder.py +++ b/espnet2/spk/encoder/ecapa_tdnn_encoder.py @@ -7,15 +7,15 @@ import torch import torch.nn as nn -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.spk.layers.ecapa_block import EcapaBlock class EcapaTdnnEncoder(AbsEncoder): - """ - ECAPA-TDNN encoder. Extracts frame-level ECAPA-TDNN embeddings from + """ECAPA-TDNN encoder. Extracts frame-level ECAPA-TDNN embeddings from + mel-filterbank energy or MFCC features. Paper: B Desplanques at el., ``ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification,'' @@ -29,6 +29,7 @@ class EcapaTdnnEncoder(AbsEncoder): output_size: output embedding dimension. 
""" + @typechecked def __init__( self, input_size: int, @@ -38,10 +39,9 @@ def __init__( output_size: int = 1536, **kwargs, ): - assert check_argument_types() super().__init__() if block == "EcapaBlock": - block = EcapaBlock + block: type = EcapaBlock else: raise ValueError(f"unsupported block, got: {block}") self._output_size = output_size diff --git a/espnet2/spk/encoder/identity_encoder.py b/espnet2/spk/encoder/identity_encoder.py index 360010acb62..1fc25d6e7fe 100644 --- a/espnet2/spk/encoder/identity_encoder.py +++ b/espnet2/spk/encoder/identity_encoder.py @@ -4,16 +4,13 @@ """RawNet3 Encoder""" import torch -import torch.nn as nn -from typeguard import check_argument_types from espnet2.asr.encoder.abs_encoder import AbsEncoder -from espnet2.spk.layers.rawnet_block import Bottle2neck class IdentityEncoder(AbsEncoder): - """ - Identity encoder. Does nothing, just passes frontend feature to the pooling. + """Identity encoder. Does nothing, just passes frontend feature to the pooling. + Expected to be used for cases when frontend already has a good representation (e.g., SSL features). diff --git a/espnet2/spk/encoder/rawnet3_encoder.py b/espnet2/spk/encoder/rawnet3_encoder.py index 3e1aabe29e5..5b38ee189c1 100644 --- a/espnet2/spk/encoder/rawnet3_encoder.py +++ b/espnet2/spk/encoder/rawnet3_encoder.py @@ -5,15 +5,15 @@ import torch import torch.nn as nn -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.spk.layers.rawnet_block import Bottle2neck class RawNet3Encoder(AbsEncoder): - """ - RawNet3 encoder. Extracts frame-level RawNet embeddings from raw waveform. + """RawNet3 encoder. Extracts frame-level RawNet embeddings from raw waveform. + paper: J. Jung et al., "Pushing the limits of raw waveform speaker recognition", in Proc. INTERSPEECH, 2022. @@ -25,6 +25,7 @@ class RawNet3Encoder(AbsEncoder): output_size: ouptut embedding dimension. """ + @typechecked def __init__( self, input_size: int, @@ -34,10 +35,9 @@ def __init__( output_size: int = 1536, **kwargs, ): - assert check_argument_types() super().__init__() if block == "Bottle2neck": - block = Bottle2neck + block: type = Bottle2neck else: raise ValueError(f"unsupported block, got: {block}") diff --git a/espnet2/spk/encoder/ska_tdnn_encoder.py b/espnet2/spk/encoder/ska_tdnn_encoder.py index 25548758082..ddaab6bb7df 100644 --- a/espnet2/spk/encoder/ska_tdnn_encoder.py +++ b/espnet2/spk/encoder/ska_tdnn_encoder.py @@ -1,5 +1,3 @@ -#! /usr/bin/python -# -*- encoding: utf-8 -*- # SKA-TDNN, original code from: https://github.com/msh9184/ska-tdnn # adapted for ESPnet-SPK by Jee-weon Jung import math @@ -7,8 +5,7 @@ import torch import torch.nn as nn -import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder @@ -168,7 +165,8 @@ def __init__(self, channel=128, reduction=4, L=16, num_kernels=2): self.softmax = nn.Softmax(dim=0) def forward(self, x, convs): - """ + """Forward function. + Input: [B, C, T] Split: [K, B, C, T] Fues: [B, C, T] @@ -240,7 +238,8 @@ def __init__( self.softmax = nn.Softmax(dim=0) def forward(self, x): - """ + """Forward function. + Input: [B, C, F, T] Split: [K, B, C, F, T] Fues: [B, C, F, T] @@ -312,7 +311,8 @@ def __init__( self.softmax = nn.Softmax(dim=0) def forward(self, x): - """ + """Forward Function. 
+ Input: [B, C, F, T] Split: [K, B, C, F, T] Fuse: [B, C, F, T] @@ -339,8 +339,8 @@ def forward(self, x): class SkaTdnnEncoder(AbsEncoder): - """ - SKA-TDNN encoder. Extracts frame-level SKA-TDNN embeddings from features. + """SKA-TDNN encoder. Extracts frame-level SKA-TDNN embeddings from features. + Paper: S. Mun, J. Jung et al., "Frequency and Multi-Scale Selective Kernel Attention for Speaker Verification,' in Proc. IEEE SLT 2022. @@ -352,6 +352,7 @@ class SkaTdnnEncoder(AbsEncoder): output_size: ouptut embedding dimension. """ + @typechecked def __init__( self, input_size: int, @@ -363,11 +364,10 @@ def __init__( output_size: int = 1536, **kwargs, ): - assert check_argument_types() super().__init__() if block == "Bottle2neck": - block = Bottle2neck + block: type = Bottle2neck else: raise ValueError(f"unsupported block, got: {block}") diff --git a/espnet2/spk/encoder/xvector_encoder.py b/espnet2/spk/encoder/xvector_encoder.py index 77acbd5752d..ffdeb7d0dd5 100644 --- a/espnet2/spk/encoder/xvector_encoder.py +++ b/espnet2/spk/encoder/xvector_encoder.py @@ -1,22 +1,17 @@ -#! /usr/bin/python -# -*- encoding: utf-8 -*- # x-vector, cross checked with SpeechBrain implementation: # https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/lobes/models/Xvector.py # adapted for ESPnet-SPK by Jee-weon Jung -import math -from collections import OrderedDict from typing import List -import torch import torch.nn as nn -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder class XvectorEncoder(AbsEncoder): - """ - x-vector encoder. Extracts frame-level x-vector embeddings from features. + """X-vector encoder. Extracts frame-level x-vector embeddings from features. + Paper: D. Snyder et al., "X-vectors: Robust dnn embeddings for speaker recognition," in Proc. IEEE ICASSP, 2018. @@ -26,6 +21,7 @@ class XvectorEncoder(AbsEncoder): output_size: ouptut embedding dimension. """ + @typechecked def __init__( self, input_size: int, @@ -36,7 +32,6 @@ def __init__( dilations: List = [1, 2, 3, 1, 1], **kwargs, ): - assert check_argument_types() super().__init__() self._output_size = output_size in_channels = [input_size] + [ndim] * 4 diff --git a/espnet2/spk/espnet_model.py b/espnet2/spk/espnet_model.py index 321229c092e..b968b4031ae 100644 --- a/espnet2/spk/espnet_model.py +++ b/espnet2/spk/espnet_model.py @@ -4,13 +4,12 @@ from typing import Dict, Optional, Tuple, Union import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.asr.specaug.abs_specaug import AbsSpecAug from espnet2.layers.abs_normalize import AbsNormalize -from espnet2.spk.loss.aamsoftmax import AAMSoftmax from espnet2.spk.loss.abs_loss import AbsLoss from espnet2.spk.pooling.abs_pooling import AbsPooling from espnet2.spk.projector.abs_projector import AbsProjector @@ -19,8 +18,8 @@ class ESPnetSpeakerModel(AbsESPnetModel): - """ - Speaker embedding extraction model. + """Speaker embedding extraction model. + Core model for diverse speaker-related tasks (e.g., verification, open-set identification, diarization) @@ -39,6 +38,7 @@ class ESPnetSpeakerModel(AbsESPnetModel): (e.g., ASR, SE, target speaker extraction). 
""" + @typechecked def __init__( self, frontend: Optional[AbsFrontend], @@ -49,7 +49,6 @@ def __init__( projector: Optional[AbsProjector], loss: Optional[AbsLoss], ): - assert check_argument_types() super().__init__() @@ -61,16 +60,19 @@ def __init__( self.projector = projector self.loss = loss + @typechecked def forward( self, speech: torch.Tensor, - spk_labels: torch.Tensor = None, - task_tokens: torch.Tensor = None, + spk_labels: Optional[torch.Tensor] = None, + task_tokens: Optional[torch.Tensor] = None, extract_embd: bool = False, **kwargs, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: - """ - Feed-forward through encoder layers and aggregate into utterance-level + ) -> Union[ + Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor], torch.Tensor + ]: + """Feed-forward through encoder layers and aggregate into utterance-level + feature. Args: diff --git a/espnet2/spk/layers/rawnet_block.py b/espnet2/spk/layers/rawnet_block.py index 5461c62873f..fd640403aa0 100755 --- a/espnet2/spk/layers/rawnet_block.py +++ b/espnet2/spk/layers/rawnet_block.py @@ -6,8 +6,7 @@ class AFMS(nn.Module): - """ - Alpha-Feature map scaling, added to the output of each residual block[1,2]. + """Alpha-Feature map scaling, added to the output of each residual block[1,2]. Reference: [1] RawNet2 : https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1011.pdf diff --git a/espnet2/spk/loss/aamsoftmax.py b/espnet2/spk/loss/aamsoftmax.py index 4005fa03e85..1bc412bafbf 100644 --- a/espnet2/spk/loss/aamsoftmax.py +++ b/espnet2/spk/loss/aamsoftmax.py @@ -13,8 +13,7 @@ class AAMSoftmax(AbsLoss): - """ - Additive angular margin softmax. + """Additive angular margin softmax. Paper: Deng, Jiankang, et al. "Arcface: Additive angular margin loss for deep face recognition." Proceedings of the IEEE/CVF conference on computer diff --git a/espnet2/spk/loss/aamsoftmax_subcenter_intertopk.py b/espnet2/spk/loss/aamsoftmax_subcenter_intertopk.py index 1dfdf17fdd0..7de85c3be6a 100644 --- a/espnet2/spk/loss/aamsoftmax_subcenter_intertopk.py +++ b/espnet2/spk/loss/aamsoftmax_subcenter_intertopk.py @@ -1,5 +1,3 @@ -#! /usr/bin/python -# -*- encoding: utf-8 -*- # code from WeSpeaker: https://github.com/wenet-e2e/wespeaker/blob/ # c9ec537b53fe1e04525be74b2550ee95bed3a891/wespeaker/models/projections.py#L243 @@ -14,6 +12,7 @@ class ArcMarginProduct_intertopk_subcenter(AbsLoss): r"""Implement of large margin arc distance with intertopk and subcenter: + Reference: MULTI-QUERY MULTI-HEAD ATTENTION POOLING AND INTER-TOPK PENALTY FOR SPEAKER VERIFICATION. diff --git a/espnet2/spk/loss/abs_loss.py b/espnet2/spk/loss/abs_loss.py index fdf3abf1bdb..8c616daea8b 100644 --- a/espnet2/spk/loss/abs_loss.py +++ b/espnet2/spk/loss/abs_loss.py @@ -1,8 +1,6 @@ -#! /usr/bin/python -# -*- encoding: utf-8 -*- # code from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/aamsoftmax.py # Adapted from https://github.com/wujiyang/Face_Pytorch (Apache License) -from abc import ABC, abstractmethod +from abc import abstractmethod import torch import torch.nn as nn diff --git a/espnet2/spk/pooling/chn_attn_stat_pooling.py b/espnet2/spk/pooling/chn_attn_stat_pooling.py index c7f79921c92..04de37c6a4b 100644 --- a/espnet2/spk/pooling/chn_attn_stat_pooling.py +++ b/espnet2/spk/pooling/chn_attn_stat_pooling.py @@ -5,8 +5,8 @@ class ChnAttnStatPooling(AbsPooling): - """ - Aggregates frame-level features to single utterance-level feature. + """Aggregates frame-level features to single utterance-level feature. 
+ Proposed in B.Desplanques et al., "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification" diff --git a/espnet2/spk/pooling/mean_pooling.py b/espnet2/spk/pooling/mean_pooling.py index a9a0942ec6a..b205960e3fc 100644 --- a/espnet2/spk/pooling/mean_pooling.py +++ b/espnet2/spk/pooling/mean_pooling.py @@ -4,8 +4,7 @@ class MeanPooling(AbsPooling): - """ - Average frame-level features to a single utterance-level feature. + """Average frame-level features to a single utterance-level feature. args: input_size: dimensionality of the input frame-level embeddings. diff --git a/espnet2/spk/pooling/stat_pooling.py b/espnet2/spk/pooling/stat_pooling.py index d50d72e2e60..97f1f23a7e9 100644 --- a/espnet2/spk/pooling/stat_pooling.py +++ b/espnet2/spk/pooling/stat_pooling.py @@ -4,8 +4,8 @@ class StatsPooling(AbsPooling): - """ - Aggregates frame-level features to single utterance-level feature. + """Aggregates frame-level features to single utterance-level feature. + Proposed in D. Snyder et al., "X-vectors: Robust dnn embeddings for speaker recognition" diff --git a/espnet2/st/espnet_model.py b/espnet2/st/espnet_model.py index e377bbfcb0c..56435c7b6e6 100644 --- a/espnet2/st/espnet_model.py +++ b/espnet2/st/espnet_model.py @@ -7,7 +7,7 @@ import torch from packaging.version import parse as V from torch.nn.utils.rnn import pad_sequence -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -40,6 +40,7 @@ def autocast(enabled=True): class ESPnetSTModel(AbsESPnetModel): """CTC-attention hybrid Encoder-Decoder model""" + @typechecked def __init__( self, vocab_size: int, @@ -82,7 +83,6 @@ def __init__( tgt_sym_eos: str = "", lang_token_id: int = -1, ): - assert check_argument_types() assert 0.0 <= asr_weight < 1.0, "asr_weight should be [0.0, 1.0)" assert 0.0 <= mt_weight < 1.0, "mt_weight should be [0.0, 1.0)" assert 0.0 <= mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]" diff --git a/espnet2/svs/espnet_model.py b/espnet2/svs/espnet_model.py index 6618d862a0a..66843f7db12 100644 --- a/espnet2/svs/espnet_model.py +++ b/espnet2/svs/espnet_model.py @@ -10,7 +10,7 @@ from typing import Dict, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.abs_normalize import AbsNormalize from espnet2.layers.inversible_interface import InversibleInterface @@ -35,6 +35,7 @@ def autocast(enabled=True): # NOQA class ESPnetSVSModel(AbsESPnetModel): """ESPnet model for singing voice synthesis task.""" + @typechecked def __init__( self, text_extract: Optional[AbsFeatsExtract], @@ -51,7 +52,6 @@ def __init__( svs: AbsSVS, ): """Initialize ESPnetSVSModel module.""" - assert check_argument_types() super().__init__() self.text_extract = text_extract self.feats_extract = feats_extract diff --git a/espnet2/svs/feats_extract/score_feats_extract.py b/espnet2/svs/feats_extract/score_feats_extract.py index 7aea42c89e8..7118ef42eb9 100644 --- a/espnet2/svs/feats_extract/score_feats_extract.py +++ b/espnet2/svs/feats_extract/score_feats_extract.py @@ -1,7 +1,7 @@ from typing import Any, Dict, Optional, Tuple, Union import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.tts.feats_extract.abs_feats_extract import AbsFeatsExtract from espnet.nets.pytorch_backend.nets_utils import make_pad_mask @@ -17,6 +17,7 @@ def ListsToTensor(xs): class 
FrameScoreFeats(AbsFeatsExtract): + @typechecked def __init__( self, fs: Union[int, str] = 22050, @@ -28,7 +29,6 @@ def __init__( ): if win_length is None: win_length = n_fft - assert check_argument_types() super().__init__() self.fs = fs @@ -154,6 +154,7 @@ def forward( class SyllableScoreFeats(AbsFeatsExtract): + @typechecked def __init__( self, fs: Union[int, str] = 22050, @@ -165,7 +166,6 @@ def __init__( ): if win_length is None: win_length = n_fft - assert check_argument_types() super().__init__() self.fs = fs diff --git a/espnet2/svs/naive_rnn/naive_rnn.py b/espnet2/svs/naive_rnn/naive_rnn.py index 919a9843a57..97914c33d79 100644 --- a/espnet2/svs/naive_rnn/naive_rnn.py +++ b/espnet2/svs/naive_rnn/naive_rnn.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.svs.abs_svs import AbsSVS from espnet2.torch_utils.device_funcs import force_gatherable @@ -89,6 +89,7 @@ class NaiveRNN(AbsSVS): predict the singing voice features """ + @typechecked def __init__( self, # network structure related @@ -167,7 +168,6 @@ def __init__( loss_type (str): Loss function type ("L1", "L2", or "L1+L2"). """ - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/svs/naive_rnn/naive_rnn_dp.py b/espnet2/svs/naive_rnn/naive_rnn_dp.py index 68b42b8c0d3..f72f0a903a5 100644 --- a/espnet2/svs/naive_rnn/naive_rnn_dp.py +++ b/espnet2/svs/naive_rnn/naive_rnn_dp.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.svs.abs_svs import AbsSVS from espnet2.torch_utils.device_funcs import force_gatherable @@ -31,6 +31,7 @@ class NaiveRNNDP(AbsSVS): predict the singing voice features """ + @typechecked def __init__( self, # network structure related @@ -116,7 +117,6 @@ def __init__( loss calculation. """ - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/svs/singing_tacotron/decoder.py b/espnet2/svs/singing_tacotron/decoder.py index d4a698c57e5..42f529d9c8a 100644 --- a/espnet2/svs/singing_tacotron/decoder.py +++ b/espnet2/svs/singing_tacotron/decoder.py @@ -9,7 +9,6 @@ import six import torch -import torch.nn.functional as F from espnet.nets.pytorch_backend.rnn.attentions import AttForwardTA from espnet.nets.pytorch_backend.tacotron2.decoder import Postnet, Prenet, ZoneOutCell @@ -251,6 +250,7 @@ def inference( forward_window=3, ): """Generate the sequence of features given the sequences of characters. + Args: h (Tensor): Input sequence of encoder hidden states (T, C). trans_token (Tensor): Global transition token for duration. 
diff --git a/espnet2/svs/singing_tacotron/singing_tacotron.py b/espnet2/svs/singing_tacotron/singing_tacotron.py index 861ce8583db..cdc1ec2ced7 100644 --- a/espnet2/svs/singing_tacotron/singing_tacotron.py +++ b/espnet2/svs/singing_tacotron/singing_tacotron.py @@ -7,11 +7,9 @@ import logging from typing import Dict, Optional, Sequence, Tuple -import six import torch import torch.nn.functional as F -from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.svs.abs_svs import AbsSVS from espnet2.svs.singing_tacotron.decoder import Decoder @@ -45,6 +43,7 @@ class singing_tacotron(AbsSVS): """ + @typechecked def __init__( self, # network structure related @@ -70,7 +69,7 @@ def __init__( postnet_layers: int = 5, postnet_chans: int = 512, postnet_filts: int = 5, - output_activation: str = None, + output_activation: Optional[str] = None, use_batch_norm: bool = True, use_concate: bool = True, use_residual: bool = False, @@ -156,7 +155,6 @@ def __init__( guided_attn_loss_lambda (float): Lambda in guided attention loss. """ - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/svs/xiaoice/XiaoiceSing.py b/espnet2/svs/xiaoice/XiaoiceSing.py index 588ddb2438f..a39fa9f8d6d 100644 --- a/espnet2/svs/xiaoice/XiaoiceSing.py +++ b/espnet2/svs/xiaoice/XiaoiceSing.py @@ -9,7 +9,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.svs.abs_svs import AbsSVS from espnet2.svs.xiaoice.loss import XiaoiceSing2Loss @@ -48,6 +48,7 @@ class XiaoiceSing(AbsSVS): https://arxiv.org/pdf/2006.06261.pdf """ + @typechecked def __init__( self, # network structure related @@ -182,7 +183,6 @@ def __init__( lambda_vuv (float): Loss scaling coefficient for VUV loss. """ - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/svs/xiaoice/loss.py b/espnet2/svs/xiaoice/loss.py index 8f8a850f6a1..c545e58dd75 100644 --- a/espnet2/svs/xiaoice/loss.py +++ b/espnet2/svs/xiaoice/loss.py @@ -7,7 +7,7 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet.nets.pytorch_backend.fastspeech.duration_predictor import ( # noqa: H301 DurationPredictorLoss, @@ -18,6 +18,7 @@ class XiaoiceSing2Loss(torch.nn.Module): """Loss function module for FastSpeech2.""" + @typechecked def __init__(self, use_masking: bool = True, use_weighted_masking: bool = False): """Initialize feed-forward Transformer loss module. @@ -28,7 +29,6 @@ def __init__(self, use_masking: bool = True, use_weighted_masking: bool = False) calculation. 
""" - assert check_argument_types() super().__init__() assert (use_masking != use_weighted_masking) or not use_masking diff --git a/espnet2/tasks/abs_task.py b/espnet2/tasks/abs_task.py index d181774808a..72f6aa1245f 100644 --- a/espnet2/tasks/abs_task.py +++ b/espnet2/tasks/abs_task.py @@ -19,7 +19,7 @@ import yaml from packaging.version import parse as V from torch.utils.data import DataLoader -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet import __version__ from espnet2.iterators.abs_iter_factory import AbsIterFactory @@ -278,8 +278,8 @@ def build_model(cls, args: argparse.Namespace) -> AbsESPnetModel: raise NotImplementedError @classmethod + @typechecked def get_parser(cls) -> config_argparse.ArgumentParser: - assert check_argument_types() class ArgumentDefaultsRawTextHelpFormatter( argparse.RawTextHelpFormatter, @@ -961,7 +961,6 @@ class ArgumentDefaultsRawTextHelpFormatter( cls.trainer.add_arguments(parser) cls.add_task_arguments(parser) - assert check_return_type(parser) return parser @classmethod @@ -1004,6 +1003,7 @@ def exclude_opts(cls) -> Tuple[str, ...]: return "required", "print_config", "config", "ngpu" @classmethod + @typechecked def get_default_config(cls) -> Dict[str, Any]: """Return the configuration as dict. @@ -1017,7 +1017,6 @@ def get_class_type(name: str, classes: dict): return _cls # This method is used only for --print_config - assert check_argument_types() parser = cls.get_parser() args, _ = parser.parse_known_args() config = vars(args) @@ -1071,8 +1070,8 @@ def get_class_type(name: str, classes: dict): return config @classmethod + @typechecked def check_required_command_args(cls, args: argparse.Namespace): - assert check_argument_types() for k in vars(args): if "-" in k: raise RuntimeError(f'Use "_" instead of "-": parser.get_parser("{k}")') @@ -1093,6 +1092,7 @@ def check_required_command_args(cls, args: argparse.Namespace): sys.exit(2) @classmethod + @typechecked def check_task_requirements( cls, dataset: Union[AbsDataset, IterableESPnetDataset], @@ -1101,7 +1101,6 @@ def check_task_requirements( inference: bool = False, ) -> None: """Check if the dataset satisfy the requirement of current Task""" - assert check_argument_types() mes = ( f"If you intend to use an additional input, modify " f'"{cls.__name__}.required_data_names()" or ' @@ -1127,15 +1126,19 @@ def check_task_requirements( ) @classmethod + @typechecked def print_config(cls, file=sys.stdout) -> None: - assert check_argument_types() # Shows the config: e.g. python train.py asr --print_config config = cls.get_default_config() file.write(yaml_no_alias_safe_dump(config, indent=4, sort_keys=False)) @classmethod - def main(cls, args: argparse.Namespace = None, cmd: Sequence[str] = None): - assert check_argument_types() + @typechecked + def main( + cls, + args: Optional[argparse.Namespace] = None, + cmd: Optional[Sequence[str]] = None, + ): print(get_commandline_args(), file=sys.stderr) if args is None: parser = cls.get_parser() @@ -1205,8 +1208,8 @@ def main(cls, args: argparse.Namespace = None, cmd: Sequence[str] = None): pass @classmethod + @typechecked def main_worker(cls, args: argparse.Namespace): - assert check_argument_types() # 0. 
Init distributed process distributed_option = build_dataclass(DistributedOption, args) @@ -1573,12 +1576,13 @@ def build_iter_options( ) @classmethod + @typechecked def build_iter_factory( cls, args: argparse.Namespace, distributed_option: DistributedOption, mode: str, - kwargs: dict = None, + kwargs: Optional[dict] = None, ) -> AbsIterFactory: """Build a factory object of mini-batch iterator. @@ -1604,7 +1608,6 @@ def build_iter_factory( - 4 epoch with "--num_iters_per_epoch" == 1 """ - assert check_argument_types() iter_options = cls.build_iter_options(args, distributed_option, mode) # Overwrite iter_options if any kwargs is given @@ -1645,10 +1648,10 @@ def build_iter_factory( raise RuntimeError(f"Not supported: iterator_type={iterator_type}") @classmethod + @typechecked def build_sequence_iter_factory( cls, args: argparse.Namespace, iter_options: IteratorOptions, mode: str ) -> AbsIterFactory: - assert check_argument_types() dataset = ESPnetDataset( iter_options.data_path_and_name_and_type, @@ -1727,10 +1730,10 @@ def build_sequence_iter_factory( ) @classmethod + @typechecked def build_category_iter_factory( cls, args: argparse.Namespace, iter_options: IteratorOptions, mode: str ) -> AbsIterFactory: - assert check_argument_types() dataset = ESPnetDataset( iter_options.data_path_and_name_and_type, @@ -1811,13 +1814,13 @@ def build_category_iter_factory( ) @classmethod + @typechecked def build_chunk_iter_factory( cls, args: argparse.Namespace, iter_options: IteratorOptions, mode: str, ) -> AbsIterFactory: - assert check_argument_types() dataset = ESPnetDataset( iter_options.data_path_and_name_and_type, @@ -1922,10 +1925,10 @@ def build_task_iter_factory( raise NotImplementedError @classmethod + @typechecked def build_multiple_iter_factory( cls, args: argparse.Namespace, distributed_option: DistributedOption, mode: str ): - assert check_argument_types() iter_options = cls.build_iter_options(args, distributed_option, mode) assert len(iter_options.data_path_and_name_and_type) > 0, len( iter_options.data_path_and_name_and_type @@ -2008,22 +2011,22 @@ def build_multiple_iter_factory( ) @classmethod + @typechecked def build_streaming_iterator( cls, data_path_and_name_and_type, preprocess_fn, collate_fn, - key_file: str = None, + key_file: Optional[str] = None, batch_size: int = 1, dtype: str = np.float32, num_workers: int = 1, allow_variable_data_keys: bool = False, ngpu: int = 0, inference: bool = False, - mode: str = None, + mode: Optional[str] = None, ) -> DataLoader: """Build DataLoader using iterable dataset""" - assert check_argument_types() # For backward compatibility for pytorch DataLoader if collate_fn is not None: kwargs = dict(collate_fn=collate_fn) @@ -2054,10 +2057,11 @@ def build_streaming_iterator( # ~~~~~~~~~ The methods below are mainly used for inference ~~~~~~~~~ @classmethod + @typechecked def build_model_from_file( cls, - config_file: Union[Path, str] = None, - model_file: Union[Path, str] = None, + config_file: Optional[Union[Path, str]] = None, + model_file: Optional[Union[Path, str]] = None, device: str = "cpu", ) -> Tuple[AbsESPnetModel, argparse.Namespace]: """Build model from the files. @@ -2070,7 +2074,6 @@ def build_model_from_file( device: Device type, "cpu", "cuda", or "cuda:N". 
""" - assert check_argument_types() if config_file is None: assert model_file is not None, ( "The argument 'model_file' must be provided " diff --git a/espnet2/tasks/asr.py b/espnet2/tasks/asr.py index 5a6ef27cdc7..ab617ad7105 100644 --- a/espnet2/tasks/asr.py +++ b/espnet2/tasks/asr.py @@ -4,7 +4,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -403,19 +403,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: try: _ = getattr(args, "preprocessor") @@ -467,7 +467,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -492,12 +491,11 @@ def optional_data_names( retval = tuple(retval) logging.info(f"Optional Data Names: {retval }") - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetASRModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -632,5 +630,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetASRModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/asr_transducer.py b/espnet2/tasks/asr_transducer.py index 3139577732e..0906b45cfd6 100644 --- a/espnet2/tasks/asr_transducer.py +++ b/espnet2/tasks/asr_transducer.py @@ -2,12 +2,11 @@ import argparse import logging -import os from typing import Callable, Collection, Dict, List, Optional, Tuple import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.asr.frontend.default import DefaultFrontend @@ -224,6 +223,7 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], @@ -239,11 +239,11 @@ def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ : Callable collate function. """ - assert check_argument_types() return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: @@ -258,7 +258,6 @@ def build_preprocess_fn( : Callable pre-processing function. 
""" - assert check_argument_types() if args.use_preprocessor: retval = CommonPreprocessor( @@ -287,7 +286,6 @@ def build_preprocess_fn( else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -328,11 +326,11 @@ def optional_data_names( """ retval = () - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetASRTransducerModel: """Required data depending on task mode. @@ -344,7 +342,6 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetASRTransducerModel: model: ASR Transducer model. """ - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: @@ -431,6 +428,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetASRTransducerModel: "Initialization part will be reworked in a short future.", ) - assert check_return_type(model) - return model diff --git a/espnet2/tasks/asvspoof.py b/espnet2/tasks/asvspoof.py index 63ae02aab81..dc55f0cf7b5 100644 --- a/espnet2/tasks/asvspoof.py +++ b/espnet2/tasks/asvspoof.py @@ -1,14 +1,13 @@ import argparse -import logging from typing import Callable, Collection, Dict, List, Optional, Tuple import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder -# TODO1 (checkpoint 2): import conformer class class +# TODO(checkpoint1): import conformer class class from espnet2.asr.encoder.transformer_encoder import TransformerEncoder from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.asr.frontend.default import DefaultFrontend @@ -82,7 +81,7 @@ encoder_choices = ClassChoices( "encoder", classes=dict( - # TODO2 (checkpoint 2): add conformer option in encoder + # TODO(checkpoint2): add conformer option in encoder transformer=TransformerEncoder, ), type_check=AbsEncoder, @@ -184,26 +183,25 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = CommonPreprocessor( train=train, ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -222,12 +220,11 @@ def optional_data_names( cls, train: bool = True, inference: bool = False ) -> Tuple[str, ...]: retval = () - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetASVSpoofModel: - assert check_argument_types() # 1. 
frontend if args.input_size is None: @@ -307,5 +304,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetASVSpoofModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/diar.py b/espnet2/tasks/diar.py index c25e4f94e73..7fed5805a89 100644 --- a/espnet2/tasks/diar.py +++ b/espnet2/tasks/diar.py @@ -3,7 +3,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.encoder.conformer_encoder import ConformerEncoder @@ -173,25 +173,24 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: # FIXME (jiatong): add more argument here retval = CommonPreprocessor(train=train) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -211,12 +210,11 @@ def optional_data_names( ) -> Tuple[str, ...]: # (Note: jiatong): no optional data names for now retval = () - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetDiarizationModel: - assert check_argument_types() # 1. 
frontend if args.input_size is None: @@ -295,5 +293,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetDiarizationModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/enh.py b/espnet2/tasks/enh.py index 543b03031b5..7353fc21c7f 100644 --- a/espnet2/tasks/enh.py +++ b/espnet2/tasks/enh.py @@ -5,7 +5,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.diar.layers.abs_mask import AbsMask from espnet2.diar.layers.multi_mask import MultiMask @@ -394,19 +394,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() return CommonCollateFn(float_pad_value=0.0, int_pad_value=0) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() use_preprocessor = getattr(args, "preprocessor", None) is not None @@ -468,7 +468,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -492,12 +491,11 @@ def optional_data_names( retval += ["noise_ref{}".format(n) for n in range(1, MAX_REFERENCE_NUM + 1)] retval += ["category"] retval = tuple(retval) - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetEnhancementModel: - assert check_argument_types() encoder = encoder_choices.get_class(args.encoder)(**args.encoder_conf) separator = separator_choices.get_class(args.separator)( @@ -554,7 +552,6 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetEnhancementModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model @classmethod diff --git a/espnet2/tasks/enh_s2t.py b/espnet2/tasks/enh_s2t.py index 8a3ade155ae..5aa9b3bfbe6 100644 --- a/espnet2/tasks/enh_s2t.py +++ b/espnet2/tasks/enh_s2t.py @@ -5,7 +5,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.espnet_model import ESPnetASRModel @@ -431,19 +431,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: if "st" in args.subtask_series: retval = MutliTokenizerCommonPreprocessor( @@ -486,7 +486,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -511,12 +510,11 @@ def optional_data_names( retval += ["text_spk{}".format(n) for n in range(1, MAX_REFERENCE_NUM + 
1)] retval += ["src_text"] retval = tuple(retval) - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetEnhS2TModel: - assert check_argument_types() # Build submodels in the order of subtask_series model_conf = args.model_conf.copy() @@ -553,5 +551,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetEnhS2TModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/enh_tse.py b/espnet2/tasks/enh_tse.py index f5069522073..3816464f427 100644 --- a/espnet2/tasks/enh_tse.py +++ b/espnet2/tasks/enh_tse.py @@ -3,7 +3,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.enh.espnet_model_tse import ESPnetExtractionModel from espnet2.enh.extractor.abs_extractor import AbsExtractor @@ -256,19 +256,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() return CommonCollateFn(float_pad_value=0.0, int_pad_value=0) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() kwargs = dict( train_spk2enroll=args.train_spk2enroll, enroll_segment=getattr(args, "enroll_segment", None), @@ -295,7 +295,6 @@ def build_preprocess_fn( ) kwargs.update(args.preprocessor_conf) retval = TSEPreprocessor(train=train, **kwargs) - assert check_return_type(retval) return retval @classmethod @@ -324,12 +323,11 @@ def optional_data_names( ] retval += ["category"] retval = tuple(retval) - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetExtractionModel: - assert check_argument_types() encoder = encoder_choices.get_class(args.encoder)(**args.encoder_conf) extractor = extractor_choices.get_class(args.extractor)( @@ -364,5 +362,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetExtractionModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/gan_svs.py b/espnet2/tasks/gan_svs.py index 0ed1bcf95b8..f83421b5592 100644 --- a/espnet2/tasks/gan_svs.py +++ b/espnet2/tasks/gan_svs.py @@ -10,7 +10,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.gan_svs.abs_gan_svs import AbsGANSVS from espnet2.gan_svs.espnet_model import ESPnetGANSVSModel @@ -154,9 +154,9 @@ class GANSVSTask(AbsTask): trainer = GANTrainer @classmethod + @typechecked def add_task_arguments(cls, parser: argparse.ArgumentParser): # NOTE(kamo): Use '_' instead of '-' to avoid confusion - assert check_argument_types() group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used @@ -236,11 +236,11 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, 
torch.Tensor]], ]: - assert check_argument_types() return CommonCollateFn( float_pad_value=0.0, int_pad_value=0, @@ -248,10 +248,10 @@ def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ ) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array], float], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = SVSPreprocessor( train=train, @@ -266,8 +266,6 @@ def build_preprocess_fn( ) else: retval = None - # FIXME (jiatong): sometimes checking is not working here - # assert check_return_type(retval) return retval # TODO(Yuning): check new names @@ -303,8 +301,8 @@ def optional_data_names( return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetGANSVSModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -413,7 +411,6 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetGANSVSModel: svs=svs, **args.model_conf, ) - assert check_return_type(model) return model @classmethod diff --git a/espnet2/tasks/gan_tts.py b/espnet2/tasks/gan_tts.py index 24dc6c12109..8971ad0e94a 100644 --- a/espnet2/tasks/gan_tts.py +++ b/espnet2/tasks/gan_tts.py @@ -9,7 +9,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.gan_tts.abs_gan_tts import AbsGANTTS from espnet2.gan_tts.espnet_model import ESPnetGANTTSModel @@ -129,9 +129,9 @@ class GANTTSTask(AbsTask): trainer = GANTrainer @classmethod + @typechecked def add_task_arguments(cls, parser: argparse.ArgumentParser): # NOTE(kamo): Use '_' instead of '-' to avoid confusion - assert check_argument_types() group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used @@ -204,11 +204,11 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() return CommonCollateFn( float_pad_value=0.0, int_pad_value=0, @@ -216,10 +216,10 @@ def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ ) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = CommonPreprocessor( train=train, @@ -232,7 +232,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -273,8 +272,8 @@ def optional_data_names( return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetGANTTSModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line[0] + line[1:].rstrip() for line in f] @@ -359,7 +358,6 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetGANTTSModel: tts=tts, **args.model_conf, ) - assert check_return_type(model) return model @classmethod diff --git a/espnet2/tasks/hubert.py b/espnet2/tasks/hubert.py index bd7daebc347..cb1476c92fe 100644 --- a/espnet2/tasks/hubert.py +++ 
b/espnet2/tasks/hubert.py @@ -12,7 +12,7 @@ import humanfriendly import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.encoder.hubert_encoder import ( # noqa: H301 @@ -269,11 +269,11 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # default sampling rate is 16000 fs = args.frontend_conf.get("fs", 16000) @@ -308,10 +308,10 @@ def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ ) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = CommonPreprocessor( train=train, @@ -333,7 +333,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -352,14 +351,13 @@ def optional_data_names( cls, train: bool = True, inference: bool = False ) -> Tuple[str, ...]: retval = () - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model( cls, args: argparse.Namespace ) -> Union[HubertPretrainModel, TorchAudioHubertPretrainModel]: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -436,5 +434,4 @@ def build_model( if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/lm.py b/espnet2/tasks/lm.py index 37f78651302..8cfb9a3d195 100644 --- a/espnet2/tasks/lm.py +++ b/espnet2/tasks/lm.py @@ -4,7 +4,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.lm.abs_model import AbsLM from espnet2.lm.espnet_model import ESPnetLanguageModel @@ -20,8 +20,6 @@ from espnet2.train.collate_fn import CommonCollateFn from espnet2.train.preprocessor import CommonPreprocessor from espnet2.train.trainer import Trainer -from espnet2.utils.get_default_kwargs import get_default_kwargs -from espnet2.utils.nested_dict_action import NestedDictAction from espnet2.utils.types import str2bool, str_or_none lm_choices = ClassChoices( @@ -61,9 +59,9 @@ class LMTask(AbsTask): trainer = Trainer @classmethod + @typechecked def add_task_arguments(cls, parser: argparse.ArgumentParser): # NOTE(kamo): Use '_' instead of '-' to avoid confusion - assert check_argument_types() group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used @@ -137,22 +135,21 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): # e.g. 
--encoder and --encoder_conf class_choices.add_arguments(group) - assert check_return_type(parser) return parser @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() return CommonCollateFn(int_pad_value=0) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = CommonPreprocessor( train=train, @@ -165,7 +162,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -183,10 +179,10 @@ def optional_data_names( return retval @classmethod + @typechecked def build_model( cls, args: argparse.Namespace ) -> Union[ESPnetLanguageModel, ESPnetMultitaskLanguageModel]: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -231,5 +227,4 @@ def build_model( # loading opt parameters model.lm.reload_pretrained_parameters() - assert check_return_type(model) return model diff --git a/espnet2/tasks/mt.py b/espnet2/tasks/mt.py index d0a2371dcb8..f38de84e79f 100644 --- a/espnet2/tasks/mt.py +++ b/espnet2/tasks/mt.py @@ -4,7 +4,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -272,19 +272,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = MutliTokenizerCommonPreprocessor( train=train, @@ -306,7 +306,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -328,12 +327,11 @@ def optional_data_names( retval = () else: retval = () - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetMTModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -451,5 +449,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetMTModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/s2st.py b/espnet2/tasks/s2st.py index 6052a412d97..aedf36af586 100644 --- a/espnet2/tasks/s2st.py +++ b/espnet2/tasks/s2st.py @@ -6,7 +6,7 @@ import numpy as np import torch import yaml -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -270,11 +270,6 @@ 
class S2STTask(STTask): @classmethod def add_task_arguments(cls, parser: argparse.ArgumentParser): group = parser.add_argument_group(description="Task related") - - # NOTE(kamo): add_arguments(..., required=True) can't be used - # to provide --print_config mode. Instead of it, do as - required = parser.get_default("required") - group.add_argument( "--s2st_type", type=str, @@ -489,19 +484,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.src_token_type == "none": args.src_token_type = None if args.unit_token_list is None: @@ -553,7 +548,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -575,12 +569,11 @@ def optional_data_names( retval = ("src_text", "tgt_text") else: retval = ("tgt_speech",) - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetS2STModel: - assert check_argument_types() if args.tgt_token_list is not None: if isinstance(args.tgt_token_list, str): with open(args.tgt_token_list, encoding="utf-8") as f: @@ -835,7 +828,6 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetS2STModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model @classmethod diff --git a/espnet2/tasks/s2t.py b/espnet2/tasks/s2t.py index 6273c3480fe..6d25e5ce878 100644 --- a/espnet2/tasks/s2t.py +++ b/espnet2/tasks/s2t.py @@ -4,7 +4,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -362,19 +362,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: try: _ = getattr(args, "preprocessor") @@ -417,7 +417,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -443,12 +442,11 @@ def optional_data_names( retval = tuple(retval) logging.info(f"Optional Data Names: {retval}") - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetS2TModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = 
[line.rstrip() for line in f] @@ -555,5 +553,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetS2TModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/slu.py b/espnet2/tasks/slu.py index 78fa11eb177..57cc9f3ffcb 100644 --- a/espnet2/tasks/slu.py +++ b/espnet2/tasks/slu.py @@ -3,7 +3,7 @@ from typing import Callable, Dict, Optional, Tuple import numpy as np -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -371,10 +371,10 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = SLUPreprocessor( train=train, @@ -401,7 +401,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -420,12 +419,11 @@ def optional_data_names( cls, train: bool = True, inference: bool = False ) -> Tuple[str, ...]: retval = ("transcript",) - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetSLUModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -583,5 +581,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetSLUModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/spk.py b/espnet2/tasks/spk.py index 76c23b0d63e..035c3a4ef06 100644 --- a/espnet2/tasks/spk.py +++ b/espnet2/tasks/spk.py @@ -3,7 +3,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.encoder.abs_encoder import AbsEncoder from espnet2.asr.frontend.abs_frontend import AbsFrontend @@ -29,7 +29,6 @@ from espnet2.spk.loss.aamsoftmax_subcenter_intertopk import ( ArcMarginProduct_intertopk_subcenter, ) -from espnet2.spk.loss.abs_loss import AbsLoss from espnet2.spk.pooling.abs_pooling import AbsPooling from espnet2.spk.pooling.chn_attn_stat_pooling import ChnAttnStatPooling from espnet2.spk.pooling.mean_pooling import MeanPooling @@ -246,18 +245,18 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() return CommonCollateFn() @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: if train: retval = preprocessor_choices.get_class(args.preprocessor)( @@ -273,7 +272,6 @@ def build_preprocess_fn( else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -296,12 +294,11 @@ def optional_data_names( # trial pair in the validation/inference phase. 
retval = ("speech2", "trial", "spk_labels", "task_tokens") - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetSpeakerModel: - assert check_argument_types() if args.frontend is not None: frontend_class = frontend_choices.get_class(args.frontend) @@ -358,5 +355,4 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetSpeakerModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/st.py b/espnet2/tasks/st.py index 7a3ea8f354e..4396cdcac2b 100644 --- a/espnet2/tasks/st.py +++ b/espnet2/tasks/st.py @@ -4,7 +4,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.asr.ctc import CTC from espnet2.asr.decoder.abs_decoder import AbsDecoder @@ -482,19 +482,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.src_token_type == "none": args.src_token_type = None @@ -529,7 +529,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -551,12 +550,11 @@ def optional_data_names( retval = ("src_text",) else: retval = () - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> Union[ESPnetSTModel]: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -767,5 +765,4 @@ def build_model(cls, args: argparse.Namespace) -> Union[ESPnetSTModel]: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model diff --git a/espnet2/tasks/svs.py b/espnet2/tasks/svs.py index 63594bcc0e6..61d41f95020 100644 --- a/espnet2/tasks/svs.py +++ b/espnet2/tasks/svs.py @@ -8,7 +8,7 @@ import numpy as np import torch import yaml -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.gan_svs.joint import JointScore2Wav from espnet2.gan_svs.vits import VITS @@ -164,9 +164,9 @@ class SVSTask(AbsTask): trainer = Trainer @classmethod + @typechecked def add_task_arguments(cls, parser: argparse.ArgumentParser): # NOTE(kamo): Use '_' instead of '-' to avoid confusion - assert check_argument_types() group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used @@ -258,11 +258,11 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() return CommonCollateFn( float_pad_value=0.0, int_pad_value=0, @@ -270,10 +270,10 @@ 
def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ ) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool - ) -> Optional[Callable[[str, Dict[str, np.array], float], Dict[str, np.ndarray]]]: - assert check_argument_types() + ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: if args.use_preprocessor: retval = SVSPreprocessor( train=train, @@ -288,8 +288,7 @@ def build_preprocess_fn( ) else: retval = None - # FIXME (jiatong): sometimes checking is not working here - # assert check_return_type(retval) + return retval @classmethod @@ -324,8 +323,8 @@ def optional_data_names( return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetSVSModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -439,7 +438,6 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetSVSModel: svs=svs, **args.model_conf, ) - assert check_return_type(model) return model @classmethod diff --git a/espnet2/tasks/tts.py b/espnet2/tasks/tts.py index ffaaaaa3aed..88df6ba4b0d 100644 --- a/espnet2/tasks/tts.py +++ b/espnet2/tasks/tts.py @@ -8,7 +8,7 @@ import numpy as np import torch import yaml -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.gan_tts.jets import JETS from espnet2.gan_tts.joint import JointText2Wav @@ -129,9 +129,9 @@ class TTSTask(AbsTask): trainer = Trainer @classmethod + @typechecked def add_task_arguments(cls, parser: argparse.ArgumentParser): # NOTE(kamo): Use '_' instead of '-' to avoid confusion - assert check_argument_types() group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used @@ -204,11 +204,11 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() return CommonCollateFn( float_pad_value=0.0, int_pad_value=0, @@ -216,10 +216,10 @@ def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ ) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = CommonPreprocessor( train=train, @@ -232,7 +232,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -273,8 +272,8 @@ def optional_data_names( return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetTTSModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line[0] + line[1:].rstrip() for line in f] @@ -365,7 +364,6 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetTTSModel: tts=tts, **args.model_conf, ) - assert check_return_type(model) return model @classmethod diff --git a/espnet2/tasks/uasr.py b/espnet2/tasks/uasr.py index 3af91ed6773..f717723823c 100644 --- a/espnet2/tasks/uasr.py +++ b/espnet2/tasks/uasr.py @@ -4,7 +4,7 @@ import numpy as np import torch -from typeguard import check_argument_types, 
check_return_type +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.asr.frontend.default import DefaultFrontend @@ -219,19 +219,19 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser): class_choices.add_arguments(group) @classmethod + @typechecked def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[ [Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol return CommonCollateFn(float_pad_value=0.0, int_pad_value=args.int_pad_value) @classmethod + @typechecked def build_preprocess_fn( cls, args: argparse.Namespace, train: bool ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() if args.use_preprocessor: retval = CommonPreprocessor( train=train, @@ -244,7 +244,6 @@ def build_preprocess_fn( ) else: retval = None - assert check_return_type(retval) return retval @classmethod @@ -263,12 +262,11 @@ def optional_data_names( cls, train: bool = True, inference: bool = False ) -> Tuple[str, ...]: retval = ("pseudo_labels", "input_cluster_id") - assert check_return_type(retval) return retval @classmethod + @typechecked def build_model(cls, args: argparse.Namespace) -> ESPnetUASRModel: - assert check_argument_types() if isinstance(args.token_list, str): with open(args.token_list, encoding="utf-8") as f: token_list = [line.rstrip() for line in f] @@ -376,7 +374,6 @@ def build_model(cls, args: argparse.Namespace) -> ESPnetUASRModel: if args.init is not None: initialize(model, args.init) - assert check_return_type(model) return model @classmethod diff --git a/espnet2/text/build_tokenizer.py b/espnet2/text/build_tokenizer.py index aedfa64835c..464b132fa9f 100644 --- a/espnet2/text/build_tokenizer.py +++ b/espnet2/text/build_tokenizer.py @@ -1,7 +1,7 @@ from pathlib import Path -from typing import Dict, Iterable, Union +from typing import Dict, Iterable, Optional, Union -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.abs_tokenizer import AbsTokenizer from espnet2.text.char_tokenizer import CharTokenizer @@ -12,24 +12,24 @@ from espnet2.text.word_tokenizer import WordTokenizer +@typechecked def build_tokenizer( token_type: str, - bpemodel: Union[Path, str, Iterable[str]] = None, - non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, + bpemodel: Optional[Union[Path, str, Iterable[str]]] = None, + non_linguistic_symbols: Optional[Union[Path, str, Iterable[str]]] = None, remove_non_linguistic_symbols: bool = False, space_symbol: str = "<space>", - delimiter: str = None, - g2p_type: str = None, - nonsplit_symbol: Iterable[str] = None, + delimiter: Optional[str] = None, + g2p_type: Optional[str] = None, + nonsplit_symbol: Optional[Iterable[str]] = None, # tokenization encode (text2token) args, e.g.
BPE dropout, only applied in training - encode_kwargs: Dict = None, + encode_kwargs: Optional[Dict] = None, # only use for whisper - whisper_language: str = None, - whisper_task: str = None, + whisper_language: Optional[str] = None, + whisper_task: Optional[str] = None, sot_asr: bool = False, ) -> AbsTokenizer: """A helper function to instantiate Tokenizer""" - assert check_argument_types() if token_type == "bpe": if bpemodel is None: raise ValueError('bpemodel is required if token_type = "bpe"') diff --git a/espnet2/text/char_tokenizer.py b/espnet2/text/char_tokenizer.py index 8f41637a07a..dccbf35f890 100644 --- a/espnet2/text/char_tokenizer.py +++ b/espnet2/text/char_tokenizer.py @@ -1,21 +1,21 @@ import warnings from pathlib import Path -from typing import Iterable, List, Union +from typing import Iterable, List, Optional, Union -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.abs_tokenizer import AbsTokenizer class CharTokenizer(AbsTokenizer): + @typechecked def __init__( self, - non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, + non_linguistic_symbols: Optional[Union[Path, str, Iterable[str]]] = None, space_symbol: str = "<space>", remove_non_linguistic_symbols: bool = False, - nonsplit_symbols: Iterable[str] = None, + nonsplit_symbols: Optional[Iterable[str]] = None, ): - assert check_argument_types() self.space_symbol = space_symbol if non_linguistic_symbols is None: self.non_linguistic_symbols = set() diff --git a/espnet2/text/cleaner.py b/espnet2/text/cleaner.py index bc47f3228d2..16df99493b4 100644 --- a/espnet2/text/cleaner.py +++ b/espnet2/text/cleaner.py @@ -1,8 +1,8 @@ -from typing import Collection +from typing import Collection, Optional import tacotron_cleaner.cleaners from jaconv import jaconv -from typeguard import check_argument_types +from typeguard import typechecked try: from vietnamese_cleaner import vietnamese_cleaners @@ -27,8 +27,8 @@ class TextCleaner: """ - def __init__(self, cleaner_types: Collection[str] = None): - assert check_argument_types() + @typechecked + def __init__(self, cleaner_types: Optional[Collection[str]] = None): if cleaner_types is None: self.cleaner_types = [] diff --git a/espnet2/text/hugging_face_token_id_converter.py b/espnet2/text/hugging_face_token_id_converter.py index fd221ef680b..ca10ca1d08a 100644 --- a/espnet2/text/hugging_face_token_id_converter.py +++ b/espnet2/text/hugging_face_token_id_converter.py @@ -1,7 +1,7 @@ from typing import Iterable, List, Union import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked try: from transformers import AutoTokenizer @@ -12,11 +12,11 @@ class HuggingFaceTokenIDConverter: + @typechecked def __init__( self, model_name_or_path: str, ): - assert check_argument_types() if not is_transformers_available: raise ImportError( diff --git a/espnet2/text/hugging_face_tokenizer.py b/espnet2/text/hugging_face_tokenizer.py index 29376306dfa..a6967b162a8 100644 --- a/espnet2/text/hugging_face_tokenizer.py +++ b/espnet2/text/hugging_face_tokenizer.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import Iterable, List, Union -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.abs_tokenizer import AbsTokenizer @@ -14,8 +14,8 @@ class HuggingFaceTokenizer(AbsTokenizer): + @typechecked def __init__(self, model: Union[Path, str]): - assert check_argument_types() if not is_transformers_available: raise ImportError( diff --git a/espnet2/text/phoneme_tokenizer.py
b/espnet2/text/phoneme_tokenizer.py index efff69fe47c..d603d0586fb 100644 --- a/espnet2/text/phoneme_tokenizer.py +++ b/espnet2/text/phoneme_tokenizer.py @@ -7,7 +7,7 @@ import g2p_en import jamo from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.abs_tokenizer import AbsTokenizer @@ -436,14 +436,14 @@ def __call__(self, text) -> List[str]: class PhonemeTokenizer(AbsTokenizer): + @typechecked def __init__( self, g2p_type: Union[None, str], - non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, + non_linguistic_symbols: Union[None, Path, str, Iterable[str]] = None, space_symbol: str = "<space>", remove_non_linguistic_symbols: bool = False, ): - assert check_argument_types() if g2p_type is None: self.g2p = split_by_space elif g2p_type == "g2p_en": diff --git a/espnet2/text/sentencepiece_tokenizer.py b/espnet2/text/sentencepiece_tokenizer.py index 891534ccd35..4872643bea8 100644 --- a/espnet2/text/sentencepiece_tokenizer.py +++ b/espnet2/text/sentencepiece_tokenizer.py @@ -2,14 +2,14 @@ from typing import Dict, Iterable, List, Union import sentencepiece as spm -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.abs_tokenizer import AbsTokenizer class SentencepiecesTokenizer(AbsTokenizer): + @typechecked def __init__(self, model: Union[Path, str], encode_kwargs: Dict = dict()): - assert check_argument_types() self.model = str(model) # NOTE(kamo): # Don't build SentencePieceProcessor in __init__() diff --git a/espnet2/text/token_id_converter.py b/espnet2/text/token_id_converter.py index 940b80aeace..07e14a3ad16 100644 --- a/espnet2/text/token_id_converter.py +++ b/espnet2/text/token_id_converter.py @@ -2,16 +2,16 @@ from typing import Dict, Iterable, List, Union import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked class TokenIDConverter: + @typechecked def __init__( self, token_list: Union[Path, str, Iterable[str]], unk_symbol: str = "<unk>", ): - assert check_argument_types() if isinstance(token_list, (Path, str)): token_list = Path(token_list) diff --git a/espnet2/text/whisper_token_id_converter.py b/espnet2/text/whisper_token_id_converter.py index 6be4ee2dee1..f86e773c39b 100644 --- a/espnet2/text/whisper_token_id_converter.py +++ b/espnet2/text/whisper_token_id_converter.py @@ -1,9 +1,9 @@ import copy import os -from typing import Iterable, List, Union +from typing import Iterable, List, Optional, Union import numpy as np -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.whisper_tokenizer import LANGUAGES_CODE_MAPPING @@ -18,16 +18,16 @@ class OpenAIWhisperTokenIDConverter: + @typechecked def __init__( self, model_type: str, - language: str = "en", + language: Optional[str] = "en", task: str = "transcribe", - added_tokens_txt: str = None, + added_tokens_txt: Optional[str] = None, sot: bool = False, speaker_change_symbol: str = "<sc>", ): - assert check_argument_types() try: import whisper.tokenizer diff --git a/espnet2/text/whisper_tokenizer.py b/espnet2/text/whisper_tokenizer.py index d7ffdca684f..9b3e9635147 100644 --- a/espnet2/text/whisper_tokenizer.py +++ b/espnet2/text/whisper_tokenizer.py @@ -1,8 +1,8 @@ import copy import os -from typing import Iterable, List +from typing import Iterable, List, Optional -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.abs_tokenizer import AbsTokenizer @@ -36,6 +36,7 @@ class
OpenAIWhisperTokenizer(AbsTokenizer): + @typechecked def __init__( self, model_type: str, @@ -43,9 +44,8 @@ def __init__( task: str = "transcribe", sot: bool = False, speaker_change_symbol: str = "<sc>", - added_tokens_txt: str = None, + added_tokens_txt: Optional[str] = None, ): - assert check_argument_types() try: import whisper.tokenizer diff --git a/espnet2/text/word_tokenizer.py b/espnet2/text/word_tokenizer.py index 30873ef7297..945e821af63 100644 --- a/espnet2/text/word_tokenizer.py +++ b/espnet2/text/word_tokenizer.py @@ -1,20 +1,20 @@ import warnings from pathlib import Path -from typing import Iterable, List, Union +from typing import Iterable, List, Optional, Union -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.text.abs_tokenizer import AbsTokenizer class WordTokenizer(AbsTokenizer): + @typechecked def __init__( self, - delimiter: str = None, - non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, + delimiter: Optional[str] = None, + non_linguistic_symbols: Union[Path, str, Iterable[str], None] = None, remove_non_linguistic_symbols: bool = False, ): - assert check_argument_types() self.delimiter = delimiter if not remove_non_linguistic_symbols and non_linguistic_symbols is not None: diff --git a/espnet2/torch_utils/forward_adaptor.py b/espnet2/torch_utils/forward_adaptor.py index 114af785113..c429af7c73c 100644 --- a/espnet2/torch_utils/forward_adaptor.py +++ b/espnet2/torch_utils/forward_adaptor.py @@ -1,5 +1,5 @@ import torch -from typeguard import check_argument_types +from typeguard import typechecked class ForwardAdaptor(torch.nn.Module): @@ -20,8 +20,8 @@ class ForwardAdaptor(torch.nn.Module): >>> model(x) """ + @typechecked def __init__(self, module: torch.nn.Module, name: str): - assert check_argument_types() super().__init__() self.module = module self.name = name diff --git a/espnet2/torch_utils/initialize.py b/espnet2/torch_utils/initialize.py index e271132f364..938fce549f8 100644 --- a/espnet2/torch_utils/initialize.py +++ b/espnet2/torch_utils/initialize.py @@ -6,9 +6,10 @@ import math import torch -from typeguard import check_argument_types +from typeguard import typechecked +@typechecked def initialize(model: torch.nn.Module, init: str): """Initialize weights of a neural network module. @@ -21,7 +22,6 @@ def initialize(model: torch.nn.Module, init: str): model: Target. init: Method of initialization. """ - assert check_argument_types() if init == "chainer": # 1.
lecun_normal_init_parameters diff --git a/espnet2/train/class_choices.py b/espnet2/train/class_choices.py index 412b33f8453..27576c76ef3 100644 --- a/espnet2/train/class_choices.py +++ b/espnet2/train/class_choices.py @@ -1,6 +1,6 @@ -from typing import Mapping, Optional, Tuple +from typing import Mapping, Optional, Tuple, Type -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.utils.nested_dict_action import NestedDictAction from espnet2.utils.types import str_or_none @@ -29,15 +29,15 @@ class ClassChoices: """ + @typechecked def __init__( self, name: str, - classes: Mapping[str, type], - type_check: type = None, - default: str = None, + classes: Mapping[str, Type], + type_check: Optional[Type] = None, + default: Optional[str] = None, optional: bool = False, ): - assert check_argument_types() self.name = name self.base_type = type_check self.classes = {k.lower(): v for k, v in classes.items()} @@ -60,13 +60,12 @@ def choices(self) -> Tuple[Optional[str], ...]: else: return retval + @typechecked def get_class(self, name: Optional[str]) -> Optional[type]: - assert check_argument_types() if name is None or (self.optional and name.lower() in ("none", "null", "nil")): retval = None elif name.lower() in self.classes: class_obj = self.classes[name.lower()] - assert check_return_type(class_obj) retval = class_obj else: raise ValueError( diff --git a/espnet2/train/collate_fn.py b/espnet2/train/collate_fn.py index 4cf4d308027..503d5902361 100644 --- a/espnet2/train/collate_fn.py +++ b/espnet2/train/collate_fn.py @@ -3,7 +3,7 @@ import numpy as np import torch -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet.nets.pytorch_backend.nets_utils import pad_list @@ -11,13 +11,13 @@ class CommonCollateFn: """Functor class of common_collate_fn()""" + @typechecked def __init__( self, float_pad_value: Union[float, int] = 0.0, int_pad_value: int = -32768, not_sequence: Collection[str] = (), ): - assert check_argument_types() self.float_pad_value = float_pad_value self.int_pad_value = int_pad_value self.not_sequence = set(not_sequence) @@ -42,6 +42,7 @@ def __call__( class HuBERTCollateFn(CommonCollateFn): """Functor class of common_collate_fn()""" + @typechecked def __init__( self, float_pad_value: Union[float, int] = 0.0, @@ -55,7 +56,6 @@ def __init__( window_shift: float = 20, sample_rate: float = 16, ): - assert check_argument_types() super().__init__( float_pad_value=float_pad_value, int_pad_value=int_pad_value, @@ -179,6 +179,7 @@ def _crop_audio_label( return waveform, label, length +@typechecked def common_collate_fn( data: Collection[Tuple[str, Dict[str, np.ndarray]]], float_pad_value: Union[float, int] = 0.0, @@ -202,7 +203,6 @@ def common_collate_fn( that of the dataset as they are.
""" - assert check_argument_types() uttids = [u for u, _ in data] data = [d for _, d in data] @@ -236,5 +236,4 @@ def common_collate_fn( output[key + "_lengths"] = lens output = (uttids, output) - assert check_return_type(output) return output diff --git a/espnet2/train/dataset.py b/espnet2/train/dataset.py index 200ccf4c4e1..4b9a973c3f7 100644 --- a/espnet2/train/dataset.py +++ b/espnet2/train/dataset.py @@ -5,7 +5,7 @@ import numbers import re from abc import ABC, abstractmethod -from typing import Any, Callable, Collection, Dict, Mapping, Tuple, Union +from typing import Any, Callable, Collection, Dict, Mapping, Optional, Tuple, Union import h5py import humanfriendly @@ -13,7 +13,7 @@ import numpy as np import torch from torch.utils.data.dataset import Dataset -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.fileio.multi_sound_scp import MultiSoundScpReader from espnet2.fileio.npy_scp import NpyScpReader @@ -34,8 +34,8 @@ class AdapterForSoundScpReader(collections.abc.Mapping): + @typechecked def __init__(self, loader, dtype=None, allow_multi_rates=False): - assert check_argument_types() self.loader = loader self.dtype = dtype self.rate = None @@ -109,8 +109,8 @@ def __getitem__(self, key) -> np.ndarray: class AdapterForSingingScoreScpReader(collections.abc.Mapping): + @typechecked def __init__(self, loader): - assert check_argument_types() self.loader = loader def keys(self): @@ -135,8 +135,8 @@ def __getitem__(self, key: str) -> np.ndarray: class AdapterForLabelScpReader(collections.abc.Mapping): + @typechecked def __init__(self, loader): - assert check_argument_types() self.loader = loader def keys(self): @@ -428,11 +428,12 @@ class ESPnetDataset(AbsDataset): {'input': per_utt_array, 'output': per_utt_array} """ + @typechecked def __init__( self, path_name_type_list: Collection[Tuple[str, str, str]], - preprocess: Callable[ - [str, Dict[str, np.ndarray]], Dict[str, np.ndarray] + preprocess: Optional[ + Callable[[str, Dict[str, np.ndarray]], Dict[str, np.ndarray]] ] = None, float_dtype: str = "float32", int_dtype: str = "long", @@ -440,7 +441,6 @@ def __init__( max_cache_fd: int = 0, allow_multi_rates: bool = False, ): - assert check_argument_types() if len(path_name_type_list) == 0: raise ValueError( '1 or more elements are required for "path_name_type_list"' @@ -535,8 +535,8 @@ def __repr__(self): _mes += f"\n preprocess: {self.preprocess})" return _mes + @typechecked def __getitem__(self, uid: Union[str, int]) -> Tuple[str, Dict[str, np.ndarray]]: - assert check_argument_types() # Change integer-id to string-id if isinstance(uid, int): @@ -604,5 +604,4 @@ def __getitem__(self, uid: Union[str, int]) -> Tuple[str, Dict[str, np.ndarray]] self.cache[uid] = data retval = uid, data - assert check_return_type(retval) return retval diff --git a/espnet2/train/distributed_utils.py b/espnet2/train/distributed_utils.py index 3f2c56c69f2..8036d691979 100644 --- a/espnet2/train/distributed_utils.py +++ b/espnet2/train/distributed_utils.py @@ -197,7 +197,7 @@ def free_port(): return sock.getsockname()[1] -def get_rank(prior=None, launcher: str = None) -> Optional[int]: +def get_rank(prior=None, launcher: Optional[str] = None) -> Optional[int]: if prior is None: if launcher == "slurm": if not is_in_slurm_step(): @@ -217,7 +217,7 @@ def get_rank(prior=None, launcher: str = None) -> Optional[int]: return _int_or_none(os.environ.get("RANK")) -def get_world_size(prior=None, launcher: str = None) -> int: +def get_world_size(prior=None, 
launcher: Optional[str] = None) -> int: if prior is None: if launcher == "slurm": if not is_in_slurm_step(): @@ -237,7 +237,7 @@ def get_world_size(prior=None, launcher: str = None) -> int: return int(os.environ.get("WORLD_SIZE", "1")) -def get_local_rank(prior=None, launcher: str = None) -> Optional[int]: +def get_local_rank(prior=None, launcher: Optional[str] = None) -> Optional[int]: # LOCAL_RANK is same as GPU device id if prior is None: @@ -280,7 +280,7 @@ def get_local_rank(prior=None, launcher: str = None) -> Optional[int]: return None -def get_master_addr(prior=None, launcher: str = None) -> Optional[str]: +def get_master_addr(prior=None, launcher: Optional[str] = None) -> Optional[str]: if prior is None: if launcher == "slurm": if not is_in_slurm_step(): @@ -303,7 +303,7 @@ def get_master_port(prior=None) -> Optional[int]: return _int_or_none(os.environ.get("MASTER_PORT")) -def get_node_rank(prior=None, launcher: str = None) -> Optional[int]: +def get_node_rank(prior=None, launcher: Optional[str] = None) -> Optional[int]: """Get Node Rank. Use for "multiprocessing distributed" mode. @@ -336,7 +336,7 @@ def get_node_rank(prior=None, launcher: str = None) -> Optional[int]: return _int_or_none(os.environ.get("RANK")) -def get_num_nodes(prior=None, launcher: str = None) -> Optional[int]: +def get_num_nodes(prior=None, launcher: Optional[str] = None) -> Optional[int]: """Get the number of nodes. Use for "multiprocessing distributed" mode. diff --git a/espnet2/train/gan_trainer.py b/espnet2/train/gan_trainer.py index 0e013e24e2e..221780dc3e5 100644 --- a/espnet2/train/gan_trainer.py +++ b/espnet2/train/gan_trainer.py @@ -12,7 +12,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.schedulers.abs_scheduler import AbsBatchStepScheduler, AbsScheduler from espnet2.torch_utils.device_funcs import to_device @@ -58,9 +58,9 @@ class GANTrainer(Trainer): """ @classmethod + @typechecked def build_options(cls, args: argparse.Namespace) -> TrainerOptions: """Build options consumed by train(), eval(), and plot_attention().""" - assert check_argument_types() return build_dataclass(GANTrainerOptions, args) @classmethod @@ -74,6 +74,7 @@ def add_arguments(cls, parser: argparse.ArgumentParser): ) @classmethod + @typechecked def train_one_epoch( cls, model: torch.nn.Module, @@ -87,7 +88,6 @@ def train_one_epoch( distributed_option: DistributedOption, ) -> bool: """Train one epoch.""" - assert check_argument_types() grad_noise = options.grad_noise accum_grad = options.accum_grad @@ -307,6 +307,7 @@ def train_one_epoch( @classmethod @torch.no_grad() + @typechecked def validate_one_epoch( cls, model: torch.nn.Module, @@ -316,7 +317,6 @@ def validate_one_epoch( distributed_option: DistributedOption, ) -> None: """Validate one epoch.""" - assert check_argument_types() ngpu = options.ngpu no_forward_run = options.no_forward_run distributed = distributed_option.distributed diff --git a/espnet2/train/iterable_dataset.py b/espnet2/train/iterable_dataset.py index 670583952d4..c1bb6a482b1 100644 --- a/espnet2/train/iterable_dataset.py +++ b/espnet2/train/iterable_dataset.py @@ -3,14 +3,14 @@ import copy from io import StringIO from pathlib import Path -from typing import Callable, Collection, Dict, Iterator, Tuple, Union +from typing import Callable, Collection, Dict, Iterator, Optional, Tuple, Union import kaldiio import numpy as np import soundfile import torch from torch.utils.data.dataset import IterableDataset 
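The same mechanical substitution runs through every hunk of this patch: the typeguard 2.x idiom of asserting check_argument_types() inside the function body becomes the typeguard 4.x @typechecked decorator, and implicit-Optional defaults such as x: str = None, which typeguard 4.x no longer accepts, are spelled out as Optional[...]. A minimal sketch of the before/after behavior (the function name is illustrative, not part of the patch):

```python
from typing import Optional

from typeguard import typechecked


# typeguard 2.x style, removed throughout this patch:
#
#     def resolve(x: str = None) -> str:
#         assert check_argument_types()
#         ...
#
# typeguard 4.x style: the decorator wraps the callable and validates the
# arguments (and the annotated return value) on every call.
@typechecked
def resolve(x: Optional[str] = None) -> str:
    return x if x is not None else "default"


resolve("abc")  # ok
resolve(123)    # raises typeguard.TypeCheckError at call time
```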
-from typeguard import check_argument_types +from typeguard import typechecked from espnet2.train.dataset import ESPnetDataset @@ -76,17 +76,17 @@ class IterableESPnetDataset(IterableDataset): {'input': per_utt_array, 'output': per_utt_array} """ + @typechecked def __init__( self, path_name_type_list: Collection[Tuple[str, str, str]], - preprocess: Callable[ - [str, Dict[str, np.ndarray]], Dict[str, np.ndarray] + preprocess: Optional[ + Callable[[str, Dict[str, np.ndarray]], Dict[str, np.ndarray]] ] = None, float_dtype: str = "float32", int_dtype: str = "long", - key_file: str = None, + key_file: Optional[str] = None, ): - assert check_argument_types() if len(path_name_type_list) == 0: raise ValueError( '1 or more elements are required for "path_name_type_list"' diff --git a/espnet2/train/preprocessor.py b/espnet2/train/preprocessor.py index 36bbbc93b8e..4c19814f796 100644 --- a/espnet2/train/preprocessor.py +++ b/espnet2/train/preprocessor.py @@ -10,7 +10,7 @@ import numpy as np import scipy.signal import soundfile -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked from espnet2.layers.augmentation import DataAugmentation from espnet2.text.build_tokenizer import build_tokenizer @@ -139,18 +139,18 @@ def __init__( train: bool, use_lang_prompt: bool = False, use_nlp_prompt: bool = False, - token_type: str = None, + token_type: Optional[str] = None, token_list: Union[Path, str, Iterable[str]] = None, bpemodel: Union[Path, str, Iterable[str]] = None, text_cleaner: Collection[str] = None, - g2p_type: str = None, + g2p_type: Optional[str] = None, unk_symbol: str = "<unk>", space_symbol: str = "<space>", non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, - delimiter: str = None, - rir_scp: str = None, + delimiter: Optional[str] = None, + rir_scp: Optional[str] = None, rir_apply_prob: float = 1.0, - noise_scp: str = None, + noise_scp: Optional[str] = None, noise_apply_prob: float = 1.0, noise_db_range: str = "3_10", short_noise_thres: float = 0.5, @@ -164,8 +164,8 @@ def __init__( data_aug_num: List[int] = [1, 1], data_aug_prob: float = 0.0, # only use for whisper - whisper_language: str = None, - whisper_task: str = None, + whisper_language: Optional[str] = None, + whisper_task: Optional[str] = None, ): super().__init__(train) self.train = train @@ -369,10 +369,10 @@ def _add_noise( speech = speech + scale * noise return speech, noise + @typechecked def _speech_process( self, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, Union[str, np.ndarray]]: - assert check_argument_types() if self.speech_name in data: if self.train and (self.rirs is not None or self.noises is not None): speech = data[self.speech_name] @@ -418,7 +418,6 @@ def _speech_process( speech = data[self.speech_name] ma = np.max(np.abs(speech)) data[self.speech_name] = speech * self.speech_volume_normalize / ma - assert check_return_type(data) return data def _text_process( @@ -486,13 +485,12 @@ def _text_process( tokens = self.tokenizer.text2tokens(text) text_ints = self.token_id_converter.tokens2ids(tokens) data[name] = np.array(text_ints, dtype=np.int64) - assert check_return_type(data) return data + @typechecked def __call__( self, uid: str, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, np.ndarray]: - assert check_argument_types() data = self._speech_process(data) data = self._text_process(data) @@ -503,19 +501,19 @@ class SLUPreprocessor(CommonPreprocessor): def __init__( self, train: bool, - token_type: str = None, + token_type: Optional[str] = None, token_list:
Union[Path, str, Iterable[str]] = None, transcript_token_list: Union[Path, str, Iterable[str]] = None, bpemodel: Union[Path, str, Iterable[str]] = None, text_cleaner: Collection[str] = None, - g2p_type: str = None, + g2p_type: Optional[str] = None, unk_symbol: str = "<unk>", space_symbol: str = "<space>", non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, - delimiter: str = None, - rir_scp: str = None, + delimiter: Optional[str] = None, + rir_scp: Optional[str] = None, rir_apply_prob: float = 1.0, - noise_scp: str = None, + noise_scp: Optional[str] = None, noise_apply_prob: float = 1.0, noise_db_range: str = "3_10", short_noise_thres: float = 0.5, @@ -585,7 +583,6 @@ def _text_process( tokens = self.transcript_tokenizer.text2tokens(text) text_ints = self.transcript_token_id_converter.tokens2ids(tokens) data["transcript"] = np.array(text_ints, dtype=np.int64) - assert check_return_type(data) return data @@ -595,18 +592,18 @@ def __init__( self, train: bool, use_lang_prompt: bool = False, use_nlp_prompt: bool = False, - token_type: str = None, + token_type: Optional[str] = None, token_list: Union[Path, str, Iterable[str]] = None, bpemodel: Union[Path, str, Iterable[str]] = None, text_cleaner: Collection[str] = None, - g2p_type: str = None, + g2p_type: Optional[str] = None, unk_symbol: str = "<unk>", space_symbol: str = "<space>", non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, - delimiter: str = None, - rir_scp: str = None, + delimiter: Optional[str] = None, + rir_scp: Optional[str] = None, rir_apply_prob: float = 1.0, - noise_scp: str = None, + noise_scp: Optional[str] = None, noise_apply_prob: float = 1.0, noise_db_range: str = "3_10", short_noise_thres: float = 0.5, @@ -620,8 +617,8 @@ def __init__( data_aug_num: List[int] = [1, 1], data_aug_prob: float = 0.0, # only use for whisper - whisper_language: str = None, - whisper_task: str = None, + whisper_language: Optional[str] = None, + whisper_task: Optional[str] = None, ): super().__init__( train=train, @@ -700,13 +697,12 @@ def _text_process( tokens = self.tokenizer.text2tokens(text) text_ints = self.token_id_converter.tokens2ids(tokens) data[name] = np.array(text_ints, dtype=np.int64) - assert check_return_type(data) return data + @typechecked def __call__( self, uid: str, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, np.ndarray]: - assert check_argument_types() data = self._speech_process(data) data = self._text_process(data) @@ -725,10 +721,10 @@ def __init__( unk_symbol: str = "<unk>", space_symbol: str = "<space>", non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, - delimiter: str = None, - rir_scp: str = None, + delimiter: Optional[str] = None, + rir_scp: Optional[str] = None, rir_apply_prob: float = 1.0, - noise_scp: str = None, + noise_scp: Optional[str] = None, noise_apply_prob: float = 1.0, noise_db_range: str = "3_10", short_noise_thres: float = 0.5, @@ -742,7 +738,7 @@ def __init__( data_aug_prob: float = 0.0, # only use for whisper whisper_language: List[str] = None, - whisper_task: str = None, + whisper_task: Optional[str] = None, ): # TODO(jiatong): sync with Kamo and Jing on interface for preprocessor super().__init__( @@ -844,7 +840,6 @@ def _text_process( tokens = self.tokenizer[i].text2tokens(text) text_ints = self.token_id_converter[i].tokens2ids(tokens) data[text_name] = np.array(text_ints, dtype=np.int64) - assert check_return_type(data) return data @@ -852,13 +847,13 @@ class DynamicMixingPreprocessor(AbsPreprocessor): def __init__( self, train: bool, - source_scp: str = None, + source_scp: Optional[str]
= None, ref_num: int = 2, dynamic_mixing_gain_db: float = 0.0, speech_name: str = "speech_mix", speech_ref_name_prefix: str = "speech_ref", - mixture_source_name: str = None, - utt2spk: str = None, + mixture_source_name: Optional[str] = None, + utt2spk: Optional[str] = None, categories: Optional[List] = None, ): super().__init__(train) @@ -1003,7 +998,6 @@ def __call__( if self.train: data = self._mix_speech_(uid, data) - assert check_return_type(data) return data @@ -1013,9 +1007,9 @@ class EnhPreprocessor(CommonPreprocessor): def __init__( self, train: bool, - rir_scp: str = None, + rir_scp: Optional[str] = None, rir_apply_prob: float = 1.0, - noise_scp: str = None, + noise_scp: Optional[str] = None, noise_apply_prob: float = 1.0, noise_db_range: str = "3_10", short_noise_thres: float = 0.5, @@ -1229,13 +1223,12 @@ def _random_crop_range( break return start, start + tgt_length + @typechecked def _speech_process( self, uid: str, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, Union[str, np.ndarray]]: - assert check_argument_types() if self.speech_name not in data: - assert check_return_type(data) return data num_spk = self.num_spk @@ -1452,13 +1445,12 @@ def _speech_process( assert data[k].shape == speech_mix.shape data[k] = data[k][..., chs] - assert check_return_type(data) return data + @typechecked def __call__( self, uid: str, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, np.ndarray]: - assert check_argument_types() data = self._speech_process(uid, data) data = self._text_process(data) @@ -1471,15 +1463,15 @@ class SVSPreprocessor(AbsPreprocessor): def __init__( self, train: bool, - token_type: str = None, + token_type: Optional[str] = None, token_list: Union[Path, str, Iterable[str]] = None, bpemodel: Union[Path, str, Iterable[str]] = None, text_cleaner: Collection[str] = None, - g2p_type: str = None, + g2p_type: Optional[str] = None, unk_symbol: str = "<unk>", space_symbol: str = "<space>", non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, - delimiter: str = None, + delimiter: Optional[str] = None, singing_volume_normalize: float = None, singing_name: str = "singing", text_name: str = "text", @@ -1527,12 +1519,12 @@ def __init__( self.tokenizer = None self.token_id_converter = None + @typechecked def __call__( self, uid: str, data: Dict[str, Union[str, np.ndarray, tuple]], ) -> Dict[str, np.ndarray]: - assert check_argument_types() if self.singing_name in data: if self.singing_volume_normalize is not None: @@ -1637,14 +1629,14 @@ class TSEPreprocessor(EnhPreprocessor): def __init__( self, train: bool, - train_spk2enroll: str = None, + train_spk2enroll: Optional[str] = None, enroll_segment: int = None, load_spk_embedding: bool = False, load_all_speakers: bool = False, # inherited from EnhPreprocessor - rir_scp: str = None, + rir_scp: Optional[str] = None, rir_apply_prob: float = 1.0, - noise_scp: str = None, + noise_scp: Optional[str] = None, noise_apply_prob: float = 1.0, noise_db_range: str = "3_10", short_noise_thres: float = 0.5, @@ -1753,10 +1745,10 @@ def _read_audio_segment(self, path, seg_len=None): raise RuntimeError(f"Something wrong: {path}") return audio[:, 0] + @typechecked def _speech_process( self, uid: str, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, Union[str, np.ndarray]]: - assert check_argument_types() ref_names = [k for k in data.keys() if re.match(r"speech_ref\d+", k)] num_spk = len(ref_names) @@ -1846,13 +1838,12 @@ def _speech_process( else: data[name] = soundfile.read(data[name])[0] - assert check_return_type(data) return data +
@typechecked def __call__( self, uid: str, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, np.ndarray]: - assert check_argument_types() data = super()._speech_process(uid, data) data = self._speech_process(uid, data) @@ -1886,10 +1877,10 @@ def __init__( self, train: bool, target_duration: float, # in seconds - spk2utt: str = None, + spk2utt: Optional[str] = None, sample_rate: int = 16000, num_eval: int = 10, - rir_scp: str = None, + rir_scp: Optional[str] = None, rir_apply_prob: float = 1.0, noise_info: List[ Tuple[float, str, Tuple[int, int], Tuple[float, float]] ] = [], @@ -2123,10 +2114,10 @@ def _text_process( return data + @typechecked def __call__( self, uid: str, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, np.ndarray]: - assert check_argument_types() data = self._text_process(data) data = self._speech_process(data) @@ -2138,18 +2129,18 @@ class S2TPreprocessor(CommonPreprocessor): def __init__( self, train: bool, - token_type: str = None, + token_type: Optional[str] = None, token_list: Union[Path, str, Iterable[str]] = None, bpemodel: Union[Path, str, Iterable[str]] = None, text_cleaner: Collection[str] = None, - g2p_type: str = None, + g2p_type: Optional[str] = None, unk_symbol: str = "<unk>", space_symbol: str = "<space>", non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, - delimiter: str = None, - rir_scp: str = None, + delimiter: Optional[str] = None, + rir_scp: Optional[str] = None, rir_apply_prob: float = 1.0, - noise_scp: str = None, + noise_scp: Optional[str] = None, noise_apply_prob: float = 1.0, noise_db_range: str = "3_10", short_noise_thres: float = 0.5, @@ -2205,10 +2196,10 @@ def __init__( self.first_time = self.token_id_converter.token2id[first_time_symbol] self.last_time = self.token_id_converter.token2id[last_time_symbol] + @typechecked def _pad_or_trim_speech( self, data: Dict[str, Union[str, np.ndarray]] ) -> Tuple[Dict[str, Union[str, np.ndarray]], int]: - assert check_argument_types() init_pad = 0 if self.speech_name in data: @@ -2238,13 +2229,12 @@ def _pad_or_trim_speech( data[self.speech_name] = speech.T # convert back to time first - assert check_return_type((data, init_pad)) return data, init_pad + @typechecked def _text_process( self, data: Dict[str, Union[str, np.ndarray]], time_shift: int ) -> Dict[str, np.ndarray]: - assert check_argument_types() text_names = [self.text_name, self.text_prev_name, self.text_ctc_name] if self.tokenizer is not None: @@ -2295,13 +2285,12 @@ def _text_process( data[name] = text_ints - assert check_return_type(data) return data + @typechecked def __call__( self, uid: str, data: Dict[str, Union[str, np.ndarray]] ) -> Dict[str, np.ndarray]: - assert check_argument_types() data = self._speech_process(data) data, init_pad = self._pad_or_trim_speech(data) diff --git a/espnet2/train/reporter.py b/espnet2/train/reporter.py index 8587bb53146..1bf6333413f 100644 --- a/espnet2/train/reporter.py +++ b/espnet2/train/reporter.py @@ -14,7 +14,7 @@ import numpy as np import torch from packaging.version import parse as V -from typeguard import check_argument_types, check_return_type +from typeguard import typechecked Num = Union[float, int, complex, torch.Tensor, np.ndarray] @@ -22,8 +22,8 @@ _reserved = {"time", "total_count"} -def to_reported_value(v: Num, weight: Num = None) -> "ReportedValue": - assert check_argument_types() +@typechecked +def to_reported_value(v: Num, weight: Optional[Num] = None) -> "ReportedValue": if isinstance(v, (torch.Tensor, np.ndarray)): if np.prod(v.shape) != 1: raise ValueError(f"v must be 0 or 1
dimension: {len(v.shape)}") @@ -38,12 +38,11 @@ def to_reported_value(v: Num, weight: Num = None) -> "ReportedValue": retval = WeightedAverage(v, weight) else: retval = Average(v) - assert check_return_type(retval) return retval +@typechecked def aggregate(values: Sequence["ReportedValue"]) -> Num: - assert check_argument_types() for v in values: if not isinstance(v, type(values[0])): @@ -82,7 +81,6 @@ def aggregate(values: Sequence["ReportedValue"]) -> Num: else: raise NotImplementedError(f"type={type(values[0])}") - assert check_return_type(retval) return retval @@ -117,8 +115,8 @@ class SubReporter: See the docstring of Reporter for the usage. """ + @typechecked def __init__(self, key: str, epoch: int, total_count: int): - assert check_argument_types() self.key = key self.epoch = epoch self.start_time = time.perf_counter() @@ -151,12 +149,12 @@ def next(self): self._seen_keys_in_the_step = set() + @typechecked def register( self, stats: Dict[str, Optional[Union[Num, Dict[str, Num]]]], - weight: Num = None, + weight: Optional[Num] = None, ) -> None: - assert check_argument_types() if self._finished: raise RuntimeError("Already finished") if len(self._seen_keys_in_the_step) == 0: @@ -286,8 +284,8 @@ class Reporter: """ + @typechecked def __init__(self, epoch: int = 0): - assert check_argument_types() if epoch < 0: raise ValueError(f"epoch must be 0 or more: {epoch}") self.epoch = epoch @@ -500,8 +498,8 @@ def matplotlib_plot(self, output_dir: Union[str, Path]): p.parent.mkdir(parents=True, exist_ok=True) plt.savefig(p) + @typechecked def _plot_stats(self, keys: Sequence[str], key2: str): - assert check_argument_types() # str is also Sequence[str] if isinstance(keys, str): raise TypeError(f"Input as [{keys}]") @@ -540,7 +538,7 @@ def _plot_stats(self, keys: Sequence[str], key2: str): return plt def tensorboard_add_scalar( - self, summary_writer, epoch: int = None, key1: str = None + self, summary_writer, epoch: int = None, key1: Optional[str] = None ): if epoch is None: epoch = self.get_epoch() diff --git a/espnet2/train/spk_trainer.py b/espnet2/train/spk_trainer.py index 88d88e33688..369ec16eb5c 100644 --- a/espnet2/train/spk_trainer.py +++ b/espnet2/train/spk_trainer.py @@ -7,54 +7,29 @@ overriding validate_one_epoch. 
""" -import argparse -import dataclasses -import logging -from contextlib import contextmanager -from dataclasses import is_dataclass -from pathlib import Path -from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union - -import humanfriendly +from typing import Dict, Iterable + import numpy as np import torch import torch.nn.functional as F import torch.optim -from packaging.version import parse as V -from typeguard import check_argument_types - -from espnet2.iterators.abs_iter_factory import AbsIterFactory -from espnet2.main_funcs.average_nbest_models import average_nbest_models -from espnet2.main_funcs.calculate_all_attentions import calculate_all_attentions -from espnet2.schedulers.abs_scheduler import ( - AbsBatchStepScheduler, - AbsEpochStepScheduler, - AbsScheduler, - AbsValEpochStepScheduler, -) -from espnet2.torch_utils.add_gradient_noise import add_gradient_noise +from typeguard import typechecked + from espnet2.torch_utils.device_funcs import to_device -from espnet2.torch_utils.recursive_op import recursive_average -from espnet2.torch_utils.set_all_random_seed import set_all_random_seed -from espnet2.train.abs_espnet_model import AbsESPnetModel from espnet2.train.distributed_utils import DistributedOption -from espnet2.train.reporter import Reporter, SubReporter +from espnet2.train.reporter import SubReporter from espnet2.train.trainer import Trainer, TrainerOptions -from espnet2.utils.build_dataclass import build_dataclass from espnet2.utils.eer import ComputeErrorRates, ComputeMinDcf, tuneThresholdfromScore -from espnet2.utils.kwargs2args import kwargs2args if torch.distributed.is_available(): from torch.distributed import ReduceOp class SpkTrainer(Trainer): - """ - Trainer. - Designed for speaker recognition. + """Trainer designed for speaker recognition. + Training will be done as closed set classification. Validation will be open set EER calculation. 
- """ def __init__(self): @@ -62,6 +37,7 @@ def __init__(self): @classmethod @torch.no_grad() + @typechecked def validate_one_epoch( cls, model: torch.nn.Module, @@ -70,9 +46,7 @@ def validate_one_epoch( options: TrainerOptions, distributed_option: DistributedOption, ) -> None: - assert check_argument_types() ngpu = options.ngpu - no_forward_run = options.no_forward_run distributed = distributed_option.distributed model.eval() @@ -193,7 +167,7 @@ def validate_one_epoch( ] torch.distributed.all_gather(labels_all, labels) labels = torch.cat(labels_all) - rank = torch.distributed.get_rank() + # rank = torch.distributed.get_rank() torch.distributed.barrier() scores = scores.detach().cpu().numpy() labels = labels.detach().cpu().numpy() @@ -246,6 +220,7 @@ def validate_one_epoch( @classmethod @torch.no_grad() + @typechecked def extract_embed( cls, model: torch.nn.Module, @@ -257,26 +232,20 @@ def extract_embed( custom_bs: int, average: bool = False, ) -> None: - assert check_argument_types() ngpu = options.ngpu - no_forward_run = options.no_forward_run distributed = distributed_option.distributed model.eval() - - scores = [] - labels = [] spk_embd_dic = {} # [For distributed] Because iteration counts are not always equals between # processes, send stop-flag to the other processes if iterator is finished - iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu") + # iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu") # fill dictionary with speech samples utt_id_list = [] utt_id_whole_list = [] speech_list = [] - task_token_list = [] task_token = None if distributed: rank = torch.distributed.get_rank() diff --git a/espnet2/train/trainer.py b/espnet2/train/trainer.py index 032b4e72344..17db41168e1 100644 --- a/espnet2/train/trainer.py +++ b/espnet2/train/trainer.py @@ -15,7 +15,7 @@ import torch.nn import torch.optim from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.iterators.abs_iter_factory import AbsIterFactory from espnet2.main_funcs.average_nbest_models import average_nbest_models @@ -133,9 +133,9 @@ def __init__(self): raise RuntimeError("This class can't be instantiated.") @classmethod + @typechecked def build_options(cls, args: argparse.Namespace) -> TrainerOptions: """Build options consumed by train(), eval(), and plot_attention()""" - assert check_argument_types() return build_dataclass(TrainerOptions, args) @classmethod @@ -174,6 +174,7 @@ def resume( logging.info(f"The training was resumed using {checkpoint}") @classmethod + @typechecked def run( cls, model: AbsESPnetModel, @@ -186,7 +187,6 @@ def run( distributed_option: DistributedOption, ) -> None: """Perform training. 
This method performs the main process of training.""" - assert check_argument_types() # NOTE(kamo): Don't check the type more strictly as far trainer_options assert is_dataclass(trainer_options), type(trainer_options) assert len(optimizers) == len(schedulers), (len(optimizers), len(schedulers)) @@ -516,6 +516,7 @@ def run( ) @classmethod + @typechecked def train_one_epoch( cls, model: torch.nn.Module, @@ -528,7 +529,6 @@ def train_one_epoch( options: TrainerOptions, distributed_option: DistributedOption, ) -> bool: - assert check_argument_types() grad_noise = options.grad_noise accum_grad = options.accum_grad @@ -790,6 +790,7 @@ def train_one_epoch( @classmethod @torch.no_grad() + @typechecked def validate_one_epoch( cls, model: torch.nn.Module, @@ -798,7 +799,6 @@ def validate_one_epoch( options: TrainerOptions, distributed_option: DistributedOption, ) -> None: - assert check_argument_types() ngpu = options.ngpu no_forward_run = options.no_forward_run distributed = distributed_option.distributed @@ -842,6 +842,7 @@ def validate_one_epoch( @classmethod @torch.no_grad() + @typechecked def plot_attention( cls, model: torch.nn.Module, @@ -851,7 +852,6 @@ def plot_attention( reporter: SubReporter, options: TrainerOptions, ) -> None: - assert check_argument_types() import matplotlib ngpu = options.ngpu diff --git a/espnet2/train/uasr_trainer.py b/espnet2/train/uasr_trainer.py index ec0b6555f82..708cd2a8c53 100644 --- a/espnet2/train/uasr_trainer.py +++ b/espnet2/train/uasr_trainer.py @@ -14,7 +14,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.schedulers.abs_scheduler import AbsBatchStepScheduler, AbsScheduler from espnet2.torch_utils.device_funcs import to_device @@ -61,9 +61,9 @@ class UASRTrainer(Trainer): """ @classmethod + @typechecked def build_options(cls, args: argparse.Namespace) -> TrainerOptions: """Build options consumed by train(), eval(), and plot_attention().""" - assert check_argument_types() return build_dataclass(UASRTrainerOptions, args) @classmethod @@ -83,6 +83,7 @@ def add_arguments(cls, parser: argparse.ArgumentParser): ) @classmethod + @typechecked def train_one_epoch( cls, model: torch.nn.Module, @@ -96,7 +97,6 @@ def train_one_epoch( distributed_option: DistributedOption, ) -> bool: """Train one epoch for UASR.""" - assert check_argument_types() grad_noise = options.grad_noise accum_grad = options.accum_grad @@ -310,6 +310,7 @@ def train_one_epoch( @classmethod @torch.no_grad() + @typechecked def validate_one_epoch( cls, model: torch.nn.Module, @@ -319,7 +320,6 @@ def validate_one_epoch( distributed_option: DistributedOption, ) -> None: """Validate one epoch.""" - assert check_argument_types() ngpu = options.ngpu no_forward_run = options.no_forward_run distributed = distributed_option.distributed diff --git a/espnet2/tts/espnet_model.py b/espnet2/tts/espnet_model.py index b33970b6b76..b2fe715134b 100644 --- a/espnet2/tts/espnet_model.py +++ b/espnet2/tts/espnet_model.py @@ -8,7 +8,7 @@ import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.abs_normalize import AbsNormalize from espnet2.layers.inversible_interface import InversibleInterface @@ -28,6 +28,7 @@ def autocast(enabled=True): # NOQA class ESPnetTTSModel(AbsESPnetModel): """ESPnet model for text-to-speech task.""" + @typechecked def __init__( self, feats_extract: Optional[AbsFeatsExtract], @@ -39,7 +40,6 @@ def 
__init__( tts: AbsTTS, ): """Initialize ESPnetTTSModel module.""" - assert check_argument_types() super().__init__() self.feats_extract = feats_extract self.pitch_extract = pitch_extract diff --git a/espnet2/tts/fastspeech/fastspeech.py b/espnet2/tts/fastspeech/fastspeech.py index b9a92bd07aa..98a779ac55f 100644 --- a/espnet2/tts/fastspeech/fastspeech.py +++ b/espnet2/tts/fastspeech/fastspeech.py @@ -8,7 +8,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.torch_utils.device_funcs import force_gatherable from espnet2.torch_utils.initialize import initialize @@ -44,6 +44,7 @@ class FastSpeech(AbsTTS): """ + @typechecked def __init__( self, # network structure related @@ -190,7 +191,6 @@ def __init__( calculation. """ - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/tts/fastspeech2/fastspeech2.py b/espnet2/tts/fastspeech2/fastspeech2.py index 1f2e662b7aa..c382cf78154 100644 --- a/espnet2/tts/fastspeech2/fastspeech2.py +++ b/espnet2/tts/fastspeech2/fastspeech2.py @@ -8,7 +8,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.torch_utils.device_funcs import force_gatherable from espnet2.torch_utils.initialize import initialize @@ -45,6 +45,7 @@ class FastSpeech2(AbsTTS): """ + @typechecked def __init__( self, # network structure related @@ -224,7 +225,6 @@ def __init__( calculation. """ - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/tts/fastspeech2/loss.py b/espnet2/tts/fastspeech2/loss.py index 167ea7cd295..70f2aae68c5 100644 --- a/espnet2/tts/fastspeech2/loss.py +++ b/espnet2/tts/fastspeech2/loss.py @@ -6,7 +6,7 @@ from typing import Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet.nets.pytorch_backend.fastspeech.duration_predictor import ( # noqa: H301 DurationPredictorLoss, @@ -17,6 +17,7 @@ class FastSpeech2Loss(torch.nn.Module): """Loss function module for FastSpeech2.""" + @typechecked def __init__(self, use_masking: bool = True, use_weighted_masking: bool = False): """Initialize feed-forward Transformer loss module. @@ -27,7 +28,6 @@ def __init__(self, use_masking: bool = True, use_weighted_masking: bool = False) calculation. """ - assert check_argument_types() super().__init__() assert (use_masking != use_weighted_masking) or not use_masking diff --git a/espnet2/tts/fastspeech2/variance_predictor.py b/espnet2/tts/fastspeech2/variance_predictor.py index aba9a64576d..ddc30a39220 100644 --- a/espnet2/tts/fastspeech2/variance_predictor.py +++ b/espnet2/tts/fastspeech2/variance_predictor.py @@ -6,7 +6,7 @@ """Variance predictor related modules.""" import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm @@ -22,6 +22,7 @@ class VariancePredictor(torch.nn.Module): """ + @typechecked def __init__( self, idim: int, @@ -41,7 +42,6 @@ def __init__( dropout_rate (float): Dropout rate. 
""" - assert check_argument_types() super().__init__() self.conv = torch.nn.ModuleList() for idx in range(n_layers): diff --git a/espnet2/tts/feats_extract/dio.py b/espnet2/tts/feats_extract/dio.py index 69e052d7304..3218bc7e56b 100644 --- a/espnet2/tts/feats_extract/dio.py +++ b/espnet2/tts/feats_extract/dio.py @@ -12,7 +12,7 @@ import torch import torch.nn.functional as F from scipy.interpolate import interp1d -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.tts.feats_extract.abs_feats_extract import AbsFeatsExtract from espnet.nets.pytorch_backend.nets_utils import pad_list @@ -36,6 +36,7 @@ class Dio(AbsFeatsExtract): """ + @typechecked def __init__( self, fs: Union[int, str] = 22050, @@ -48,7 +49,6 @@ def __init__( use_log_f0: bool = True, reduction_factor: int = None, ): - assert check_argument_types() super().__init__() if isinstance(fs, str): fs = humanfriendly.parse_size(fs) diff --git a/espnet2/tts/feats_extract/energy.py b/espnet2/tts/feats_extract/energy.py index c7f9e0fcc14..d8ef2b97820 100644 --- a/espnet2/tts/feats_extract/energy.py +++ b/espnet2/tts/feats_extract/energy.py @@ -3,12 +3,12 @@ """Energy extractor.""" -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import humanfriendly import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.stft import Stft from espnet2.tts.feats_extract.abs_feats_extract import AbsFeatsExtract @@ -18,20 +18,20 @@ class Energy(AbsFeatsExtract): """Energy extractor.""" + @typechecked def __init__( self, fs: Union[int, str] = 22050, n_fft: int = 1024, - win_length: int = None, + win_length: Optional[int] = None, hop_length: int = 256, window: str = "hann", center: bool = True, normalized: bool = False, onesided: bool = True, use_token_averaged_energy: bool = True, - reduction_factor: int = None, + reduction_factor: Optional[int] = None, ): - assert check_argument_types() super().__init__() if isinstance(fs, str): fs = humanfriendly.parse_size(fs) diff --git a/espnet2/tts/feats_extract/linear_spectrogram.py b/espnet2/tts/feats_extract/linear_spectrogram.py index e8b1a6c0411..56fc388e6c1 100644 --- a/espnet2/tts/feats_extract/linear_spectrogram.py +++ b/espnet2/tts/feats_extract/linear_spectrogram.py @@ -1,7 +1,7 @@ from typing import Any, Dict, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.stft import Stft from espnet2.tts.feats_extract.abs_feats_extract import AbsFeatsExtract @@ -13,17 +13,17 @@ class LinearSpectrogram(AbsFeatsExtract): Stft -> amplitude-spec """ + @typechecked def __init__( self, n_fft: int = 1024, - win_length: int = None, + win_length: Optional[int] = None, hop_length: int = 256, window: Optional[str] = "hann", center: bool = True, normalized: bool = False, onesided: bool = True, ): - assert check_argument_types() super().__init__() self.n_fft = n_fft self.hop_length = hop_length diff --git a/espnet2/tts/feats_extract/log_mel_fbank.py b/espnet2/tts/feats_extract/log_mel_fbank.py index b05424713e5..21c95e3b1e8 100644 --- a/espnet2/tts/feats_extract/log_mel_fbank.py +++ b/espnet2/tts/feats_extract/log_mel_fbank.py @@ -2,7 +2,7 @@ import humanfriendly import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.log_mel import LogMel from espnet2.layers.stft import Stft @@ -15,11 +15,12 @@ class 
LogMelFbank(AbsFeatsExtract): Stft -> amplitude-spec -> Log-Mel-Fbank """ + @typechecked def __init__( self, fs: Union[int, str] = 16000, n_fft: int = 1024, - win_length: int = None, + win_length: Optional[int] = None, hop_length: int = 256, window: Optional[str] = "hann", center: bool = True, @@ -31,7 +32,6 @@ def __init__( htk: bool = False, log_base: Optional[float] = 10.0, ): - assert check_argument_types() super().__init__() if isinstance(fs, str): fs = humanfriendly.parse_size(fs) diff --git a/espnet2/tts/feats_extract/log_spectrogram.py b/espnet2/tts/feats_extract/log_spectrogram.py index f436d6e04fe..150197cd4a9 100644 --- a/espnet2/tts/feats_extract/log_spectrogram.py +++ b/espnet2/tts/feats_extract/log_spectrogram.py @@ -1,7 +1,7 @@ from typing import Any, Dict, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.layers.stft import Stft from espnet2.tts.feats_extract.abs_feats_extract import AbsFeatsExtract @@ -13,17 +13,17 @@ class LogSpectrogram(AbsFeatsExtract): Stft -> log-amplitude-spec """ + @typechecked def __init__( self, n_fft: int = 1024, - win_length: int = None, + win_length: Optional[int] = None, hop_length: int = 256, window: Optional[str] = "hann", center: bool = True, normalized: bool = False, onesided: bool = True, ): - assert check_argument_types() super().__init__() self.n_fft = n_fft self.hop_length = hop_length diff --git a/espnet2/tts/feats_extract/yin.py b/espnet2/tts/feats_extract/yin.py index cf10542488c..0b015d37574 100644 --- a/espnet2/tts/feats_extract/yin.py +++ b/espnet2/tts/feats_extract/yin.py @@ -8,10 +8,9 @@ def differenceFunction(x, N, tau_max): - """ - Compute difference function of data x. This corresponds to equation (6) in [1] - This solution is implemented directly with torch rfft. + """Compute difference function of data x. This corresponds to equation (6) in [1] + This solution is implemented directly with torch rfft. :param x: audio data (Tensor) :param N: length of data @@ -51,10 +50,9 @@ def differenceFunction(x, N, tau_max): def differenceFunction_np(x, N, tau_max): - """ - Compute difference function of data x. This corresponds to equation (6) in [1] - This solution is implemented directly with Numpy fft. + """Compute difference function of data x. This corresponds to equation (6) in [1] + This solution is implemented directly with Numpy fft. :param x: audio data :param N: length of data @@ -77,8 +75,7 @@ def differenceFunction_np(x, N, tau_max): def cumulativeMeanNormalizedDifferenceFunction(df, N, eps=1e-8): - """ - Compute cumulative mean normalized difference function (CMND). + """Compute cumulative mean normalized difference function (CMND). 
This corresponds to equation (8) in [1] @@ -104,6 +101,7 @@ def cumulativeMeanNormalizedDifferenceFunction(df, N, eps=1e-8): def differenceFunctionTorch(xs: torch.Tensor, N, tau_max) -> torch.Tensor: """pytorch backend batch-wise differenceFunction + has 1e-4 level error with input shape of (32, 22050*1.5) Args: xs: @@ -116,7 +114,6 @@ def differenceFunctionTorch(xs: torch.Tensor, N, tau_max) -> torch.Tensor: xs = xs.double() w = xs.shape[-1] tau_max = min(tau_max, w) - zeros = torch.zeros((xs.shape[0], 1)) x_cumsum = torch.cat( ( torch.zeros((xs.shape[0], 1), device=xs.device), diff --git a/espnet2/tts/feats_extract/ying.py b/espnet2/tts/feats_extract/ying.py index 63ffa869071..cdf0093c140 100644 --- a/espnet2/tts/feats_extract/ying.py +++ b/espnet2/tts/feats_extract/ying.py @@ -1,16 +1,24 @@ # modified from https://github.com/dhchoi99/NANSY # We have modified the implementation of dhchoi99 to be fully differentiable. import math -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, Optional, Tuple import torch +import torch.nn.functional as F +from typeguard import typechecked from espnet2.tts.feats_extract.abs_feats_extract import AbsFeatsExtract -from espnet2.tts.feats_extract.yin import * +from espnet2.tts.feats_extract.yin import ( + cumulativeMeanNormalizedDifferenceFunctionTorch, + differenceFunctionTorch, +) from espnet.nets.pytorch_backend.nets_utils import pad_list class Ying(AbsFeatsExtract): + """Extract Ying-based Features.""" + + @typechecked def __init__( self, fs: int = 22050, @@ -65,7 +73,8 @@ def midi_to_lag(self, m: int, octave_range: float = 12): return lag def yingram_from_cmndf(self, cmndfs: torch.Tensor) -> torch.Tensor: - """yingram calculator from cMNDFs + """yingram calculator from cMNDFs. + (cumulative Mean Normalized Difference Functions) Args: @@ -79,7 +88,6 @@ def yingram_from_cmndf(self, cmndfs: torch.Tensor) -> torch.Tensor: y: calculated batch yingram - """ # c_ms = np.asarray([Pitch.midi_to_lag(m, fs) for m in ms]) # c_ms = torch.from_numpy(c_ms).to(cmndfs.device) @@ -107,7 +115,6 @@ def yingram(self, x: torch.Tensor): """ # x.shape: t -> B,T, B,T = x.shape B, T = x.shape - w_len = self.W frames = self.unfold(x.view(B, 1, 1, T)) frames = frames.permute(0, 2, 1).contiguous().view(-1, self.W) # [B* frames, W] @@ -141,13 +148,14 @@ def _adjust_num_frames(x: torch.Tensor, num_frames: torch.Tensor) -> torch.Tensor: x = x[:num_frames] return x + @typechecked def forward( self, input: torch.Tensor, - input_lengths: torch.Tensor = None, - feats_lengths: torch.Tensor = None, - durations: torch.Tensor = None, - durations_lengths: torch.Tensor = None, + input_lengths: Optional[torch.Tensor] = None, + feats_lengths: Optional[torch.Tensor] = None, + durations: Optional[torch.Tensor] = None, + durations_lengths: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: if input_lengths is None: input_lengths = ( diff --git a/espnet2/tts/gst/style_encoder.py b/espnet2/tts/gst/style_encoder.py index 93f8f66ced8..c786394f6e8 100644 --- a/espnet2/tts/gst/style_encoder.py +++ b/espnet2/tts/gst/style_encoder.py @@ -6,7 +6,7 @@ from typing import Sequence import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet.nets.pytorch_backend.transformer.attention import ( MultiHeadedAttention as BaseMultiHeadedAttention, @@ -42,6 +42,7 @@ class StyleEncoder(torch.nn.Module): """ + @typechecked def __init__( self, idim: int = 80, @@ -56,7 +57,6 @@ def __init__( gru_units: int = 128, ): """Initilize global
style encoder module.""" - assert check_argument_types() super(StyleEncoder, self).__init__() self.ref_enc = ReferenceEncoder( @@ -114,6 +114,7 @@ class ReferenceEncoder(torch.nn.Module): """ + @typechecked def __init__( self, idim=80, @@ -125,7 +126,6 @@ def __init__( gru_units: int = 128, ): """Initilize reference encoder module.""" - assert check_argument_types() super(ReferenceEncoder, self).__init__() # check hyperparameters are valid @@ -209,6 +209,7 @@ class StyleTokenLayer(torch.nn.Module): """ + @typechecked def __init__( self, ref_embed_dim: int = 128, @@ -218,7 +219,6 @@ def __init__( dropout_rate: float = 0.0, ): """Initilize style token layer module.""" - assert check_argument_types() super(StyleTokenLayer, self).__init__() gst_embs = torch.randn(gst_tokens, gst_token_dim // gst_heads) diff --git a/espnet2/tts/prodiff/loss.py b/espnet2/tts/prodiff/loss.py index bbf337db8a0..0d9ca9fe8b1 100644 --- a/espnet2/tts/prodiff/loss.py +++ b/espnet2/tts/prodiff/loss.py @@ -8,7 +8,7 @@ import torch from torch.nn import functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet.nets.pytorch_backend.fastspeech.duration_predictor import ( # noqa: H301 DurationPredictorLoss, @@ -160,6 +160,7 @@ def ssim(self, tensor1: torch.Tensor, tensor2: torch.Tensor): class ProDiffLoss(torch.nn.Module): """Loss function module for ProDiffLoss.""" + @typechecked def __init__( self, use_masking: bool = True, @@ -174,7 +175,6 @@ def __init__( calculation. """ - assert check_argument_types() super().__init__() assert (use_masking != use_weighted_masking) or not use_masking diff --git a/espnet2/tts/prodiff/prodiff.py b/espnet2/tts/prodiff/prodiff.py index dc9ab933d1d..14d259aee82 100644 --- a/espnet2/tts/prodiff/prodiff.py +++ b/espnet2/tts/prodiff/prodiff.py @@ -9,7 +9,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.torch_utils.device_funcs import force_gatherable from espnet2.torch_utils.initialize import initialize @@ -43,6 +43,7 @@ class ProDiff(AbsTTS): """ + @typechecked def __init__( self, # network structure related @@ -222,7 +223,6 @@ def __init__( calculation. """ - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/tts/tacotron2/tacotron2.py b/espnet2/tts/tacotron2/tacotron2.py index c4664b68181..5a98ceb5412 100644 --- a/espnet2/tts/tacotron2/tacotron2.py +++ b/espnet2/tts/tacotron2/tacotron2.py @@ -8,7 +8,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.torch_utils.device_funcs import force_gatherable from espnet2.tts.abs_tts import AbsTTS @@ -35,6 +35,7 @@ class Tacotron2(AbsTTS): """ + @typechecked def __init__( self, # network structure related @@ -58,7 +59,7 @@ def __init__( postnet_layers: int = 5, postnet_chans: int = 512, postnet_filts: int = 5, - output_activation: str = None, + output_activation: Optional[str] = None, use_batch_norm: bool = True, use_concate: bool = True, use_residual: bool = False, @@ -144,7 +145,6 @@ def __init__( guided_attn_loss_lambda (float): Lambda in guided attention loss. 
""" - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/tts/transformer/transformer.py b/espnet2/tts/transformer/transformer.py index be80c756eb1..83e7c16c138 100644 --- a/espnet2/tts/transformer/transformer.py +++ b/espnet2/tts/transformer/transformer.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.torch_utils.device_funcs import force_gatherable from espnet2.torch_utils.initialize import initialize @@ -43,6 +43,7 @@ class Transformer(AbsTTS): """ + @typechecked def __init__( self, # network structure related @@ -203,7 +204,6 @@ def __init__( guided_attn_loss_lambda (float): Lambda in guided attention loss. """ - assert check_argument_types() super().__init__() # store hyperparameters diff --git a/espnet2/uasr/discriminator/conv_discriminator.py b/espnet2/uasr/discriminator/conv_discriminator.py index 1b17765dd8e..8a7e0dba8b9 100644 --- a/espnet2/uasr/discriminator/conv_discriminator.py +++ b/espnet2/uasr/discriminator/conv_discriminator.py @@ -2,7 +2,7 @@ from typing import Dict, Optional import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.uasr.discriminator.abs_discriminator import AbsDiscriminator from espnet2.utils.types import str2bool @@ -25,6 +25,7 @@ def forward(self, x): class ConvDiscriminator(AbsDiscriminator): """convolutional discriminator for UASR.""" + @typechecked def __init__( self, input_dim: int, @@ -42,7 +43,6 @@ def __init__( weight_norm: str2bool = False, ): super().__init__() - assert check_argument_types() if cfg is not None: cfg = argparse.Namespace(**cfg) self.conv_channels = cfg.discriminator_dim @@ -146,8 +146,8 @@ def make_conv( *inner_net, ) + @typechecked def forward(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor]): - assert check_argument_types() # (Batch, Time, Channel) -> (Batch, Channel, Time) x = x.transpose(1, 2) diff --git a/espnet2/uasr/espnet_model.py b/espnet2/uasr/espnet_model.py index e8b8253056c..1ca77210b86 100644 --- a/espnet2/uasr/espnet_model.py +++ b/espnet2/uasr/espnet_model.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.frontend.abs_frontend import AbsFrontend from espnet2.text.token_id_converter import TokenIDConverter @@ -42,6 +42,7 @@ class ESPnetUASRModel(AbsESPnetModel): https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec/unsupervised """ + @typechecked def __init__( self, frontend: Optional[AbsFrontend], @@ -66,7 +67,6 @@ def __init__( decay_temperature: float = 0.99995, use_collected_training_feats: str2bool = False, ): - assert check_argument_types() super().__init__() # note that eos is the same as sos (equivalent ID) @@ -119,8 +119,9 @@ def number_updates(self): return self._number_updates @number_updates.setter + @typechecked def number_updates(self, iiter: int): - assert check_argument_types() and iiter >= 0 + assert iiter >= 0 self._number_updates = iiter def forward( diff --git a/espnet2/uasr/generator/conv_generator.py b/espnet2/uasr/generator/conv_generator.py index bb6ceab94b4..2f3924b39e4 100644 --- a/espnet2/uasr/generator/conv_generator.py +++ b/espnet2/uasr/generator/conv_generator.py @@ -3,7 +3,7 @@ from typing import Dict, Optional import torch -from typeguard import check_argument_types +from typeguard import typechecked from 
espnet2.uasr.generator.abs_generator import AbsGenerator from espnet2.utils.types import str2bool @@ -37,6 +37,7 @@ def forward(self, x): class ConvGenerator(AbsGenerator): """convolutional generator for UASR.""" + @typechecked def __init__( self, input_dim: int, @@ -53,7 +54,6 @@ def __init__( residual: str2bool = True, ): super().__init__() - assert check_argument_types() self.input_dim = input_dim self.output_dim = output_dim diff --git a/espnet2/uasr/loss/discriminator_loss.py b/espnet2/uasr/loss/discriminator_loss.py index 3d106f83bdc..cb494c152be 100644 --- a/espnet2/uasr/loss/discriminator_loss.py +++ b/espnet2/uasr/loss/discriminator_loss.py @@ -1,6 +1,6 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.uasr.loss.abs_loss import AbsUASRLoss from espnet2.utils.types import str2bool @@ -9,6 +9,7 @@ class UASRDiscriminatorLoss(AbsUASRLoss): """discriminator loss for UASR.""" + @typechecked def __init__( self, weight: float = 1.0, @@ -17,7 +18,6 @@ def __init__( reduction: str = "sum", ): super().__init__() - assert check_argument_types() self.weight = weight self.smoothing = smoothing self.smoothing_one_sided = smoothing_one_side diff --git a/espnet2/uasr/loss/gradient_penalty.py b/espnet2/uasr/loss/gradient_penalty.py index b24d3fb7f59..b774690f36a 100644 --- a/espnet2/uasr/loss/gradient_penalty.py +++ b/espnet2/uasr/loss/gradient_penalty.py @@ -1,7 +1,7 @@ import numpy as np import torch from torch import autograd -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.uasr.discriminator.abs_discriminator import AbsDiscriminator from espnet2.uasr.loss.abs_loss import AbsUASRLoss @@ -11,6 +11,7 @@ class UASRGradientPenalty(AbsUASRLoss): """gradient penalty for UASR.""" + @typechecked def __init__( self, discriminator: AbsDiscriminator, @@ -19,7 +20,6 @@ def __init__( reduction: str = "sum", ): super().__init__() - assert check_argument_types() self.discriminator = [discriminator] self.weight = weight diff --git a/espnet2/uasr/loss/phoneme_diversity_loss.py b/espnet2/uasr/loss/phoneme_diversity_loss.py index c83d5dc9337..1ae6b1e0c44 100644 --- a/espnet2/uasr/loss/phoneme_diversity_loss.py +++ b/espnet2/uasr/loss/phoneme_diversity_loss.py @@ -1,5 +1,5 @@ import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.uasr.loss.abs_loss import AbsUASRLoss from espnet2.utils.types import str2bool @@ -8,12 +8,12 @@ class UASRPhonemeDiversityLoss(AbsUASRLoss): """phoneme diversity loss for UASR.""" + @typechecked def __init__( self, weight: float = 1.0, ): super().__init__() - assert check_argument_types() self.weight = weight diff --git a/espnet2/uasr/loss/pseudo_label_loss.py b/espnet2/uasr/loss/pseudo_label_loss.py index 2421895d88c..8e9f100fa24 100644 --- a/espnet2/uasr/loss/pseudo_label_loss.py +++ b/espnet2/uasr/loss/pseudo_label_loss.py @@ -1,6 +1,6 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.uasr.loss.abs_loss import AbsUASRLoss from espnet2.utils.types import str2bool @@ -9,6 +9,7 @@ class UASRPseudoLabelLoss(AbsUASRLoss): """auxiliary pseudo label loss for UASR.""" + @typechecked def __init__( self, weight: float = 1.0, @@ -19,7 +20,6 @@ def __init__( reduction: str = "none", ): super().__init__() - assert check_argument_types() self.weight = weight self.input_dim = input_dim diff --git 
a/espnet2/uasr/loss/smoothness_penalty.py b/espnet2/uasr/loss/smoothness_penalty.py index 357844887d6..1ec3b44b5f5 100644 --- a/espnet2/uasr/loss/smoothness_penalty.py +++ b/espnet2/uasr/loss/smoothness_penalty.py @@ -1,6 +1,6 @@ import torch import torch.nn.functional as F -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.uasr.loss.abs_loss import AbsUASRLoss @@ -8,13 +8,13 @@ class UASRSmoothnessPenalty(AbsUASRLoss): """smoothness penalty for UASR.""" + @typechecked def __init__( self, weight: float = 1.0, reduction: str = "none", ): super().__init__() - assert check_argument_types() self.weight = weight self.reduction = reduction diff --git a/espnet2/uasr/segmenter/join_segmenter.py b/espnet2/uasr/segmenter/join_segmenter.py index 436432841be..85d353d8356 100644 --- a/espnet2/uasr/segmenter/join_segmenter.py +++ b/espnet2/uasr/segmenter/join_segmenter.py @@ -2,13 +2,14 @@ -from typing import Dict, Optional +from typing import Dict, Optional, Tuple import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.uasr.segmenter.abs_segmenter import AbsSegmenter from espnet2.utils.types import str2bool class JoinSegmenter(AbsSegmenter): + @typechecked def __init__( self, cfg: Optional[Dict] = None, @@ -18,7 +19,6 @@ def __init__( remove_zeros: str2bool = False, ): super().__init__() - assert check_argument_types() if cfg is not None: cfg = argparse.Namespace(**cfg["segmentation"]) @@ -31,20 +31,20 @@ def __init__( self.mean_pool_join = mean_join_pool self.remove_zeros = remove_zeros + @typechecked def pre_segment( self, xs_pad: torch.Tensor, padding_mask: torch.Tensor, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: - assert check_argument_types() return xs_pad, padding_mask + @typechecked def logit_segment( self, logits: torch.Tensor, padding_mask: torch.Tensor, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: - assert check_argument_types() preds = logits.argmax(dim=-1) if padding_mask.any(): diff --git a/espnet2/uasr/segmenter/random_segmenter.py b/espnet2/uasr/segmenter/random_segmenter.py index 957e85cbe06..86596aeb676 100644 --- a/espnet2/uasr/segmenter/random_segmenter.py +++ b/espnet2/uasr/segmenter/random_segmenter.py @@ -1,13 +1,14 @@ import math import torch -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.uasr.segmenter.abs_segmenter import AbsSegmenter from espnet2.utils.types import str2bool class RandomSegmenter(AbsSegmenter): + @typechecked def __init__( self, subsample_rate: float = 0.25, @@ -16,7 +17,6 @@ def __init__( remove_zeros: str2bool = False, ): super().__init__() - assert check_argument_types() self.subsample_rate = subsample_rate def pre_segment( diff --git a/espnet2/utils/build_dataclass.py b/espnet2/utils/build_dataclass.py index 6675c99a014..db66f06dacb 100644 --- a/espnet2/utils/build_dataclass.py +++ b/espnet2/utils/build_dataclass.py @@ -12,6 +12,6 @@ def build_dataclass(dataclass, args: argparse.Namespace): raise ValueError( f"args doesn't have {field.name}.
You need to set it to ArgumentsParser" ) - check_type(field.name, getattr(args, field.name), field.type) + check_type(getattr(args, field.name), field.type) kwargs[field.name] = getattr(args, field.name) return dataclass(**kwargs) diff --git a/espnet2/utils/eer.py b/espnet2/utils/eer.py index 7c45d889dbc..4e657e681d1 100644 --- a/espnet2/utils/eer.py +++ b/espnet2/utils/eer.py @@ -43,7 +43,7 @@ def ComputeErrorRates(scores, labels): key=itemgetter(1), ) ) - sorted_labels = [] + labels = [labels[i] for i in sorted_indexes] fnrs = [] fprs = [] diff --git a/espnet2/utils/griffin_lim.py b/espnet2/utils/griffin_lim.py index ab7c9097e49..8fe94999470 100644 --- a/espnet2/utils/griffin_lim.py +++ b/espnet2/utils/griffin_lim.py @@ -13,18 +13,19 @@ import numpy as np import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import typechecked EPS = 1e-10 +@typechecked def logmel2linear( lmspc: np.ndarray, fs: int, n_fft: int, n_mels: int, - fmin: int = None, - fmax: int = None, + fmin: Optional[int] = None, + fmax: Optional[int] = None, ) -> np.ndarray: """Convert log Mel filterbank to linear spectrogram. @@ -51,11 +52,12 @@ def logmel2linear( return np.maximum(EPS, np.dot(inv_mel_basis, mspc.T).T) +@typechecked def griffin_lim( spc: np.ndarray, n_fft: int, n_shift: int, - win_length: int = None, + win_length: Optional[int] = None, window: Optional[str] = "hann", n_iter: Optional[int] = 32, ) -> np.ndarray: @@ -111,16 +113,17 @@ def griffin_lim( class Spectrogram2Waveform(object): """Spectrogram to waveform conversion module.""" + @typechecked def __init__( self, n_fft: int, n_shift: int, - fs: int = None, - n_mels: int = None, - win_length: int = None, + fs: Optional[int] = None, + n_mels: Optional[int] = None, + win_length: Optional[int] = None, window: Optional[str] = "hann", - fmin: int = None, - fmax: int = None, + fmin: Optional[int] = None, + fmax: Optional[int] = None, griffin_lim_iters: Optional[int] = 8, ): """Initialize module. @@ -137,7 +140,6 @@ def __init__( griffin_lim_iters: The number of iterations. 
""" - assert check_argument_types() self.fs = fs self.logmel2linear = ( partial( diff --git a/espnetez/task.py b/espnetez/task.py index 0e7f66231c5..2965bd933e4 100644 --- a/espnetez/task.py +++ b/espnetez/task.py @@ -3,11 +3,12 @@ import argparse import logging from pathlib import Path +from typing import Optional import numpy as np import torch from torch.utils.data import DataLoader -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.iterators.abs_iter_factory import AbsIterFactory from espnet2.iterators.category_iter_factory import CategoryIterFactory @@ -160,13 +161,13 @@ def build_iter_factory( ) @classmethod + @typechecked def build_sequence_iter_factory( cls, args: argparse.Namespace, iter_options: IteratorOptions, mode: str, ) -> AbsIterFactory: - assert check_argument_types() if mode == "train": dataset = cls.train_dataset @@ -267,22 +268,22 @@ def build_task_iter_factory( raise NotImplementedError @classmethod + @typechecked def build_streaming_iterator( cls, data_path_and_name_and_type, preprocess_fn, collate_fn, - key_file: str = None, + key_file: Optional[str] = None, batch_size: int = 1, dtype: str = np.float32, num_workers: int = 1, allow_variable_data_keys: bool = False, ngpu: int = 0, inference: bool = False, - mode: str = None, + mode: Optional[str] = None, ) -> DataLoader: """Build DataLoader using iterable dataset""" - assert check_argument_types() if mode == "train" and cls.train_dataloader is not None: return cls.train_dataloader elif mode == "valid" and cls.valid_dataloader is not None: diff --git a/setup.py b/setup.py index 1d01ddb65bc..d5be9f60291 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ "setuptools>=38.5.1", "packaging", "configargparse>=1.2.1", - "typeguard==2.13.3", + "typeguard", "humanfriendly", "scipy>=1.4.1", "filelock", @@ -92,7 +92,7 @@ "pytest-runner", ], "test": [ - "pytest>=3.3.0", + "pytest>=7.0.0", "pytest-timeouts>=1.2.1", "pytest-pythonpath>=0.7.3", "pytest-cov>=2.7.1", diff --git a/test/espnet2/asr/test_ctc.py b/test/espnet2/asr/test_ctc.py index 20d534359c9..8546e7844af 100644 --- a/test/espnet2/asr/test_ctc.py +++ b/test/espnet2/asr/test_ctc.py @@ -41,7 +41,7 @@ def test_ctc_argmax(ctc_type, ctc_args): def test_bayes_risk_ctc(ctc_args): # Skip the test if K2 is not installed try: - import k2 + import k2 # noqa except ImportError: return diff --git a/test/espnet2/asr_transducer/test_decoder.py b/test/espnet2/asr_transducer/test_decoder.py index 558a1ffd767..051d4fdcc2a 100644 --- a/test/espnet2/asr_transducer/test_decoder.py +++ b/test/espnet2/asr_transducer/test_decoder.py @@ -78,10 +78,10 @@ def test_mega_decoder(params): def test_mega_rel_pos_bias_type(): - vocab_size, labels = prepare() + vocab_size, _ = prepare() with pytest.raises(ValueError): - decoder = MEGADecoder(vocab_size, rel_pos_bias_type="foo") + _ = MEGADecoder(vocab_size, rel_pos_bias_type="foo") @pytest.mark.parametrize( @@ -103,7 +103,7 @@ def test_mega_rel_pos_bias(rel_pos_bias_type): def test_rnn_type(): - vocab_size, labels = prepare() + vocab_size, _ = prepare() with pytest.raises(ValueError): _ = RNNDecoder(vocab_size, rnn_type="foo") diff --git a/test/espnet2/bin/test_enh_inference_streaming.py b/test/espnet2/bin/test_enh_inference_streaming.py index 2e70abc2aef..8380ffb9297 100644 --- a/test/espnet2/bin/test_enh_inference_streaming.py +++ b/test/espnet2/bin/test_enh_inference_streaming.py @@ -84,4 +84,4 @@ def test_SeparateSpeech( output_chunks[channel].append(output[channel]) separate_speech.reset() - waves = 
[separate_speech.merge(chunks, ilens) for chunks in output_chunks] + _ = [separate_speech.merge(chunks, ilens) for chunks in output_chunks] diff --git a/test/espnet2/bin/test_s2st_inference.py b/test/espnet2/bin/test_s2st_inference.py index c32cc7eb767..4a98c329490 100644 --- a/test/espnet2/bin/test_s2st_inference.py +++ b/test/espnet2/bin/test_s2st_inference.py @@ -8,7 +8,6 @@ from espnet2.bin.s2st_inference import Speech2Speech, get_parser, main from espnet2.tasks.s2st import S2STTask -from espnet.nets.beam_search import Hypothesis def test_get_parser(): diff --git a/test/espnet2/enh/diffusion/test_score_based_diffusion.py b/test/espnet2/enh/diffusion/test_score_based_diffusion.py index ed4e027650c..3acad08b62a 100644 --- a/test/espnet2/enh/diffusion/test_score_based_diffusion.py +++ b/test/espnet2/enh/diffusion/test_score_based_diffusion.py @@ -1,7 +1,5 @@ import pytest import torch -from packaging.version import parse as V -from torch import Tensor from espnet2.enh.diffusion.score_based_diffusion import ScoreModel diff --git a/test/espnet2/enh/layers/test_ncsnpp.py b/test/espnet2/enh/layers/test_ncsnpp.py index b8a809f0ad9..8468ce69ec9 100644 --- a/test/espnet2/enh/layers/test_ncsnpp.py +++ b/test/espnet2/enh/layers/test_ncsnpp.py @@ -1,6 +1,5 @@ import pytest import torch -from torch import Tensor from espnet2.enh.layers.ncsnpp import NCSNpp diff --git a/test/espnet2/enh/test_espnet_diffusion_se.py b/test/espnet2/enh/test_espnet_diffusion_se.py index 4c4c8f0bedc..cbfc124df80 100644 --- a/test/espnet2/enh/test_espnet_diffusion_se.py +++ b/test/espnet2/enh/test_espnet_diffusion_se.py @@ -1,6 +1,5 @@ import pytest import torch -from packaging.version import parse as V from espnet2.enh.decoder.stft_decoder import STFTDecoder from espnet2.enh.diffusion.score_based_diffusion import ScoreModel diff --git a/test/espnet2/fileio/test_datadir_writer.py b/test/espnet2/fileio/test_datadir_writer.py index eaf37a97834..f62d7f69331 100644 --- a/test/espnet2/fileio/test_datadir_writer.py +++ b/test/espnet2/fileio/test_datadir_writer.py @@ -1,6 +1,7 @@ from pathlib import Path import pytest +from typeguard import TypeCheckError from espnet2.fileio.datadir_writer import DatadirWriter @@ -14,7 +15,7 @@ def test_DatadirWriter(tmp_path: Path): # __setitem__() sub["bb"] = "aa" - with pytest.raises(TypeError): + with pytest.raises(TypeCheckError): sub["bb"] = 1 with pytest.raises(RuntimeError): # Already has children diff --git a/test/espnet2/fileio/test_npy_scp.py b/test/espnet2/fileio/test_npy_scp.py index c965f12ce42..b53b7972d6e 100644 --- a/test/espnet2/fileio/test_npy_scp.py +++ b/test/espnet2/fileio/test_npy_scp.py @@ -1,7 +1,6 @@ from pathlib import Path import numpy as np -import pytest from espnet2.fileio.npy_scp import NpyScpReader, NpyScpWriter diff --git a/test/espnet2/fileio/test_score_scp.py b/test/espnet2/fileio/test_score_scp.py index ca66a261baf..81992f6ff8a 100644 --- a/test/espnet2/fileio/test_score_scp.py +++ b/test/espnet2/fileio/test_score_scp.py @@ -4,7 +4,6 @@ import miditoolkit import miditoolkit.midi.containers as ct import music21 as m21 -import numpy as np from espnet2.fileio.score_scp import ( NOTE, diff --git a/test/espnet2/gan_svs/visinger/test_visinger.py b/test/espnet2/gan_svs/visinger/test_visinger.py index b6f2d51c945..67122ea3997 100644 --- a/test/espnet2/gan_svs/visinger/test_visinger.py +++ b/test/espnet2/gan_svs/visinger/test_visinger.py @@ -5,6 +5,7 @@ """Test VISinger related modules.""" import pytest +import scipy import torch from espnet2.gan_svs.vits import VITS 
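Note on the VISinger test-data hunks that follow: with `@typechecked` now enforcing annotations at runtime, tuples no longer satisfy generator parameters annotated as `List[int]`, which is presumably why the decoder settings below switch from tuples to plain lists. A minimal sketch of the failure mode, assuming typeguard >= 3 (`build_decoder` is an illustrative stand-in, not an ESPnet function):

```python
from typing import List

from typeguard import TypeCheckError, typechecked


@typechecked
def build_decoder(upsample_scales: List[int]) -> int:
    # Toy consumer of a config value annotated List[int].
    return len(upsample_scales)


build_decoder([2, 2, 4, 16])  # OK: a list matches List[int]

try:
    build_decoder((2, 2, 4, 16))  # a tuple is rejected at call time
except TypeCheckError as err:
    print(f"rejected: {err}")
```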
@@ -216,10 +217,10 @@ def get_test_data(): "use_conformer_conv_in_text_encoder": True, "decoder_kernel_size": 7, "decoder_channels": 16, - "decoder_upsample_scales": (2, 2, 4, 16), - "decoder_upsample_kernel_sizes": (4, 4, 8, 32), - "decoder_resblock_kernel_sizes": (3, 5), - "decoder_resblock_dilations": [(1, 3), (1, 3)], + "decoder_upsample_scales": [2, 2, 4, 16], + "decoder_upsample_kernel_sizes": [4, 4, 8, 32], + "decoder_resblock_kernel_sizes": [3, 5], + "decoder_resblock_dilations": [[1, 3], [1, 3]], "use_weight_norm_in_decoder": True, "posterior_encoder_kernel_size": 5, "posterior_encoder_layers": 2, @@ -343,10 +344,10 @@ def get_test_data(): "use_conformer_conv_in_text_encoder": True, "decoder_kernel_size": 7, "decoder_channels": 16, - "decoder_upsample_scales": (16, 16), - "decoder_upsample_kernel_sizes": (32, 32), - "decoder_resblock_kernel_sizes": (3, 5), - "decoder_resblock_dilations": [(1, 3), (1, 3)], + "decoder_upsample_scales": [16, 16], + "decoder_upsample_kernel_sizes": [32, 32], + "decoder_resblock_kernel_sizes": [3, 5], + "decoder_resblock_dilations": [[1, 3], [1, 3]], "use_weight_norm_in_decoder": True, "posterior_encoder_kernel_size": 5, "posterior_encoder_layers": 2, @@ -440,12 +441,12 @@ def get_test_data(): "use_conformer_conv_in_text_encoder": True, "decoder_kernel_size": 7, "decoder_channels": 16, - "decoder_downsample_scales": (16, 16), - "decoder_downsample_kernel_sizes": (32, 32), - "decoder_upsample_scales": (16, 16), - "decoder_upsample_kernel_sizes": (32, 32), - "decoder_resblock_kernel_sizes": (3, 5), - "decoder_resblock_dilations": [(1, 3), (1, 3)], + "decoder_downsample_scales": [16, 16], + "decoder_downsample_kernel_sizes": [32, 32], + "decoder_upsample_scales": [16, 16], + "decoder_upsample_kernel_sizes": [32, 32], + "decoder_resblock_kernel_sizes": [3, 5], + "decoder_resblock_dilations": [[1, 3], [1, 3]], "use_weight_norm_in_decoder": True, "posterior_encoder_kernel_size": 5, "posterior_encoder_layers": 2, @@ -539,12 +540,12 @@ def get_test_data(): "use_conformer_conv_in_text_encoder": True, "decoder_kernel_size": 7, "decoder_channels": 16, - "decoder_downsample_scales": (16, 16), - "decoder_downsample_kernel_sizes": (32, 32), - "decoder_upsample_scales": (16, 16), - "decoder_upsample_kernel_sizes": (32, 32), - "decoder_resblock_kernel_sizes": (3, 5), - "decoder_resblock_dilations": [(1, 3), (1, 3)], + "decoder_downsample_scales": [16, 16], + "decoder_downsample_kernel_sizes": [32, 32], + "decoder_upsample_scales": [16, 16], + "decoder_upsample_kernel_sizes": [32, 32], + "decoder_resblock_kernel_sizes": [3, 5], + "decoder_resblock_dilations": [[1, 3], [1, 3]], "use_weight_norm_in_decoder": True, "posterior_encoder_kernel_size": 5, "posterior_encoder_layers": 2, @@ -638,12 +639,12 @@ def get_test_data(): "use_conformer_conv_in_text_encoder": True, "decoder_kernel_size": 7, "decoder_channels": 16, - "decoder_downsample_scales": (16, 16), - "decoder_downsample_kernel_sizes": (32, 32), - "decoder_upsample_scales": (16, 16), - "decoder_upsample_kernel_sizes": (32, 32), - "decoder_resblock_kernel_sizes": (3, 5), - "decoder_resblock_dilations": [(1, 3), (1, 3)], + "decoder_downsample_scales": [16, 16], + "decoder_downsample_kernel_sizes": [32, 32], + "decoder_upsample_scales": [16, 16], + "decoder_upsample_kernel_sizes": [32, 32], + "decoder_resblock_kernel_sizes": [3, 5], + "decoder_resblock_dilations": [[1, 3], [1, 3]], "use_weight_norm_in_decoder": True, "posterior_encoder_kernel_size": 5, "posterior_encoder_layers": 2, @@ -699,12 +700,12 @@ def 
get_test_data(): "use_conformer_conv_in_text_encoder": True, "decoder_kernel_size": 7, "decoder_channels": 16, - "decoder_downsample_scales": (16, 16), - "decoder_downsample_kernel_sizes": (32, 32), - "decoder_upsample_scales": (16, 16), - "decoder_upsample_kernel_sizes": (32, 32), - "decoder_resblock_kernel_sizes": (3, 5), - "decoder_resblock_dilations": [(1, 3), (1, 3)], + "decoder_downsample_scales": [16, 16], + "decoder_downsample_kernel_sizes": [32, 32], + "decoder_upsample_scales": [16, 16], + "decoder_upsample_kernel_sizes": [32, 32], + "decoder_resblock_kernel_sizes": [3, 5], + "decoder_resblock_dilations": [[1, 3], [1, 3]], "use_weight_norm_in_decoder": True, "posterior_encoder_kernel_size": 5, "posterior_encoder_layers": 2, @@ -763,10 +764,10 @@ def make_vits_generator_args(**kwargs): "use_conformer_conv_in_text_encoder": True, "decoder_kernel_size": 7, "decoder_channels": 16, - "decoder_upsample_scales": (16, 16), - "decoder_upsample_kernel_sizes": (32, 32), - "decoder_resblock_kernel_sizes": (3, 5), - "decoder_resblock_dilations": [(1, 3), (1, 3)], + "decoder_upsample_scales": [16, 16], + "decoder_upsample_kernel_sizes": [32, 32], + "decoder_resblock_kernel_sizes": [3, 5], + "decoder_resblock_dilations": [[1, 3], [1, 3]], "use_weight_norm_in_decoder": True, "posterior_encoder_kernel_size": 5, "posterior_encoder_layers": 2, @@ -887,6 +888,10 @@ def make_vits_loss_args(**kwargs): get_test_data(), ) def test_vits_is_trainable_and_decodable(gen_dict, dis_dict, loss_dict): + try: + from scipy.signal import kaiser + except ImportError: + pytest.skip("Compatibility issue with scipy.") idim = 10 odim = 5 gen_args = make_vits_generator_args(**gen_dict) @@ -1072,6 +1077,10 @@ def test_vits_is_trainable_and_decodable(gen_dict, dis_dict, loss_dict): def test_multi_speaker_vits_is_trainable_and_decodable( gen_dict, dis_dict, loss_dict, spks, spk_embed_dim, langs ): + try: + from scipy.signal import kaiser + except ImportError: + pytest.skip("Compatibility issue with scipy.") idim = 10 odim = 5 global_channels = 8 diff --git a/test/espnet2/gan_tts/hifigan/test_hifigan.py b/test/espnet2/gan_tts/hifigan/test_hifigan.py index a71f77fd84c..642a8a7c7f9 100644 --- a/test/espnet2/gan_tts/hifigan/test_hifigan.py +++ b/test/espnet2/gan_tts/hifigan/test_hifigan.py @@ -186,6 +186,10 @@ def test_hifigan_generator_and_discriminator_and_loss( not is_parallel_wavegan_available, reason="parallel_wavegan is not installed." ) def test_parallel_wavegan_compatibility(): + try: + from scipy.signal import kaiser + except ImportError: + pytest.skip("Kaiser window was not found at scipy.signal. Check scipy version.") from parallel_wavegan.models import HiFiGANGenerator as PWGHiFiGANGenerator model_pwg = PWGHiFiGANGenerator(**make_hifigan_generator_args()) diff --git a/test/espnet2/gan_tts/jets/test_jets.py b/test/espnet2/gan_tts/jets/test_jets.py index ee8e286a841..5039da50d40 100644 --- a/test/espnet2/gan_tts/jets/test_jets.py +++ b/test/espnet2/gan_tts/jets/test_jets.py @@ -189,7 +189,8 @@ def make_jets_loss_args(**kwargs): # NOTE(kan-bayashi): first forward requires jit compile # so a little bit more time is needed to run. Therefore, # here we extend execution timeout from 2 sec to 8 sec. -@pytest.mark.execution_timeout(8) +# NOTE(Nelson): 8 sec. is not enough. Extending to 15. +@pytest.mark.execution_timeout(15) @pytest.mark.skipif( "1.6" in torch.__version__, reason="group conv in pytorch 1.6 has an issue. 
" diff --git a/test/espnet2/gan_tts/melgan/test_melgan.py b/test/espnet2/gan_tts/melgan/test_melgan.py index 1219565ee2f..eb296f9eaa6 100644 --- a/test/espnet2/gan_tts/melgan/test_melgan.py +++ b/test/espnet2/gan_tts/melgan/test_melgan.py @@ -135,6 +135,10 @@ def test_melgan_generator_and_discriminator(dict_g, dict_d): not is_parallel_wavegan_available, reason="parallel_wavegan is not installed." ) def test_parallel_wavegan_compatibility(): + try: + from scipy.signal import kaiser + except ImportError: + pytest.skip("Kaiser window was not found at scipy.signal. Check scipy version.") from parallel_wavegan.models import MelGANGenerator as PWGMelGANGenerator model_pwg = PWGMelGANGenerator(**make_melgan_generator_args()) diff --git a/test/espnet2/gan_tts/parallel_wavegan/test_parallel_wavegan.py b/test/espnet2/gan_tts/parallel_wavegan/test_parallel_wavegan.py index 85107d0c75a..155561fcc1d 100644 --- a/test/espnet2/gan_tts/parallel_wavegan/test_parallel_wavegan.py +++ b/test/espnet2/gan_tts/parallel_wavegan/test_parallel_wavegan.py @@ -139,6 +139,10 @@ def test_parallel_wavegan_generator_and_discriminator(dict_g, dict_d): not is_parallel_wavegan_available, reason="parallel_wavegan is not installed." ) def test_parallel_wavegan_compatibility(): + try: + from scipy.signal import kaiser + except ImportError: + pytest.skip("Kaiser window was not found at scipy.signal. Check scipy version.") from parallel_wavegan.models import ( ParallelWaveGANGenerator as PWGParallelWaveGANGenerator, ) diff --git a/test/espnet2/gan_tts/style_melgan/test_style_melgan.py b/test/espnet2/gan_tts/style_melgan/test_style_melgan.py index 5291e4913f3..56e955fb7ec 100644 --- a/test/espnet2/gan_tts/style_melgan/test_style_melgan.py +++ b/test/espnet2/gan_tts/style_melgan/test_style_melgan.py @@ -125,6 +125,10 @@ def test_style_melgan_trainable(dict_g, dict_d): not is_parallel_wavegan_available, reason="parallel_wavegan is not installed." ) def test_parallel_wavegan_compatibility(): + try: + from scipy.signal import kaiser + except ImportError: + pytest.skip("Kaiser window was not found at scipy.signal. Check scipy version.") from parallel_wavegan.models import StyleMelGANGenerator as PWGStyleMelGANGenerator model_pwg = PWGStyleMelGANGenerator(**make_style_melgan_generator_args()) diff --git a/test/espnet2/gan_tts/vits/test_vits.py b/test/espnet2/gan_tts/vits/test_vits.py index 0211f1ad713..7b6465b6ee2 100644 --- a/test/espnet2/gan_tts/vits/test_vits.py +++ b/test/espnet2/gan_tts/vits/test_vits.py @@ -274,6 +274,7 @@ def make_vits_loss_args(**kwargs): return defaults +@pytest.mark.execution_timeout(10) @pytest.mark.skipif( "1.6" in torch.__version__, reason="group conv in pytorch 1.6 has an issue. " @@ -349,6 +350,7 @@ def test_vits_is_trainable_and_decodable(gen_dict, dis_dict, loss_dict): assert output_dict["wav"].size(0) == inputs["feats"].size(0) * upsample_factor +@pytest.mark.execution_timeout(10) @pytest.mark.skipif( "1.6" in torch.__version__, reason="Group conv in pytorch 1.6 has an issue. 
" diff --git a/test/espnet2/layers/test_augmentation.py b/test/espnet2/layers/test_augmentation.py index 95f1c4651d9..ca0fac3348e 100644 --- a/test/espnet2/layers/test_augmentation.py +++ b/test/espnet2/layers/test_augmentation.py @@ -24,20 +24,20 @@ def test_lowpass_filtering(): audio = torch.randn(1000) sr = 8000 - ret = lowpass_filtering(audio, sr, cutoff_freq=1000, Q=0.707) + _ = lowpass_filtering(audio, sr, cutoff_freq=1000, Q=0.707) def test_highpass_filtering(): audio = torch.randn(1000) sr = 8000 - ret = highpass_filtering(audio, sr, cutoff_freq=3000, Q=0.707) + _ = highpass_filtering(audio, sr, cutoff_freq=3000, Q=0.707) @pytest.mark.parametrize("const_skirt_gain", [True, False]) def test_bandpass_filtering(const_skirt_gain): audio = torch.randn(1000) sr = 8000 - ret = bandpass_filtering( + _ = bandpass_filtering( audio, sr, center_freq=2000, Q=0.707, const_skirt_gain=const_skirt_gain ) @@ -45,76 +45,76 @@ def test_bandpass_filtering(const_skirt_gain): def test_bandreject_filtering(): audio = torch.randn(2000) sr = 8000 - ret = bandreject_filtering(audio, sr, center_freq=2000, Q=0.707) + _ = bandreject_filtering(audio, sr, center_freq=2000, Q=0.707) def test_contrast(): audio = torch.randn(1000) sr = 8000 - ret = contrast(audio, sr, enhancement_amount=75) + _ = contrast(audio, sr, enhancement_amount=75) def test_equalization_filtering(): audio = torch.randn(1000) sr = 8000 - ret = equalization_filtering(audio, sr, center_freq=2000, gain=0, Q=0.707) + _ = equalization_filtering(audio, sr, center_freq=2000, gain=0, Q=0.707) @pytest.mark.parametrize("n_steps", [-4, 5]) def test_pitch_shift(n_steps): audio = torch.randn(1000) sr = 2000 - ret = pitch_shift(audio, sr, n_steps=n_steps, bins_per_octave=12) + _ = pitch_shift(audio, sr, n_steps=n_steps, bins_per_octave=12) @pytest.mark.parametrize("factor", [0.9, 1.1]) def test_speed_perturb(factor): audio = torch.randn(1000) sr = 8000 - ret = speed_perturb(audio, sr, factor=factor) + _ = speed_perturb(audio, sr, factor=factor) @pytest.mark.parametrize("factor", [0.9, 1.1]) def test_time_stretch(factor): audio = torch.randn(1000) sr = 8000 - ret = time_stretch(audio, sr, factor=factor) + _ = time_stretch(audio, sr, factor=factor) def test_preemphasis(): audio = torch.randn(1000) sr = 8000 - ret = preemphasis(audio, sr, coeff=0.97) + _ = preemphasis(audio, sr, coeff=0.97) def test_deemphasis(): audio = torch.randn(1000) sr = 8000 - ret = deemphasis(audio, sr, coeff=0.97) + _ = deemphasis(audio, sr, coeff=0.97) def test_clipping(): audio = torch.randn(1000) sr = 8000 - ret = clipping(audio, sr, min_quantile=0.1, max_quantile=0.9) + _ = clipping(audio, sr, min_quantile=0.1, max_quantile=0.9) def test_polarity_inverse(): audio = torch.randn(1000) sr = 8000 - ret = polarity_inverse(audio, sr) + _ = polarity_inverse(audio, sr) def test_reverse(): audio = torch.randn(1000) sr = 8000 - ret = reverse(audio, sr) + _ = reverse(audio, sr) def test_phase_corruption(): audio = torch.randn(1000) sr = 8000 - ret = corrupt_phase(audio, sr) + _ = corrupt_phase(audio, sr) @pytest.mark.parametrize("apply_n", [[1, 1], [1, 4]]) @@ -141,4 +141,4 @@ def test_data_augmentation(apply_n): data_aug = DataAugmentation(effects, apply_n) audio = torch.randn(1000) sr = 8000 - ret = data_aug(audio, sr) + _ = data_aug(audio, sr) diff --git a/test/espnet2/layers/test_create_adapter.py b/test/espnet2/layers/test_create_adapter.py index 046aeca47dd..37af99431cf 100644 --- a/test/espnet2/layers/test_create_adapter.py +++ b/test/espnet2/layers/test_create_adapter.py @@ -1,10 
+1,8 @@ import sys -from typing import List import pytest import torch from packaging.version import parse as V -from typeguard import check_argument_types from espnet2.asr.decoder.transformer_decoder import TransformerDecoder from espnet2.asr.frontend.s3prl import S3prlFrontend diff --git a/test/espnet2/layers/test_create_adapter_fn.py b/test/espnet2/layers/test_create_adapter_fn.py index 0fc1de9d5c3..3450531e193 100644 --- a/test/espnet2/layers/test_create_adapter_fn.py +++ b/test/espnet2/layers/test_create_adapter_fn.py @@ -1,16 +1,14 @@ import sys -from typing import List import pytest import torch from packaging.version import parse as V -from typeguard import check_argument_types +from typeguard import TypeCheckError from espnet2.asr.decoder.transformer_decoder import TransformerDecoder from espnet2.asr.frontend.s3prl import S3prlFrontend from espnet2.layers.create_adapter_fn import create_houlsby_adapter, create_lora_adapter -from espnet2.layers.houlsby_adapter_layer import ( - Houlsby_Adapter, +from espnet2.layers.houlsby_adapter_layer import ( # Houlsby_Adapter, HoulsbyTransformerSentenceEncoderLayer, ) @@ -215,7 +213,7 @@ def test_create_lora_adapter_unsupport_target(rank, alpha, target_modules): @pytest.mark.parametrize("rank, alpha, target_modules", [(2, 4, 5)]) def test_create_lora_adapter_invalid_type(rank, alpha, target_modules): model = init_decoder_model() - with pytest.raises(TypeError): + with pytest.raises(TypeCheckError): create_lora_adapter( model=model, rank=rank, alpha=alpha, target_modules=target_modules ) diff --git a/test/espnet2/layers/test_houlsby_adapter_layer.py b/test/espnet2/layers/test_houlsby_adapter_layer.py index 75d74481ae5..535099d7fd1 100644 --- a/test/espnet2/layers/test_houlsby_adapter_layer.py +++ b/test/espnet2/layers/test_houlsby_adapter_layer.py @@ -1,15 +1,14 @@ import sys -from typing import List -from unittest.mock import MagicMock, patch import pytest import torch from packaging.version import parse as V -from typeguard import check_argument_types try: - import s3prl - from s3prl.upstream.wav2vec2.wav2vec2_model import TransformerSentenceEncoderLayer + import s3prl # noqa + from s3prl.upstream.wav2vec2.wav2vec2_model import ( # noqa + TransformerSentenceEncoderLayer, + ) is_s3prl_available = True except ImportError: diff --git a/test/espnet2/lm/test_espnet_multitask.py b/test/espnet2/lm/test_espnet_multitask.py index 631b340cc2a..2ee547ce19a 100644 --- a/test/espnet2/lm/test_espnet_multitask.py +++ b/test/espnet2/lm/test_espnet_multitask.py @@ -2,10 +2,7 @@ import torch from espnet2.lm.espnet_model_multitask import ESPnetMultitaskLanguageModel -from espnet2.lm.seq_rnn_lm import SequentialRNNLM from espnet2.lm.transformer_lm import TransformerLM -from espnet.nets.batch_beam_search import BatchBeamSearch -from espnet.nets.beam_search import BeamSearch @pytest.mark.parametrize("arch", [TransformerLM]) diff --git a/test/espnet2/s2st/test_s2st_espnet_model.py b/test/espnet2/s2st/test_s2st_espnet_model.py index db0b41e8657..b5f26e710e9 100644 --- a/test/espnet2/s2st/test_s2st_espnet_model.py +++ b/test/espnet2/s2st/test_s2st_espnet_model.py @@ -9,7 +9,6 @@ from espnet2.s2st.espnet_model import ESPnetS2STModel from espnet2.s2st.losses.attention_loss import S2STAttentionLoss from espnet2.s2st.losses.ctc_loss import S2STCTCLoss -from espnet2.s2st.losses.guided_attention_loss import S2STGuidedAttentionLoss from espnet2.s2st.losses.tacotron_loss import S2STTacotron2Loss from espnet2.s2st.synthesizer.discrete_synthesizer import 
TransformerDiscreteSynthesizer from espnet2.s2st.synthesizer.translatotron import Translatotron diff --git a/test/espnet2/schedulers/test_warmup_reducelronplateau.py b/test/espnet2/schedulers/test_warmup_reducelronplateau.py index fabe9477bb0..abd92572336 100644 --- a/test/espnet2/schedulers/test_warmup_reducelronplateau.py +++ b/test/espnet2/schedulers/test_warmup_reducelronplateau.py @@ -1,4 +1,3 @@ -import numpy as np import torch from espnet2.schedulers.warmup_reducelronplateau import WarmupReduceLROnPlateau diff --git a/test/espnet2/schedulers/test_warmup_step_lr.py b/test/espnet2/schedulers/test_warmup_step_lr.py index 70cd37bb7e0..be2aa11a714 100644 --- a/test/espnet2/schedulers/test_warmup_step_lr.py +++ b/test/espnet2/schedulers/test_warmup_step_lr.py @@ -1,4 +1,3 @@ -import numpy as np import torch from espnet2.schedulers.warmup_step_lr import WarmupStepLR diff --git a/test/espnet2/text/test_hugging_face_token_id_converter.py b/test/espnet2/text/test_hugging_face_token_id_converter.py index 730d820ccb7..5202e150e59 100644 --- a/test/espnet2/text/test_hugging_face_token_id_converter.py +++ b/test/espnet2/text/test_hugging_face_token_id_converter.py @@ -8,6 +8,7 @@ def hugging_face_token_id_converter(request): return HuggingFaceTokenIDConverter(request.param) +@pytest.mark.execution_timeout(10) def test_init_pythia(): id_converter = HuggingFaceTokenIDConverter("EleutherAI/pythia-410m-deduped") assert id_converter.get_num_vocabulary_size() == 50254 diff --git a/test/espnet2/text/test_whisper_token_id_converter.py b/test/espnet2/text/test_whisper_token_id_converter.py index 749b5b134ec..ce8bf99b7a2 100644 --- a/test/espnet2/text/test_whisper_token_id_converter.py +++ b/test/espnet2/text/test_whisper_token_id_converter.py @@ -130,6 +130,6 @@ def test_tokens2ids_add_tokens(tmp_path): tknlist_path.touch() with open(tknlist_path, "w") as f: f.write("command:yes\n") - id_converter = OpenAIWhisperTokenIDConverter( + _ = OpenAIWhisperTokenIDConverter( "whisper_multilingual", added_tokens_txt=str(tknlist_path) ) diff --git a/test/espnet2/text/test_whisper_tokenizer.py b/test/espnet2/text/test_whisper_tokenizer.py index f23fade322f..c7a6bd5141d 100644 --- a/test/espnet2/text/test_whisper_tokenizer.py +++ b/test/espnet2/text/test_whisper_tokenizer.py @@ -90,6 +90,6 @@ def test_tokenization_add_tokens(tmp_path): tknlist_path.touch() with open(tknlist_path, "w") as f: f.write("command:yes\n") - tokenizer = OpenAIWhisperTokenizer( + _ = OpenAIWhisperTokenizer( "whisper_multilingual", added_tokens_txt=str(tknlist_path) ) diff --git a/test/espnetez/test_ez.py b/test/espnetez/test_ez.py index db26d9614fb..a03a41728b8 100644 --- a/test/espnetez/test_ez.py +++ b/test/espnetez/test_ez.py @@ -1,12 +1,10 @@ # Copyright 2024 Masao Someki # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -import os import shutil import tempfile from pathlib import Path import pytest -import torch import espnetez as ez from espnet2.tasks.asr import ASRTask diff --git a/test/espnetez/test_integration_espnetez_ft.py b/test/espnetez/test_integration_espnetez_ft.py index 5ee7a18014d..706bd36065d 100644 --- a/test/espnetez/test_integration_espnetez_ft.py +++ b/test/espnetez/test_integration_espnetez_ft.py @@ -126,7 +126,7 @@ def build_model_fn(args): tokenizer = getattr(pretrained_model, "tokenizer", None) finetune_config = ez.config.update_finetune_config( - args.task, vars(pretrain_config), f"../asr1/conf/finetune_with_lora.yaml" + args.task, vars(pretrain_config), "../asr1/conf/finetune_with_lora.yaml" ) 
finetune_config["max_epoch"] = 2 diff --git a/test/test_nets_utils.py b/test/test_nets_utils.py index c17aa8e383d..8bdd5a8f86a 100644 --- a/test/test_nets_utils.py +++ b/test/test_nets_utils.py @@ -36,6 +36,7 @@ def test_make_pad_mask(test_case): @pytest.mark.parametrize("test_case", test_cases) def test_trace_make_pad_mask(test_case): """Test if onnx-convertible make_pad_mask can be traced with torch.jit.trace + If it's traceable then it can be exported to ONNX. """ args, input_names, kwargs_trace, kwargs_non_trace = get_args(test_case.copy())
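Two recurring patterns in this diff deserve a worked example. First, the core typeguard migration: the typeguard 2.x idiom `assert check_argument_types()` inside a function body is replaced everywhere by the `@typechecked` decorator, and implicit-`Optional` defaults such as `win_length: int = None` are spelled out as `Optional[int]`, since typeguard >= 3 no longer treats a `None` default as implying `Optional`. A minimal before/after sketch under those assumptions (`ToyLoss` is illustrative, not an ESPnet class):

```python
from typing import Optional

from typeguard import typechecked


class ToyLoss:
    # typeguard 2.x style, removed throughout this diff:
    #
    #     def __init__(self, weight: float = 1.0, win_length: int = None):
    #         assert check_argument_types()
    #
    # typeguard >= 3 style: decorate the callable, make Optional explicit.
    @typechecked
    def __init__(self, weight: float = 1.0, win_length: Optional[int] = None):
        self.weight = weight
        self.win_length = win_length


ToyLoss(weight=0.5)                 # passes
ToyLoss(weight=0.5, win_length=16)  # passes; the None default is explicit
```

Note the decorator ordering the diff uses: `@classmethod` and `@number_updates.setter` stay outermost, with `@typechecked` applied directly to the underlying function.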
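Second, two smaller typeguard API shifts visible above: `check_type()` lost its leading argument-name parameter (see the `build_dataclass` hunk), and violations now raise `typeguard.TypeCheckError` instead of `TypeError` (hence the updated `pytest.raises(TypeCheckError)` expectations in the tests). A quick sketch, again assuming typeguard >= 3:

```python
from typeguard import TypeCheckError, check_type

# typeguard 2.x: check_type("name", value, expected_type)
# typeguard >= 3: the argument-name parameter is gone.
check_type(3.14, float)  # returns the value when it conforms

try:
    check_type("oops", int)
except TypeCheckError:  # raised where typeguard 2.x raised TypeError
    print("type mismatch detected")
```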
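Finally, the repeated `from scipy.signal import kaiser` probes in the parallel_wavegan compatibility tests: newer SciPy releases drop the legacy `scipy.signal.kaiser` alias in favor of `scipy.signal.windows.kaiser`, and parallel_wavegan still imports the old name, so these tests skip rather than crash at import time. The same guard could be factored into a helper, sketched below (`legacy_kaiser_available` is a hypothetical name, not part of this diff):

```python
import pytest


def legacy_kaiser_available() -> bool:
    # parallel_wavegan imports scipy.signal.kaiser, which newer SciPy
    # releases only expose as scipy.signal.windows.kaiser.
    try:
        from scipy.signal import kaiser  # noqa: F401
    except ImportError:
        return False
    return True


@pytest.mark.skipif(
    not legacy_kaiser_available(),
    reason="scipy.signal.kaiser unavailable; parallel_wavegan would fail to import",
)
def test_parallel_wavegan_compatibility():
    ...
```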