Skip to content

Commit

Permalink
Merge from master branch and solve conflicts.
Browse files Browse the repository at this point in the history
  • Loading branch information
pengchengguo committed Sep 30, 2023
2 parents cc55e58 + 8457fe2 commit d1847a8
Show file tree
Hide file tree
Showing 86 changed files with 2,614 additions and 80 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci_on_centos7.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
# To avoid UnicodeEncodeError for python<=3.6
LC_ALL: en_US.UTF-8
steps:
- uses: actions/checkout@v3.6.0
- name: check OS
run: cat /etc/os-release
- name: install dependencies
Expand Down
211 changes: 178 additions & 33 deletions doc/paper/espnet-se++/paper.bib
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
@inproceedings{Li:2021,
doi = {10.1109/slt48900.2021.9383615},
title={{ESPnet-SE}: End-to-end speech enhancement and separation toolkit designed for {ASR} integration},
author={Li, C. and Shi, J. and Zhang, W. and Subramanian, A. S. and Chang, X. and Kamo, N. and Hira, M. and Hayashi, T. and Boeddeker, C. and Chen, Z. and Watanabe, S.},
booktitle={2021 IEEE Spoken Language Technology Workshop (SLT)},
Expand All @@ -7,17 +8,166 @@ @inproceedings{Li:2021
organization={IEEE}
}

@inproceedings{Hershey:2016,
doi = {10.1109/icassp.2016.7471631},
title={Deep clustering: Discriminative embeddings for segmentation and separation},
author={Hershey, J. R. and Chen, Z. and Le Roux, J. and Watanabe, S.},
booktitle={2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={31--35},
year={2016},
organization={IEEE}
}

@inproceedings{Chen:2017,
doi = {10.1109/icassp.2017.7952155},
title={Deep attractor network for single-microphone speaker separation},
author={Chen, Z. and Luo, Y. and Mesgarani, N.},
booktitle={2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={246--250},
year={2017},
organization={IEEE}
}

@inproceedings{Hu:2020,
doi = {10.21437/interspeech.2020-2537},
title={{DCCRN}: Deep complex convolution recurrent network for phase-aware speech enhancement},
author={Hu, Y. and Liu, Y. and Lv, S. and Xing, M. and Zhang, S. and Fu, Y. and Wu, J. and Zhang, B. and Xie, L.},
booktitle={Proceedings of Interspeech},
pages={2472--2476},
year={2020}
}

@article{Tan:2021,
doi = {10.1109/taslp.2021.3082318},
title={Deep learning based real-time speech enhancement for dual-microphone mobile phones},
author={Tan, K. and Zhang, X. and Wang, D.},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={29},
pages={1853--1863},
year={2021},
publisher={IEEE}
}
@inproceedings{Li:2022,
doi = {10.1109/icassp43922.2022.9746372},
url = {https://doi.org/10.1109%2Ficassp43922.2022.9746372},
pages={681--685},
year = 2022,
month = {may},
publisher = {{IEEE}},
author = {Li, C. and Yang, L. and Wang, W. and Qian, Y.},
title = {{SkiM}: Skipping Memory Lstm for Low-Latency Real-Time Continuous Speech Separation},
booktitle = {2022 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})}
}

@inproceedings{Dang:2022,
doi = {10.1109/icassp43922.2022.9746171},
url = {https://doi.org/10.1109%2Ficassp43922.2022.9746171},
pages={6857--6861},
year = 2022,
month = {may},
publisher = {{IEEE}},
author = {Dang, F. and Chen, H. and Zhang, P.},
title = {{DPT}-{FSNet}: Dual-Path Transformer Based Full-Band and Sub-Band Fusion Network for Speech Enhancement},
booktitle = {2022 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})}
}

@inproceedings{Takahashi:2019,
doi = {10.21437/interspeech.2019-1550},
url = {https://doi.org/10.21437%2Finterspeech.2019-1550},
pages={1348--1352},
year = 2019,
month = {sep},
publisher = {{ISCA}},
author = {Takahashi, N. and Parthasaarathy, S. and Goswami, N. and Mitsufuji, Y.},
title = {Recursive Speech Separation for Unknown Number of Speakers},
booktitle = {Interspeech 2019}
}

@inproceedings{Luo:2019a,
doi = {10.1109/asru46091.2019.9003849},
url = {https://doi.org/10.1109%2Fasru46091.2019.9003849},
pages={260--267},
year = 2019,
month = {dec},
publisher = {{IEEE}},
author = {Luo, Y. and Han, C. and Mesgarani, N. and Ceolini, E. and Liu, S.},
title = {{FaSNet}: Low-Latency Adaptive Beamforming for Multi-Microphone Audio Processing},
booktitle = {2019 {IEEE} Automatic Speech Recognition and Understanding Workshop ({ASRU})}
}

@inproceedings{Lu:2022a,
doi = {10.1109/icassp43922.2022.9747146},
url = {https://doi.org/10.1109%2Ficassp43922.2022.9747146},
pages={9201--9205},
year = 2022,
month = {may},
publisher = {{IEEE}},
author = {Lu, Y. J. and Cornell, S. and Chang, X. and Zhang, W. and Li, C. and Ni, Z. and Wang, Z. and Watanabe, S.},
title = {Towards Low-Distortion Multi-Channel Speech Enhancement: The {ESPNET}-Se Submission to the {L3DAS22} Challenge},
booktitle = {2022 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})}
}

@inproceedings{Luo:2018,
doi = {10.1109/icassp.2018.8462116},
url = {https://doi.org/10.1109%2Ficassp.2018.8462116},
pages={696--700},
year = 2018,
month = {apr},
publisher = {{IEEE}},
author = {Luo, Y. and Mesgarani, N.},
title = {{TaSNet}: Time-Domain Audio Separation Network for Real-Time, Single-Channel Speech Separation},
booktitle = {2018 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})}
}

@inproceedings{Le:2019,
doi = {10.1109/icassp.2019.8683855},
url = {https://doi.org/10.1109%2Ficassp.2019.8683855},
pages={626--630},
year = 2019,
month = {may},
publisher = {{IEEE}},
author = {Le Roux, J. and Wisdom, S. and Erdogan, H. and Hershey, J. R.},
title = {{SDR} {\textendash} Half-baked or Well Done?},
booktitle = {2019 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})}
}

@inproceedings{Boeddeker:2021,
doi = {10.1109/icassp39728.2021.9414661},
url = {https://doi.org/10.1109%2Ficassp39728.2021.9414661},
pages={8428--8432},
year = 2021,
month = {jun},
publisher = {{IEEE}},
author = {Boeddeker, C. and Zhang, W. and Nakatani, T. and Kinoshita, K. and Ochiai, T. and Delcroix, M. and Kamo, N. and Qian, Y. and Haeb-Umbach, R.},
title = {Convolutive Transfer Function Invariant {SDR} Training Criteria for Multi-Channel Reverberant Speech Separation},
booktitle = {2021 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})}
}

@inproceedings{Scheibler:2022,
doi = {10.1109/icassp43922.2022.9747473},
url = {https://doi.org/10.1109%2Ficassp43922.2022.9747473},
pages={701--705},
year = 2022,
month = {may},
publisher = {{IEEE}},
author = {Scheibler, R.},
title = {{SDR} {\textemdash} Medium Rare with Fast Computations},
booktitle = {2022 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})}
}


@inproceedings{Lu:2022b,
doi = {10.21437/interspeech.2022-10727},
title={{ESPnet-SE++}: Speech Enhancement for Robust Speech Recognition, Translation, and Understanding},
author={Lu, Y. J. and Chang, X. and Li, C. and Zhang, W. and Cornell, S. and Ni, Z. and Masuyama, Y. and Yan, B. and Scheibler, R. and Wang, Z. Q. and Tsao, Y. and Qian, Y. and Watanabe, S.},
booktitle={Proceedings of Interspeech},
pages={5458--5462},
year={2022},
}

@inproceedings{Hayashi:2020,
doi = {10.1109/icassp40776.2020.9053512},
title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
author={Hayashi, T. and Yamamoto, R. and Inoue, K. and Yoshimura, T. and Watanabe, S. and Toda, T. and Takeda, K. and Zhang, Y. and Tan, X.},
booktitle={2020 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
pages={7654--7658},
Expand All @@ -26,6 +176,7 @@ @inproceedings{Hayashi:2020
}

@inproceedings{Inaguma:2020,
doi = {10.18653/v1/2020.acl-demos.34},
title = {{ESP}net-{ST}: All-in-One Speech Translation Toolkit},
author = {Inaguma, H. and Kiyono, S. and Duh, K. and Karita, S. and Soplin, N. E. Y. and Hayashi, T. and Watanabe, S.},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations},
Expand All @@ -36,7 +187,8 @@ @inproceedings{Inaguma:2020


@inproceedings{Arora:2022,
doi = {10.1109/icassp43922.2022.9747674},
title={{ESPnet-SLU}: Advancing spoken language understanding through {ESPnet}},
author={Arora, S. and Dalmia, S. and Denisov, P. and Chang, X. and Ueda, Y. and Peng, Y. and Zhang, Y. and Kumar, S. and Ganesan, K. and Yan, B. and Watanabe, S.},
booktitle={2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={7167--7171},
Expand All @@ -46,14 +198,16 @@ @inproceedings{Arora:2022

@inproceedings{Watanabe:2018,
author={Watanabe, S. and Hori, T. and Karita, S. and Hayashi, T. and Nishitoba, J. and Unno, Y. and Soplin, N. E. Y. and Heymann, J. and Wiesner, M. and Chen, N. and Renduchintala, A. and Ochiai, T.},
doi = {10.21437/interspeech.2018-1456},
title={{ESPnet}: End-to-End Speech Processing Toolkit},
booktitle={Proceedings of Interspeech},
pages={2207--2211},
year={2018},
}


@inproceedings{Manilow:2018,
title={The Northwestern University Source Separation Library},
author={Manilow, E. and Seetharaman, P. and Pardo, B.},
booktitle={International Society for Music Information Retrieval (ISMIR)},
Expand All @@ -62,13 +216,14 @@ @inproceedings{Manilow:2018
}

@article{Ni:2019,
title={{ONSSEN}: an open-source speech separation and enhancement library},
author={Ni, Zhaoheng and Mandel, Michael I.},
journal={arXiv preprint arXiv:1911.00982},
year={2019}
}

@inproceedings{Pariente:2020,
doi = {10.21437/interspeech.2020-1673},
title={Asteroid: the PyTorch-based audio source separation toolkit for researchers},
author={Pariente, M. and Cornell, S. and Cosentino, J. and Sivasankaran, S. and Tzinis, E. and Heitkaemper, J. and Olvera, M. and St{\"o}ter, F. R. and Hu, M. and Mart{\'\i}n-Do{\~n}as, J. M. and Ditter, D. and Frank, A. and Deleforge, A. and Vincent, E. },
booktitle={Proceedings of Interspeech},
Expand All @@ -84,7 +239,8 @@ @article{Ravanelli:2021
}

@inproceedings{Povey:2011,
title={The {Kaldi} speech recognition toolkit},
author={Povey, D. and Ghoshal, A. and Boulianne, G. and Burget, L. and Glembek, O. and Goel, N. and Hannemann, M. and Motlicek, P. and Qian, Y. and Schwarz, P. and Silovsk{\'y}, J. and Stemmer, G. and Vesel{\'y}, K.},
booktitle={IEEE 2011 workshop on automatic speech recognition and understanding},
number={CONF},
Expand All @@ -94,27 +250,20 @@ @inproceedings{Povey:2011


@inproceedings{Luo:2020,
doi = {10.1109/icassp40776.2020.9054266},
title={Dual-path {RNN}: efficient long sequence modeling for time-domain single-channel speech separation},
author={Luo, Y. and Chen, Z. and Yoshioka, T.},
booktitle={2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={46--50},
year={2020},
organization={IEEE}
}


@article{Luo:2019b,
doi = {10.1109/taslp.2019.2915167},
title={{Conv-TasNet}: Surpassing ideal time--frequency magnitude masking for speech separation},
author={Luo, Y. and Mesgarani, N.},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={27},
number={8},
pages={1256--1266},
Expand All @@ -123,16 +272,8 @@ @article{Luo:2019
}



@article{Taal:2011,
doi = {10.1109/tasl.2011.2114881},
title={An algorithm for intelligibility prediction of time--frequency weighted noisy speech},
author={Taal, C. H. and Hendriks, R. C. and Heusdens, R. and Jensen, J.},
journal={IEEE Transactions on Audio, Speech, and Language Processing},
Expand All @@ -144,6 +285,7 @@ @article{Taal:2011
}

@inproceedings{Rix:2001,
doi = {10.1109/icassp.2001.941023},
title={Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs},
author={Rix, A. W. and Beerends, J. G. and Hollier, M. P. and Hekstra, A. P.},
booktitle={2001 IEEE international conference on acoustics, speech, and signal processing. Proceedings (Cat. No. 01CH37221)},
Expand All @@ -154,6 +296,7 @@ @inproceedings{Rix:2001
}

@inproceedings{Yu:2017,
doi = {10.1109/icassp.2017.7952154},
title={Permutation invariant training of deep models for speaker-independent multi-talker speech separation},
author={Yu, D. and Kolbæk, M. and Tan, Z. H. and Jensen, J.},
booktitle={2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
Expand All @@ -163,9 +306,10 @@ @inproceedings{Yu:2017
}

@article{Towns:2014,
doi = {10.1109/mcse.2014.80},
title={{XSEDE}: accelerating scientific discovery},
author={Towns, J. and Cockerill, T. and Dahan, M. and Foster, I. and Gaither, K. and Grimshaw, A. and Hazlewood, V. and Lathrop, S. and Lifka, D. and Peterson, G. D. and Roskies, R. and Scott, J. R. and Wilkins-Diehr, N.},
journal={Computing in Science \& Engineering},
volume={16},
number={5},
pages={62--74},
Expand All @@ -174,7 +318,8 @@ @article{Towns:2014
}

@inproceedings{Nystrom:2015,
doi = {10.1145/2792745.2792775},
title={Bridges: a uniquely flexible {HPC} resource for new communities and data analytics},
author={Nystrom, N. A. and Levine, M. J. and Roskies, R. Z. and Scott, J. R.},
booktitle={Proceedings of the 2015 XSEDE Conference: Scientific Advancements Enabled by Enhanced Cyberinfrastructure},
pages={1--8},
Expand Down
2 changes: 1 addition & 1 deletion doc/paper/espnet-se++/paper.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ bibliography: paper.bib

# Summary
This paper presents the software design and user interface of ESPnet-SE++, a new speech separation and enhancement (SSE) module of the ESPnet toolkit.
ESPnet-SE++ significantly expands the functionality of ESPnet-SE [@Li:2021] with several new models [@Hershey:2016; @Chen:2017; @Hu:2020; @Tan:2021; @Li:2022; @Dang:2022; @Takahashi:2019; @Luo:2019a; @Lu:2022a], loss functions [@Luo:2018; @Le:2019; @Boeddeker:2021; @Scheibler:2022], and training recipes as shown in [@Lu:2022b]. Crucially, it features a new, redesigned interface, which allows for a flexible combination of SSE front-ends with many downstream tasks, including automatic speech recognition (ASR), speaker diarization (SD), speech translation (ST), and spoken language understanding (SLU).

# Statement of need

Expand Down
4 changes: 3 additions & 1 deletion egs/commonvoice/asr1/local/data_prep.pl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#
# Usage: data_prep.pl /export/data/cv_corpus_v1/cv-valid-train valid_train

use open ':std', ':encoding(UTF-8)'; # Use UTF-8 encoding for all standard streams

if (@ARGV != 3) {
print STDERR "Usage: $0 <path-to-commonvoice-corpus> <dataset> <valid-train|valid-dev|valid-test>\n";
print STDERR "e.g. $0 /export/data/cv_corpus_v1 cv-valid-train valid-train\n";
Expand Down Expand Up @@ -48,7 +50,7 @@
$uttId =~ tr/\//-/;
# speaker information should be suffix of the utterance Id
$uttId = "$spkr-$uttId";
$text = uc($text);
if (index($text, "{") != -1 and index($text, "}") != -1) {
next;
}
Expand Down
4 changes: 3 additions & 1 deletion egs/covost2/st1/local/data_prep_commonvoice.pl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#
# Usage: data_prep.pl /export/data/cv_corpus_v1/cv-valid-train valid_train

use open ':std', ':encoding(UTF-8)'; # Use UTF-8 encoding for standard streams

if (@ARGV != 3) {
print STDERR "Usage: $0 <path-to-commonvoice-corpus> <dataset> <valid-train|valid-dev|valid-test>\n";
print STDERR "e.g. $0 /export/data/cv_corpus_v1 cv-valid-train valid-train\n";
Expand Down Expand Up @@ -48,7 +50,7 @@
$uttId =~ tr/\//-/;
# speaker information should be suffix of the utterance Id
$uttId = "$spkr-$uttId";
$text = uc($text);
# if (index($text, "{") != -1 and index($text, "}" != -1)) {
# next;
# }
Expand Down

0 comments on commit d1847a8

Please sign in to comment.