espnet · mergify · Jun 22, 2023 · Jun 8, 2023 · Jun 8, 2023 · Jun 20, 2023
diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/mfa_format.py b/egs2/TEMPLATE/asr1/pyscripts/utils/mfa_format.py
@@ -143,7 +143,9 @@ def get_parser():
     return parser
 
 
-def get_phoneme_durations(data: Dict, original_text: str, fs: int, hop_size: int):
+def get_phoneme_durations(
+    data: Dict, original_text: str, fs: int, hop_size: int, n_samples: int
+):
     """Get phohene durations."""
     orig_text = original_text.replace(" ", "").rstrip()
     text_pos = 0
@@ -226,7 +228,7 @@ def get_phoneme_durations(data: Dict, original_text: str, fs: int, hop_size: int
     # STFT frames calculation: https://github.com/librosa/librosa/issues/1288
     # centered stft
 
-    total_durations = int(Decimal(str(maxTimestamp)) * fs / hop_size) + 1
+    total_durations = int(n_samples / hop_size) + 1
     timing_frames = [int(timing * fs / hop_size) + 1 for timing in timings]
     durations = [
         timing_frames[i + 1] - timing_frames[i] for i in range(len(timing_frames) - 1)
@@ -263,6 +265,8 @@ def validate(args):
 
 def make_durations(args):
     """Make durations file."""
+    import soundfile as sf
+
     wavs_dir = Path(args.corpus_dir)
     textgrid_dir = args.textgrid_dir
     train_text_path = args.train_text_path
@@ -276,9 +280,19 @@ def make_durations(args):
                 len(lab_paths) > 0
             ), f"The folder {wavs_dir} does not contain any transcription."
             for lab_path in lab_paths:
+                wav_path = lab_path.as_posix().replace(
+                    ".lab", ".wav"
+                )  # Assumes .wav files are in same dir as .lab files
+                if not os.path.exists(wav_path):
+                    logging.warning("There is no wav file for %s, skipping.", lab_path)
+                    continue
+                with sf.SoundFile(wav_path) as audio:
+                    no_samples = (
+                        audio.frames
+                    )  # get no. of samples directly from .wav file
                 filename = (
                     lab_path.as_posix()
-                    .replace(args.corpus_dir + "/", "")
+                    .replace(args.corpus_dir.rstrip("/") + "/", "")
                     .replace(".lab", "")
                 )
                 with open(lab_path) as lab_file:
@@ -290,7 +304,11 @@ def make_durations(args):
                 with codecs.open(tg_path, "r", encoding="utf-8") as reader:
                     _data_dict = json.load(reader)
                 new_phones, durations = get_phoneme_durations(
-                    _data_dict, original_text, args.samplerate, args.hop_size
+                    _data_dict,
+                    original_text,
+                    args.samplerate,
+                    args.hop_size,
+                    no_samples,
                 )
                 key = filename.split("/")[-1]
                 text_file.write(f'{key} {" ".join(new_phones)}\n')
@@ -327,6 +345,9 @@ def make_labs(args):
 
     from espnet2.text.cleaner import TextCleaner
 
+    if not args.text_cleaner:
+        args.text_cleaner = None
+
     corpus_dir = Path(args.corpus_dir)
     cleaner = TextCleaner(args.text_cleaner)