Skip to content

Commit

Permalink
Merge pull request #5223 from G-Thor/patch-3
Browse files Browse the repository at this point in the history
Fix bugs in mfa_format.py
  • Loading branch information
mergify[bot] committed Jun 22, 2023
2 parents b5a88e9 + 5daff33 commit 161e4bb
Showing 1 changed file with 30 additions and 9 deletions.
39 changes: 30 additions & 9 deletions egs2/TEMPLATE/asr1/pyscripts/utils/mfa_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ def make_labs_[dataset]:
import re
import sys
import traceback
from decimal import Decimal
from pathlib import Path
from typing import Dict

import kaldiio
import soundfile as sf
from pyopenjtalk import run_frontend

from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
Expand Down Expand Up @@ -61,9 +61,8 @@ def get_path(s, sep=os.sep):


def get_jp_text(text):
    """Return *text* re-tokenized as a space-separated string of surface forms.

    Runs pyopenjtalk's text-normalization frontend and joins the surface
    string of each resulting token with single spaces.

    NOTE(review): this targets the newer pyopenjtalk API, where
    ``run_frontend`` returns a list of dicts each carrying a ``"string"``
    key (older releases returned a ``(njd, ...)`` tuple of comma-separated
    feature strings — confirm the installed pyopenjtalk version matches).

    Args:
        text: Raw Japanese input text.

    Returns:
        A single string of token surface forms separated by spaces.
    """
    tokens = run_frontend(text)
    return " ".join(token["string"] for token in tokens)


def get_parser():
Expand Down Expand Up @@ -143,7 +142,9 @@ def get_parser():
return parser


def get_phoneme_durations(data: Dict, original_text: str, fs: int, hop_size: int):
def get_phoneme_durations(
data: Dict, original_text: str, fs: int, hop_size: int, n_samples: int
):
"""Get phohene durations."""
orig_text = original_text.replace(" ", "").rstrip()
text_pos = 0
Expand Down Expand Up @@ -226,7 +227,7 @@ def get_phoneme_durations(data: Dict, original_text: str, fs: int, hop_size: int
# STFT frames calculation: https://github.com/librosa/librosa/issues/1288
# centered stft

total_durations = int(Decimal(str(maxTimestamp)) * fs / hop_size) + 1
total_durations = int(n_samples / hop_size) + 1
timing_frames = [int(timing * fs / hop_size) + 1 for timing in timings]
durations = [
timing_frames[i + 1] - timing_frames[i] for i in range(len(timing_frames) - 1)
Expand Down Expand Up @@ -263,6 +264,7 @@ def validate(args):

def make_durations(args):
"""Make durations file."""

wavs_dir = Path(args.corpus_dir)
textgrid_dir = args.textgrid_dir
train_text_path = args.train_text_path
Expand All @@ -276,9 +278,22 @@ def make_durations(args):
len(lab_paths) > 0
), f"The folder {wavs_dir} does not contain any transcription."
for lab_path in lab_paths:
wav_path = lab_path.as_posix().replace(
".lab", ".wav"
) # Assumes .wav files are in same dir as .lab files
if not os.path.exists(wav_path):
logging.warning("There is no wav file for %s, skipping.", lab_path)
continue

# get no. of samples and original sr directly from audio file
with sf.SoundFile(wav_path) as audio:
orig_sr = audio.samplerate
# Account for downsampling
no_samples = int(audio.frames * (args.samplerate / orig_sr))

filename = (
lab_path.as_posix()
.replace(args.corpus_dir + "/", "")
.replace(args.corpus_dir.rstrip("/") + "/", "")
.replace(".lab", "")
)
with open(lab_path) as lab_file:
Expand All @@ -290,7 +305,11 @@ def make_durations(args):
with codecs.open(tg_path, "r", encoding="utf-8") as reader:
_data_dict = json.load(reader)
new_phones, durations = get_phoneme_durations(
_data_dict, original_text, args.samplerate, args.hop_size
_data_dict,
original_text,
args.samplerate,
args.hop_size,
no_samples,
)
key = filename.split("/")[-1]
text_file.write(f'{key} {" ".join(new_phones)}\n')
Expand Down Expand Up @@ -323,10 +342,12 @@ def make_dictionary(args):

def make_labs(args):
"""Make lab file for datasets."""
import soundfile as sf

from espnet2.text.cleaner import TextCleaner

if not args.text_cleaner:
args.text_cleaner = None

corpus_dir = Path(args.corpus_dir)
cleaner = TextCleaner(args.text_cleaner)

Expand Down

0 comments on commit 161e4bb

Please sign in to comment.