Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bugs in mfa_format.py #5223

Merged
merged 6 commits into from
Jun 22, 2023
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
29 changes: 25 additions & 4 deletions egs2/TEMPLATE/asr1/pyscripts/utils/mfa_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ def get_parser():
return parser


def get_phoneme_durations(data: Dict, original_text: str, fs: int, hop_size: int):
def get_phoneme_durations(
data: Dict, original_text: str, fs: int, hop_size: int, n_samples: int
):
"""Get phoneme durations."""
orig_text = original_text.replace(" ", "").rstrip()
text_pos = 0
Expand Down Expand Up @@ -226,7 +228,7 @@ def get_phoneme_durations(data: Dict, original_text: str, fs: int, hop_size: int
# STFT frames calculation: https://github.com/librosa/librosa/issues/1288
# centered stft

total_durations = int(Decimal(str(maxTimestamp)) * fs / hop_size) + 1
total_durations = int(n_samples / hop_size) + 1
timing_frames = [int(timing * fs / hop_size) + 1 for timing in timings]
durations = [
timing_frames[i + 1] - timing_frames[i] for i in range(len(timing_frames) - 1)
Expand Down Expand Up @@ -263,6 +265,8 @@ def validate(args):

def make_durations(args):
"""Make durations file."""
import soundfile as sf
G-Thor marked this conversation as resolved.
Show resolved Hide resolved

wavs_dir = Path(args.corpus_dir)
textgrid_dir = args.textgrid_dir
train_text_path = args.train_text_path
Expand All @@ -276,9 +280,19 @@ def make_durations(args):
len(lab_paths) > 0
), f"The folder {wavs_dir} does not contain any transcription."
for lab_path in lab_paths:
wav_path = lab_path.as_posix().replace(
".lab", ".wav"
) # Assumes .wav files are in same dir as .lab files
if not os.path.exists(wav_path):
logging.warning("There is no wav file for %s, skipping.", lab_path)
continue
with sf.SoundFile(wav_path) as audio:
G-Thor marked this conversation as resolved.
Show resolved Hide resolved
no_samples = (
audio.frames
) # get no. of samples directly from .wav file
filename = (
lab_path.as_posix()
.replace(args.corpus_dir + "/", "")
.replace(args.corpus_dir.rstrip("/") + "/", "")
.replace(".lab", "")
)
with open(lab_path) as lab_file:
Expand All @@ -290,7 +304,11 @@ def make_durations(args):
with codecs.open(tg_path, "r", encoding="utf-8") as reader:
_data_dict = json.load(reader)
new_phones, durations = get_phoneme_durations(
_data_dict, original_text, args.samplerate, args.hop_size
_data_dict,
original_text,
args.samplerate,
args.hop_size,
no_samples,
)
key = filename.split("/")[-1]
text_file.write(f'{key} {" ".join(new_phones)}\n')
Expand Down Expand Up @@ -327,6 +345,9 @@ def make_labs(args):

G-Thor marked this conversation as resolved.
Show resolved Hide resolved
from espnet2.text.cleaner import TextCleaner

if not args.text_cleaner:
args.text_cleaner = None

corpus_dir = Path(args.corpus_dir)
cleaner = TextCleaner(args.text_cleaner)

Expand Down