In [27]:
from pathlib import Path

import numpy as np

from utils.config_manager import Config
from model.factory import tts_ljspeech, tts_custom
from data.audio import Audio



In [28]:
# Session config for this run; the path is relative to the current working directory.
config_path = 'config/peeter_jutustav_16/session_paths.yaml'
config_loader = Config(config_path=config_path)
# Passing None asks the loader for the latest checkpoint (the log line below
# reports which weights were restored).
model = config_loader.load_model(None)  # if None defaults to latest
conf = config_loader.config
# Audio helper built from the same config; later cells use it to denormalize
# the predicted mel and to write the wav file.
audio = Audio(conf)

restored weights from logs/peeter_jutustav_16/ljspeech/tts_config_est.aligner_config/weights/ckpt-312 at step 260000


In [41]:
# Input sentence (Estonian) that the following cells phonemize and synthesize.
text = "Karu oli hoolimatu, lampjalgne ja räpane."

In [42]:
# Phonemize: convert the raw sentence into the phoneme string the model expects.
phonemize = model.text_pipeline.phonemizer
phons = phonemize(text)
print(phons)

karʊ oʎɪ hoːʎimatu,lampjalɡne ja ræpane.


In [43]:
# Tokenize: map the phoneme string to the integer ids consumed by the model.
tokenize = model.text_pipeline.tokenizer
tokens = tokenize(phons)
print(tokens)

[21, 12, 28, 98, 1, 25, 102, 72, 1, 18, 25, 119, 102, 19, 23, 12, 30, 31, 6, 22, 12, 23, 26, 20, 12, 22, 64, 24, 16, 1, 20, 12, 1, 28, 37, 26, 12, 24, 16, 8]


In [44]:
# Run model on the already-tokenized input (encode=False skips re-encoding).
out = model.predict(tokens, encode=False, phoneme_max_duration=None)

# Pull the predicted mel out of the result and transpose it; the printed
# shape below shows the mel-channel axis first after the transpose.
mel_prediction = out['mel'].numpy()
mel = np.transpose(mel_prediction)
print(mel.shape)

(80, 178)


In [45]:
# Create Mel basis
from librosa import filters

# Collect the filter-bank parameters from the audio config so the mel basis
# is built from the same settings the Audio helper was configured with.
mel_kwargs = {
    'sr': audio.config['sampling_rate'],
    'n_fft': audio.config['n_fft'],
    'n_mels': audio.config['mel_channels'],
    'fmin': audio.config['f_min'],
    'fmax': audio.config['f_max'],
}
mel_basis = filters.mel(**mel_kwargs)
print(mel_basis.shape)

(80, 513)


In [46]:

# Compute NNLS: approximate the linear-frequency spectrogram whose projection
# through mel_basis reproduces the denormalized mel.
from librosa.util import nnls

# NOTE(review): _denormalize presumably returns amplitude-scale mel values
# (hence the exponent of 1 below) — confirm against data.audio.Audio.
amp_mel = audio._denormalize(mel)
inverse = nnls(mel_basis, amp_mel)

# Exponent of the mel spectrogram; with a value of 1 the in-place root below
# leaves `inverse` unchanged, but the step is kept for parity with the original.
mel_power = 1
np.power(inverse, 1. / mel_power, out=inverse)
print(inverse.shape)

(513, 178)


In [47]:
# Griffin-lim: estimate phase for the magnitude spectrogram and invert it to a
# waveform.
from librosa.core import griffinlim

# griffinlim initialises the phase randomly by default, so without a fixed
# seed every run of this cell produces a slightly different waveform. Pinning
# random_state makes Restart & Run All reproduce the same audio.
GRIFFINLIM_SEED = 0

wav = griffinlim(
    inverse,
    n_iter=32,
    hop_length=audio.config['hop_length'],
    win_length=audio.config['win_length'],
    random_state=GRIFFINLIM_SEED,
)

In [48]:
# Write the synthesized waveform to outputs/test/test.wav, creating the
# directory (and any missing parents) first.
outdir = Path('outputs/test')
outdir.mkdir(exist_ok=True, parents=True)
# Build the target path directly; the original used a placeholder-less
# f-string plus with_suffix, which resolves to this same path.
audio.save_wav(wav, outdir / 'test.wav')