In [None]:
import ipywidgets as widgets
import padertorch as pt
import paderbox as pb
import torch
import torchaudio

from creapy.utils import config
from IPython.display import display, Audio, clear_output
from pathlib import Path
from pvq_manipulation.models.vits import Vits_NT
from pvq_manipulation.models.ffjord import FFJORD
from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER
from pvq_manipulation.helper.manipulation_fkt import *

# Override the path attributes of creapy's `config` module with the YAML
# files that live under ./helper.
# NOTE(review): both `_USER_CONFIG_DIR` and `USER_CONFIG_DIR` are set to the
# same file — presumably different creapy code paths read different names;
# confirm whether both are still needed.
creapy_user_config = "./helper/user_config.yaml"
config._CONFIG_DIR = "./helper/creapy_config.yaml"
for _attr_name in ("_USER_CONFIG_DIR", "USER_CONFIG_DIR"):
    setattr(config, _attr_name, creapy_user_config)

In [None]:
# Names of the voice-quality dimensions handed to the label and manipulation
# helpers below.
# NOTE(review): the order presumably has to match the label vector the
# normalizing flow was trained with — confirm before reordering.
pvq_labels = [
    'Weight',
    'Resonance',
    'Breathiness',
    'Roughness',
    'Loudness',
    'Strain',
    'Pitch',
    'Creak',
]

# Directory holding the normalizing-flow config and checkpoint.
storage_dir_normalizing_flow = Path("./saved_models/flow_interspeech/")

# The config is stored as JSON but read with the YAML loader.
flow_config_file = storage_dir_normalizing_flow / "config.json"
config_norm_flow = pb.io.load_yaml(flow_config_file)

normalizing_flow = FFJORD.load_model(
    storage_dir_normalizing_flow,
    checkpoint="model.pt",
)

# Load TTS model

In [None]:
# Restore the `Vits_NT` TTS model from the checkpoint in its storage directory.
storage_dir_tts = Path("./saved_models/tts_model/")
tts_checkpoint = "model.pt"
tts_model = Vits_NT.load_model(storage_dir_tts, checkpoint=tts_checkpoint)

# Load HuBERT model

In [None]:
# Build the HuBERT feature extractor on the same device as the TTS model.
hubert_options = {
    'layer': SID_LARGE_LAYER,
    'model_name': "HUBERT_LARGE",
    'backend': "torchaudio",
    'device': tts_model.device,
    # Target storage directory of the HuBERT model.
    'storage_dir': "./saved_models/hubert_model/",
}
hubert_model = HubertExtractor(**hubert_options)

# Example Synthesis

In [None]:
# Identifier of the reference recording; the later cells reuse this variable.
audio_file = "1034_121119_000028_000001"

# Reference recording and the on-disk cache of its speaker embedding.
reference_wav_path = f"./saved_models/audio_files/{audio_file}.wav"
embedding_cache_path = f"./saved_models/audio_files/{audio_file}.pth"

# Extract the speaker embedding only if it has not been cached yet.
# NOTE(review): `load_audio_files` receives a path string here, whereas the
# cells below pass a dict with 'audio_file'/'transcription' keys — confirm
# that the helper accepts both forms.
embedding_is_cached = Path(embedding_cache_path).is_file()
if not embedding_is_cached:
    example = load_audio_files(reference_wav_path)
    speaker_embedding = extract_speaker_embedding(tts_model, example)
    torch.save(speaker_embedding, embedding_cache_path)

# Synthesize the sentence with the cached embedding and play it back.
synthesis_request = {
    'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    'd_vector_storage_root': embedding_cache_path,
}
wav_1 = tts_model.synthesize_from_example(synthesis_request)
display(Audio(wav_1, rate=24_000, normalize=True))

# Get example manipulation
The `manipulation` argument selects the voice quality to change, and `manipulation_intensity` sets the degree of change. The intensity of the sampled embedding is then the estimated intensity of the input signal plus the manipulation intensity.

In [None]:
# `audio_file` (set in the "Example Synthesis" cell) selects both the
# recording and the embedding cache, so the labels and the speaker embedding
# always belong to the same recording.
example = {
    'audio_file': audio_file,
    'transcription': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
}

example = load_audio_files(example)
labels = load_speaker_labels(example, config_norm_flow, hubert_model, pvq_labels)

# Reuse the cached speaker embedding if present, otherwise compute and cache it.
embedding_cache_path = f"./saved_models/audio_files/{audio_file}.pth"
if not Path(embedding_cache_path).is_file():
    speaker_embedding = extract_speaker_embedding(tts_model, example)
    torch.save(speaker_embedding, embedding_cache_path)
else:
    # NOTE: torch.load unpickles the file; only load caches you created yourself.
    speaker_embedding = torch.load(embedding_cache_path)

wav_manipulated = get_manipulation(
    example=example,
    d_vector=speaker_embedding,
    labels=labels[None, :],  # add a leading axis in front of the label vector
    flow=normalizing_flow,
    tts_model=tts_model,
    manipulation='Breathiness',  # Breathiness, Creak, Roughness, Weight, Resonance
    manipulation_intensity=1,
    pvq_labels=pvq_labels,
)
display(Audio(wav_manipulated, rate=24_000, normalize=True))

# Interface with Widgets 

In [None]:
# `audio_file` (set in the "Example Synthesis" cell) selects both the
# recording and the embedding cache, so the labels and the speaker embedding
# used by the widget callback always belong to the same recording.
example = {
    'audio_file': audio_file,
    'transcription': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
}

example = load_audio_files(example)
labels = load_speaker_labels(example, config_norm_flow, hubert_model, pvq_labels)

# Reuse the cached speaker embedding if present, otherwise compute and cache it.
embedding_cache_path = f"./saved_models/audio_files/{audio_file}.pth"
if not Path(embedding_cache_path).is_file():
    speaker_embedding = extract_speaker_embedding(tts_model, example)
    torch.save(speaker_embedding, embedding_cache_path)
else:
    # NOTE: torch.load unpickles the file; only load caches you created yourself.
    speaker_embedding = torch.load(embedding_cache_path)

# 'initial' lets a description label take its natural width.
description_style = {'description_width': 'initial'}

# Voice quality that should be manipulated.
manipulation_idx_widget = widgets.Dropdown(
    description='Type:',
    options=['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Creak'],
    value='Breathiness',
    style=dict(description_style),
)

# Manipulation intensity, passed to `get_manipulation` by the callback.
manipulation_fkt_widget = widgets.FloatSlider(
    description='Strength:',
    min=-2.0,
    max=5.0,
    step=0.1,
    value=1.0,
    style=dict(description_style),
)

# Free-text field pre-filled with the example sentence.
transcription_widget = widgets.Text(
    description='String:',
    placeholder='Type something',
    value="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    disabled=False,
    layout=widgets.Layout(width='900px'),
)

# Trigger for the manipulation and the area its result is rendered into.
run_button = widgets.Button(description="Run Manipulation")
audio_output = widgets.Output()

def update_manipulation(b):
    """Button callback: synthesize the manipulated voice and play it.

    Reads the current values of the dropdown, slider and text widgets, runs
    `get_manipulation` and replaces the content of `audio_output` with an
    audio player for the result.

    Args:
        b: Object passed in by the button's click event (unused).

    Raises:
        Exception: Any error raised by `get_manipulation` is re-raised after
            a message has been written to `audio_output`.
    """
    with audio_output:
        clear_output(wait=True)
        print("🔁 Running manipulation...")

    # Synthesize the sentence currently typed into the text box instead of
    # the one baked into `example`. A shallow copy keeps the shared `example`
    # untouched between clicks.
    # NOTE(review): assumes `get_manipulation` takes the text from
    # example['transcription'] — confirm against the helper.
    current_example = example
    if isinstance(example, dict):
        current_example = {**example, 'transcription': transcription_widget.value}

    try:
        wav_manipulated = get_manipulation(
            example=current_example,
            d_vector=speaker_embedding,
            labels=labels[None, :],
            flow=normalizing_flow,
            tts_model=tts_model,
            manipulation=manipulation_idx_widget.value,
            manipulation_intensity=manipulation_fkt_widget.value,
            pvq_labels=pvq_labels,
        )
    except Exception as err:
        # Replace the "running" message so the UI does not look stuck, then
        # let the error propagate as before.
        with audio_output:
            clear_output(wait=True)
            print(f"Manipulation failed: {err!r}")
        raise

    with audio_output:
        clear_output(wait=True)
        display(Audio(wav_manipulated, rate=24_000, normalize=True))

# Register the click handler, then render the input controls followed by the
# output area that the handler writes into.
run_button.on_click(update_manipulation)
input_controls = (
    manipulation_fkt_widget,
    transcription_widget,
    manipulation_idx_widget,
    run_button,
)
display(*input_controls, audio_output)