In [None]:
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import os
import librosa
import sounddevice as sd
import wavio
import glob
from helper import draw_embed, create_spectrogram, read_audio
%matplotlib inline

# 1. Record your own voice

In [None]:
# Record a short mono clip from the default input device, save it to disk,
# and play it back so the take can be checked by ear.
print("Recording...")
duration = 5  # seconds
fs = 48000  # sample rate in Hz
sd.default.samplerate = fs
sd.default.channels = 1
myrecording = sd.rec(int(duration * fs))
# sd.wait() has no timeout argument: its only parameter is `ignore_errors`,
# so the previous `sd.wait(duration)` merely passed a truthy flag. Calling it
# without arguments blocks until the recording has finished.
sd.wait()
print("Saving sample as myvoice.mp3")
path_myrecording = "./samples/myvoice.mp3"
# Create the target folder if needed so the write cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(path_myrecording), exist_ok=True)
# NOTE(review): wavio writes WAV data, so this file is a WAV despite its
# ".mp3" name. The name is kept because later cells look the sample up by it.
wavio.write(path_myrecording, myrecording, fs, sampwidth=2)
sd.play(myrecording, fs)  # non-blocking playback of the take
print("Done! Saved sample as myvoice.mp3")

In [None]:
# Build a figure from the recording saved at `path_myrecording`.
# NOTE(review): create_spectrogram comes from the project's `helper` module,
# which is not visible here; presumably it plots the file's spectrogram — confirm.
fig = create_spectrogram(path_myrecording)

# 2. Load your pretrained models

In [None]:
# Load the pretrained encoder, synthesizer and vocoder, and reset the counter
# that numbers the generated output files.
print("Loading pretrained models...")

# Run settings
seed = 42
low_mem = False
num_generated = 0

# Checkpoint locations (relative to the working directory)
enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")

# Load in the same order as before: encoder, synthesizer, vocoder.
encoder.load_model(enc_model_fpath)
taco_path = syn_model_dir / "taco_pretrained"
synthesizer = Synthesizer(taco_path, low_mem=low_mem, seed=seed)
vocoder.load_model(voc_model_fpath)

print("Loaded pretrained models!")

# 3. Choose a recording

In [None]:
# List the candidate recordings in the samples folder, then pick the one to use.
audio_folder = "samples"
mp3_pattern = os.path.join(audio_folder, "*.mp3")
filenames = glob.glob(mp3_pattern)
print(filenames)

selected_filename = 'samples/myvoice.mp3'
# Remove any single or double quote characters from the name before
# turning it into a Path.
quote_remover = str.maketrans("", "", "\"'")
in_fpath = Path(selected_filename.translate(quote_remover))

# 4. Start preprocessing

In [None]:
# Load the chosen recording, run it through the encoder's preprocessing and
# compute its embedding.
original_wav, sampling_rate = librosa.load(str(in_fpath))
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
print("Loaded file succesfully!")
embed = encoder.embed_utterance(preprocessed_wav)
# Playback is only a convenience. Guard it the same way the synthesis cell
# does, so a missing audio device cannot abort the run after the embedding
# has already been computed.
try:
    sd.play(original_wav, sampling_rate)  # non-blocking
except sd.PortAudioError as e:
    print("\nCaught exception: %s" % repr(e))
    print("Continuing without audio playback.\n")
print("Created the embedding")

In [None]:
# Build a figure for the embedding computed in the previous cell.
# NOTE(review): draw_embed comes from the project's `helper` module, which is
# not visible here; presumably the string is a title/label and the trailing
# None an optional argument — confirm against the helper.
fig = draw_embed(embed, "myembedding", None)

# 5. Synthesize the text you would like to hear

In [None]:
# Ask for the sentence to synthesize.
# Surrounding whitespace is stripped so that a blank or whitespace-only answer
# becomes "" and is skipped by the `text != ""` check in the synthesis cell.
text = input("Write a sentence (+-20 words) to be synthesized:\n").strip()

In [None]:
# Turn the sentence into speech: text + embedding -> mel spectrogram ->
# waveform, then play the result and save it to disk.
if text != "":
    ## Generating the spectrogram
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are,
    # you can retrieve them here by passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    # Generating the waveform
    print("Synthesizing the waveform:")

    generated_wav = vocoder.infer_waveform(spec)

    # Post-generation
    # There's a bug with sounddevice that makes the audio cut one
    # second earlier, so we pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)

    # Play the audio (non-blocking)
    try:
        sd.stop()
        sd.play(generated_wav, synthesizer.sample_rate)
    except sd.PortAudioError as e:
        print("\nCaught exception: %s" % repr(e))
        # The previous message used a backslash continuation inside the string
        # literal, which embedded the next line's indentation in the output,
        # and it pointed at a "--no_sound" flag that is not defined anywhere
        # in this notebook.
        print("Continuing without audio playback.\n")

    # Save it on the disk; the counter keeps successive outputs from
    # overwriting each other within one kernel session.
    filename = "demo_output_%02d.wav" % num_generated
    sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
    num_generated += 1
    print("\nSaved output as %s\n\n" % filename)
else:
    # Make the no-op visible instead of finishing silently.
    print("No text entered - nothing to synthesize.")