In [None]:
from pathlib import Path
from Encoder.audio import preprocess_wav
from Encoder.audio import wav_to_mel_spectrogram
import torch
from Tacotron.utils.text import *
from Encoder.model import SpeakerEncoder
from Tacotron.model import Tacotron
import numpy as np
from Tacotron.hparams import hparams
from typing import Union, List
from wavernn.model import WaveRNN
import soundfile as sf
# Run everything on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: "DIR" in the checkpoint paths below is a placeholder for the directory
# that holds the three pretrained checkpoints.

# --- Speaker encoder (model1): produces the speaker embedding ---------------
checkpoint1 = torch.load(r"DIR\encoder.pt", map_location=device)
model1 = SpeakerEncoder(device, device)
model1.load_state_dict(checkpoint1["model_state"])
# Inference only: switch off training-time behaviour (dropout, batch-norm updates).
model1.eval()

# --- Synthesizer (model2): text + speaker embedding -> mel spectrogram ------
checkpoint2 = torch.load(r"DIR\synthesizer.pt", map_location=device)
model2 = Tacotron(
    embed_dims=hparams.tts_embed_dims,
    num_chars=len(symbols),
    encoder_dims=hparams.tts_encoder_dims,
    decoder_dims=hparams.tts_decoder_dims,
    n_mels=hparams.num_mels,
    fft_bins=hparams.num_mels,
    postnet_dims=hparams.tts_postnet_dims,
    encoder_K=hparams.tts_encoder_K,
    lstm_dims=hparams.tts_lstm_dims,
    postnet_K=hparams.tts_postnet_K,
    num_highways=hparams.tts_num_highways,
    dropout=hparams.tts_dropout,
    stop_threshold=hparams.tts_stop_threshold,
    speaker_embedding_size=hparams.speaker_embedding_size
).to(device)
model2.load_state_dict(checkpoint2["model_state"])
model2.eval()

# --- Vocoder (model3): mel spectrogram -> waveform ---------------------------
checkpoint3 = torch.load(r"DIR\vocoder.pt", map_location=device)
model3 = WaveRNN(
    rnn_dims=512,
    fc_dims=512,
    bits=9,
    pad=2,
    upsample_factors=(5, 5, 8),
    feat_dims=80,
    compute_dims=128,
    res_out_dims=128,
    res_blocks=10,
    hop_length=200,
    sample_rate=16000,
).to(device)  # keep the vocoder on the same device as the other two models
# NOTE(review): assumes WaveRNN.generate moves its inputs to this same device
# (as the other models' wrappers do) -- confirm against wavernn.model.
model3.load_state_dict(checkpoint3["model_state"])
model3.eval()
def pad1d(x, max_len, pad_value=0):
    """
    Right-pad a 1-D sequence with a constant so that it has ``max_len`` elements.

    :param x: the 1-D sequence to pad (anything with ``len()`` that ``np.pad`` accepts).
    :param max_len: the target length; must be at least ``len(x)``.
    :param pad_value: the value appended after the last element of ``x``.
    :return: a numpy array of length ``max_len``.
    """
    n_missing = max_len - len(x)
    pad_widths = (0, n_missing)  # nothing in front, n_missing values at the end
    return np.pad(x, pad_widths, mode="constant", constant_values=pad_value)

def synthesize_spectrograms(texts: List[str],
                            embeddings: Union[np.ndarray, List[np.ndarray]],
                            return_alignments=False):
    """
    Synthesize one mel spectrogram per input text with the Tacotron model ``model2``.

    :param texts: the strings to synthesize.
    :param embeddings: the speaker embeddings, one per text. A single array (anything
        that is not a list) is wrapped into a one-element list.
    :param return_alignments: if True, additionally return the alignments produced by
        the LAST processed batch (None when ``texts`` is empty).
    :return: a list with one spectrogram array per text, or the tuple
        ``(specs, alignments)`` when ``return_alignments`` is True.
    """
    # Preprocess text inputs
    inputs = [text_to_sequence(text.strip(), ["english_cleaners"]) for text in texts]
    if not isinstance(embeddings, list):
        embeddings = [embeddings]

    # Batch inputs
    batch_size = hparams.synthesis_batch_size
    batched_inputs = [inputs[i:i + batch_size]
                      for i in range(0, len(inputs), batch_size)]
    batched_embeds = [embeddings[i:i + batch_size]
                      for i in range(0, len(embeddings), batch_size)]

    specs = []
    # Defined up front so that an empty `texts` does not leave it unbound.
    alignments = None
    for i, batch in enumerate(batched_inputs, 1):
        print(f"\n| Generating {i}/{len(batched_inputs)}")

        # Pad texts so they are all the same length
        max_text_len = max(len(text) for text in batch)
        chars = np.stack([pad1d(text, max_text_len) for text in batch])

        # Stack speaker embeddings into 2D array for batch processing
        speaker_embeds = np.stack(batched_embeds[i - 1])

        # Convert to tensor
        chars = torch.tensor(chars).long().to(device)
        speaker_embeddings = torch.tensor(speaker_embeds).float().to(device)

        # Inference only: no autograd graph is needed, so do not build one.
        with torch.no_grad():
            _, mels, alignments = model2.generate(chars, speaker_embeddings)
        mels = mels.detach().cpu().numpy()
        for m in mels:
            # Trim silence from end of each spectrogram. The width check stops the
            # loop once every frame has been trimmed; np.max on an empty slice
            # would otherwise raise a ValueError.
            while m.shape[1] > 0 and np.max(m[:, -1]) < hparams.tts_stop_threshold:
                m = m[:, :-1]
            specs.append(m)

    print("\n\nDone.\n")
    return (specs, alignments) if return_alignments else specs
def embed_frames_batch(frames_batch):
    """
    Run the speaker encoder ``model1`` on a batch of mel-spectrogram frames.

    :param frames_batch: a numpy array holding the batch of frames
        (presumably of shape (batch_size, n_frames, n_channels) -- TODO confirm
        against SpeakerEncoder.forward).
    :return: the encoder output for the batch as a numpy array on the CPU.
    """
    frames = torch.from_numpy(frames_batch).to(device)
    # Call the module itself rather than .forward() so that registered hooks run,
    # and skip gradient tracking since this is inference only.
    with torch.no_grad():
        embed = model1(frames)
    return embed.detach().cpu().numpy()


def compute_partial_slices(n_samples, partial_utterance_n_frames=160,
                           min_pad_coverage=0.75, overlap=0.5):
    """
    Compute where to cut an utterance into fixed-size, overlapping partial utterances.

    Each partial is described twice: once as a slice over mel-spectrogram frames and
    once as the matching slice over waveform samples (frame index times the number of
    samples per frame, which is fixed here to 16000 * 10 / 1000 = 160).

    :param n_samples: the number of samples in the waveform.
    :param partial_utterance_n_frames: the number of mel frames in each partial.
    :param min_pad_coverage: a value in (0, 1]. The last partial is dropped when the
        fraction of it that is covered by real samples is below this value, unless it
        is the only partial.
    :param overlap: a value in [0, 1); the fraction by which consecutive partials
        overlap.
    :return: the tuple ``(wav_slices, mel_slices)``, two lists of equal length. The last
        waveform slice may extend past ``n_samples``.
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

    hop_samples = int((16000 * 10 / 1000))
    total_frames = int(np.ceil((n_samples + 1) / hop_samples))
    stride = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
    n_starts = max(1, total_frames - partial_utterance_n_frames + stride + 1)

    # One [start, stop) pair of frame indices per partial utterance.
    frame_bounds = [np.array([first, first + partial_utterance_n_frames])
                    for first in range(0, n_starts, stride)]
    mel_slices = [slice(*bounds) for bounds in frame_bounds]
    wav_slices = [slice(*(bounds * hop_samples)) for bounds in frame_bounds]

    # Drop the final partial when too little of it is backed by actual samples.
    tail = wav_slices[-1]
    coverage = (n_samples - tail.start) / (tail.stop - tail.start)
    if len(mel_slices) > 1 and coverage < min_pad_coverage:
        mel_slices.pop()
        wav_slices.pop()

    return wav_slices, mel_slices

def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Compute a speaker embedding for a single preprocessed waveform.

    :param wav: the waveform as a numpy array.
    :param using_partials: if True, the utterance is cut into partial utterances (see
        compute_partial_slices), each partial is embedded, and the mean of the partial
        embeddings is L2-normalized. If False, the whole utterance is embedded in a
        single pass and the result is returned as produced by the encoder.
    :param return_partials: if True, return ``(embed, partial_embeds, wave_slices)``
        instead of just ``embed``. The last two are None when ``using_partials`` is False.
    :param kwargs: forwarded to compute_partial_slices().
    :return: the embedding, or the 3-tuple described above.
    """
    if using_partials:
        # Work out the partial windows, then zero-pad the waveform so that the last
        # window is fully backed by samples.
        wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
        n_missing = wave_slices[-1].stop - len(wav)
        if n_missing >= 0:
            wav = np.pad(wav, (0, n_missing), "constant")

        # Embed every partial window of the mel spectrogram in one batch.
        mel = wav_to_mel_spectrogram(wav)
        partial_embeds = embed_frames_batch(np.array([mel[window] for window in mel_slices]))

        # Average the partial embeddings and rescale the mean to unit L2 norm.
        mean_embed = np.mean(partial_embeds, axis=0)
        embed = mean_embed / np.linalg.norm(mean_embed, 2)
    else:
        # Single pass over the whole utterance; there are no partials to report.
        mel = wav_to_mel_spectrogram(wav)
        embed = embed_frames_batch(mel[None, ...])[0]
        partial_embeds = None
        wave_slices = None

    return (embed, partial_embeds, wave_slices) if return_partials else embed

In [None]:
# Reference recording of the voice to clone (placeholder path).
input_file = Path(r"\test2.wav")
preprocessed_wav = preprocess_wav(input_file)
embed = embed_utterance(preprocessed_wav)

# Sentence to synthesize in the reference voice.
text = "And this is how I becoming the destroyer of world."

# Text + speaker embedding -> mel spectrogram.
mel = synthesize_spectrograms([text], [embed])[0]
# NOTE(review): 4. is presumably the max absolute mel value the vocoder was
# trained with -- confirm against the vocoder hyper-parameters.
mel = mel / 4.
mel = torch.from_numpy(mel[None, ...])  # add the batch dimension

# Mel spectrogram -> waveform.
# NOTE(review): the positional arguments are presumably
# (mels, batched, target, overlap, mu_law, progress_callback) -- confirm
# against WaveRNN.generate.
wav = model3.generate(mel, True, 8000, 800, True, None)

# Post-process: append one second (16000 samples) of silence and run the same
# preprocessing as for input audio. The post-processed signal is what gets
# saved; previously it was computed but the raw `wav` was written instead.
generated_wav = np.pad(wav, (0, 16000), mode="constant")
generated_wav = preprocess_wav(generated_wav)
path = Path(r"\output.wav")
sf.write(path, generated_wav.astype(np.float32), 16000)


In [None]:
# Reference recording of the voice to clone (fill in the path before running).
input_file = Path(r"")
preprocessed_wav = preprocess_wav(input_file)

# Embed the whole utterance in a single pass (no partial utterances).
frames = wav_to_mel_spectrogram(preprocessed_wav)
frames = torch.from_numpy(frames[None, ...]).to(device)  # add the batch dimension
# Use the loaded encoder instance: calling SpeakerEncoder.forward on the class
# would bind `frames` as `self` and fail.
with torch.no_grad():
    embed = model1(frames).detach().cpu().numpy()[0]

# Sentence to synthesize (fill in before running).
text = ""

# Text + speaker embedding -> mel spectrogram.
mel = synthesize_spectrograms([text], [embed])[0]
# NOTE(review): 4. is presumably the max absolute mel value the vocoder was
# trained with -- confirm against the vocoder hyper-parameters.
mel = mel / 4.
mel = torch.from_numpy(mel[None, ...])  # add the batch dimension
# Use the loaded vocoder instance rather than the WaveRNN class itself.
wav = model3.generate(mel, True, 8000, 800, True, None)

sf.write("generated.wav", wav.astype(np.float32), 16000)