In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import numpy as np
import argparse
import os
import time
import IPython
import pyworld as pw

from styler import STYLER
from dataset import Dataset
from evaluate import evaluate
from synthesize import preprocess_text, synthesize, read, preprocess_audio, get_processed_data_from_wav
import hparams as hp
import utils
import audio as Audio


In [2]:
# Fix the torch RNG seed so repeated runs of the notebook behave the same.
torch.manual_seed(0)

# Get device: use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Define model
model = nn.DataParallel(STYLER()).to(device)

# Restore the trained weights. `map_location` remaps the stored tensors onto
# the device selected above, so a checkpoint saved on a GPU still loads when
# this notebook runs on a CPU-only machine.
checkpoint_path = hp.checkpoint_path()
checkpoint = torch.load(
    os.path.join(checkpoint_path, 'checkpoint_1190000.pth.tar'),
    map_location=device)
model.load_state_dict(checkpoint['model'])

# Load vocoder
vocoder = utils.get_vocoder()

# Freeze every parameter for inference. Assigning `model.requires_grad = False`
# would only create a plain attribute on the module and leave the parameters
# trainable; `requires_grad_` is the call that actually updates them.
model.requires_grad_(False)
# Last expression of the cell: switches to eval mode and displays the model.
model.eval()

Removing weight norm...


DataParallel(
  (module): STYLER(
    (style_modeling): StyleModeling(
      (style_encoder): StyleEncoder(
        (text_encoder): Encoder(
          (src_word_emb): Embedding(152, 256, padding_idx=0)
          (layer_stack): ModuleList(
            (0): FFTBlock(
              (slf_attn): MultiHeadAttention(
                (w_qs): Linear(in_features=256, out_features=256, bias=True)
                (w_ks): Linear(in_features=256, out_features=256, bias=True)
                (w_vs): Linear(in_features=256, out_features=256, bias=True)
                (attention): ScaledDotProductAttention(
                  (softmax): Softmax(dim=2)
                )
                (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
                (fc): Linear(in_features=256, out_features=256, bias=True)
                (dropout): Dropout(p=0.2, inplace=False)
              )
              (pos_ffn): PositionwiseFeedForward(
                (w_1): Conv1d(256, 1024, kernel_size=(9,)

In [3]:
# Utterance used as the style reference (file stem, without extension).
reference = 'Evelynn_002'

# Speaker who recorded the reference utterance.
speaker_id = 'Evelynn'

# Speaker whose embedding is swapped in for the converted output.
target_speaker = 'Jinx'

# Sample recording of the target speaker, played back for comparison.
# Built with os.path.join so the path is valid on every OS, not only Windows.
speaker_example = os.path.join('wav_data', target_speaker, f'{target_speaker}_001.wav')

In [4]:
def load_speaker_embed(speaker):
    """Load the stored embedding for `speaker` and move it to `device`.

    The file is looked up as
    `<hp.preprocessed_path>/spker_embed/<hp.dataset>-spker_embed-<speaker>.npy`.
    """
    embed_path = os.path.join(
        hp.preprocessed_path, "spker_embed",
        "{}-spker_embed-{}.npy".format(hp.dataset, speaker))
    return torch.from_numpy(np.load(embed_path)).to(device)


# Paths are assembled with os.path.join so they resolve on any OS rather than
# relying on hard-coded Windows backslashes.
speaker_dir = os.path.join('wav_data', speaker_id)
audio_path = os.path.join(speaker_dir, f'{reference}.wav')
# NOTE(review): the dataset folder is hard-coded as 'VCTK' here while the
# embedding files use hp.dataset; confirm the two are meant to agree.
tg_path = os.path.join('preprocessed', 'VCTK', 'TextGrid', speaker_id, f'{reference}.TextGrid')

# Transcript of the reference utterance, and its model-ready text input.
target_sentence = utils.get_transcript(os.path.join(speaker_dir, f'{reference}.txt'))

text = preprocess_text(target_sentence)

# Embeddings of the reference speaker and of the conversion target.
speaker_embed = load_speaker_embed(speaker_id)
target_speaker_embed = load_speaker_embed(target_speaker)

_, wav = read(audio_path)


# Extract f0 / energy / mel features from the reference recording.
f0, energy, mel = get_processed_data_from_wav(audio_path, tg_path, False)

# Min-max scale energy with the bounds from hparams, normalize f0, then hand
# everything to preprocess_audio to obtain the inputs passed to synthesize.
energy = (energy - hp.energy_min) / (hp.energy_max - hp.energy_min)
f0_norm = utils.speaker_normalization(f0)
mel, mel_len, energy, f0, f0_norm = preprocess_audio(mel, energy, f0, f0_norm)

|{SH OW1 M IY1 DH AH0 M AE1 N HH UW1 HH AE1 Z EH1 V R IY0 TH IH2 NG sp AH0 N D AY1 L SH OW1 Y UW1 M AY1 N EH1 K S T V IH1 K T AH0 M}|


In [5]:
# Synthesize the reference utterance twice with identical inputs; the only
# argument that differs between the two calls is the speaker embedding
# (first the reference speaker's own, then the target speaker's).
with torch.no_grad():
    output1, output2 = [
        synthesize('', model, vocoder, text, target_sentence, embed, speaker_id, False,
                   mel, mel_len, f0, f0_norm, energy, write=False)
        for embed in (speaker_embed, target_speaker_embed)
    ]
    

In [6]:
print("Sentence:" + target_sentence)

# (caption, audio source, extra Audio kwargs) in the order they are shown.
# The two synthesized outputs are played with an explicit rate of 22050;
# the two file paths are played without one.
playback_items = [
    ("Original Reference:", audio_path, {}),
    ("Original Recreation:", output1, {"rate": 22050}),
    ("Target Speaker:", speaker_example, {}),
    ("Synth Result:", output2, {"rate": 22050}),
]

for caption, source, audio_kwargs in playback_items:
    print(caption)
    IPython.display.display(IPython.display.Audio(source, **audio_kwargs))

Sentence:show me the man who has everything, and i'll show you my next victim.
Original Reference:


Original Recreation:


Target Speaker:


Synth Result:


In [7]:
def extract_encodings(model, text, speaker_embed, mel, mel_aug, f0_norm, energy, src_len, mel_len):
    """Run only the style-modeling stage of the model and return its outputs.

    Calls `model.module.style_modeling(..., seperate=True)` under
    `torch.no_grad()` and returns its result unchanged.

    Args:
        model: Wrapped model exposing `.module.style_modeling`.
        text: Text input; `text.shape[1]` is taken as the source length
            (assumes a single-item batch — TODO confirm).
        speaker_embed: Speaker embedding forwarded to style modeling.
        mel: Mel input forwarded as the third positional argument.
        mel_aug: Mel input forwarded as the fourth positional argument
            (presumably the augmented mel; pass `mel` again to reuse the
            same one — verify against `style_modeling`).
        f0_norm: Normalized f0 forwarded to style modeling.
        energy: Energy forwarded to style modeling.
        src_len: Ignored. Kept for call compatibility; the source length is
            always recomputed from `text`.
        mel_len: Mel lengths, used for the mel mask and forwarded as-is.

    Returns:
        A tuple `((text_encoding, pitch_embedding, speaker_encoding,
        energy_embedding), noise_encoding, d_prediction, p_prediction,
        e_prediction, mel_len, mel_mask, (aug_posterior_d, aug_posterior_p,
        aug_posterior_e))`.
    """
    with torch.no_grad():
        # The caller-supplied src_len is deliberately overwritten here.
        src_len = torch.from_numpy(np.array([text.shape[1]])).to(device)
        src_mask = utils.get_mask_from_lengths(src_len, None)
        mel_mask = utils.get_mask_from_lengths(mel_len, None)

        # Unpacking the full structure makes a shape mismatch in the
        # style_modeling output fail here rather than at a later use site.
        (
            (text_encoding, pitch_embedding, speaker_encoding, energy_embedding),
            noise_encoding,
            d_prediction,
            p_prediction,
            e_prediction,
            mel_len,
            mel_mask,
            (aug_posterior_d, aug_posterior_p, aug_posterior_e),
        ) = model.module.style_modeling(
            text, speaker_embed, mel, mel_aug, f0_norm, energy,
            src_len, mel_len, src_mask, mel_mask, seperate=True)

        return (
            (text_encoding, pitch_embedding, speaker_encoding, energy_embedding),
            noise_encoding,
            d_prediction,
            p_prediction,
            e_prediction,
            mel_len,
            mel_mask,
            (aug_posterior_d, aug_posterior_p, aug_posterior_e),
        )

tensor(-0.1799, device='cuda:0')
torch.Size([1, 377, 256])
