# Emotion Embedding Test

### Utils

In [None]:
# load packages
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
import pandas 
from models import StyleEncoder

# --- Notebook configuration -------------------------------------------------
# Torch device string used when loading the checkpoint and moving tensors.
DEVICE = 'cuda'

# Delimited text file read with pandas in the embedding cell; DATA_HEADER
# supplies the column names and DATA_SEPARETOR the field delimiter.
DATA_PATH = 'Data/emotion_embedding_test_file.txt'
DATA_SEPARETOR = '|'
DATA_HEADER = ['file_path', 'emotion_id']

# Directory from which config.yml and the checkpoint file are loaded.
MODEL_PATH = 'Models/Experiment-1'

%matplotlib inline

: 

In [3]:
# Source: http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt
# Source: https://github.com/jjery2243542/voice_conversion

# Mel-spectrogram transform shared by preprocess().
# NOTE(review): no sample_rate is passed, so torchaudio's default is used,
# while compute_style() loads audio at 24 kHz — confirm this matches the
# settings the checkpoint was trained with.
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80,
    n_fft=2048,
    win_length=1200,
    hop_length=300,
)

# Offset and scale applied to the log-mel values in preprocess().
mean = -4
std = 4

def preprocess(wave):
    """Turn a NumPy waveform into a normalized log-mel tensor.

    The waveform is converted to a float tensor and passed through the
    module-level ``to_mel`` transform. The result gets a new leading
    dimension of size 1, is log-compressed, and is then shifted and scaled
    with the module-level ``mean`` and ``std``.

    Args:
        wave: NumPy array of audio samples (must be accepted by
            ``torch.from_numpy``).

    Returns:
        The normalized log-mel tensor.
    """
    samples = torch.from_numpy(wave).float()
    spectrogram = to_mel(samples).unsqueeze(0)
    # The 1e-5 offset keeps log() finite where the spectrogram is zero.
    log_mel = torch.log(1e-5 + spectrogram)
    return (log_mel - mean) / std

def build_model(model_params=None):
    """Build the style encoder and return it inside a ``Munch`` container.

    The rest of this notebook treats the result as a mapping of named
    sub-models (``starganv2[key]`` / ``starganv2.style_encoder``), so the
    encoder is returned under the ``style_encoder`` key rather than as a
    bare module.

    Args:
        model_params: Mapping providing ``dim_in``, ``style_dim``,
            ``num_domains`` and ``max_conv_dim``; these are passed
            positionally to ``StyleEncoder``. ``None`` is treated as an
            empty mapping.

    Returns:
        A ``Munch`` with a single ``style_encoder`` entry.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    args = Munch(model_params or {})
    style_encoder = StyleEncoder(args.dim_in, args.style_dim, args.num_domains, args.max_conv_dim)
    return Munch(style_encoder=style_encoder)

def compute_style(speaker_dicts):
    """Compute a reference style embedding for every entry of *speaker_dicts*.

    Args:
        speaker_dicts: Mapping of key -> ``(path, speaker)``. ``path`` is an
            audio file loaded with librosa at 24 kHz; ``speaker`` is the
            integer label handed to the style encoder.

    Returns:
        Dict mapping each key to ``(ref, label)``, where ``ref`` is the
        output of ``starganv2.style_encoder`` and ``label`` is the
        one-element ``LongTensor`` built from ``speaker``.
    """
    reference_embeddings = {}
    for key, (path, speaker) in speaker_dicts.items():
        # librosa.load already resamples to the requested rate, so no extra
        # resampling step is needed. The full (untrimmed) waveform is encoded.
        wave, _ = librosa.load(path, sr=24000)
        mel_tensor = preprocess(wave).to(DEVICE)
        with torch.no_grad():
            label = torch.LongTensor([speaker])
            # unsqueeze(1) inserts an extra dimension at position 1 before the
            # encoder call — presumably the channel axis; confirm against
            # StyleEncoder.
            ref = starganv2.style_encoder(mel_tensor.unsqueeze(1), label)
        reference_embeddings[key] = (ref, label)

    return reference_embeddings

### Load models

In [6]:
# load starganv2

# Read the experiment configuration stored under MODEL_PATH.
with open(f'{MODEL_PATH}/config.yml') as f:
    starganv2_config = yaml.safe_load(f)
starganv2 = build_model(model_params=starganv2_config["model_params"])

# Load the checkpoint and keep only its 'model_ema' entry.
# NOTE(review): "epoth.pth" may be a typo for "epoch.pth" — confirm the
# checkpoint file name on disk before changing it.
params = torch.load(f"{MODEL_PATH}/epoth.pth", map_location=DEVICE)
params = params['model_ema']

# Only the style encoder is used in this notebook: restore its weights,
# switch it to evaluation mode and move it to the target device.
starganv2.style_encoder.load_state_dict(params["style_encoder"])
starganv2.style_encoder.eval()
starganv2.style_encoder = starganv2.style_encoder.to(DEVICE)

#### Compute reference embeddings with the style encoder

In [8]:
# with reference, using style encoder
# pandas is imported without an alias in this notebook, so the module name is
# used directly.
dataframe = pandas.read_csv(DATA_PATH, sep=DATA_SEPARETOR, names=DATA_HEADER)

# NOTE(review): `selected_speakers` and `speakers` are not defined in the
# visible cells and must exist before this cell runs; `dataframe` is loaded
# above but not used below — confirm which source the references should
# come from.
speaker_dicts = {}
for s in selected_speakers:
    print(s)
    # key -> (reference wav path, integer label = position of s in `speakers`)
    speaker_dicts[str(s)] = (f'Demo/VCTK-corpus/{s}/{s}.wav', speakers.index(s))

reference_embeddings = compute_style(speaker_dicts)

1
2
3
