# 실습: Voice Conversion 모델 동작을 위한 함수 구현

본 실습의 목표는 Voice Conversion을 동작시키기 위해서 필요한 함수들을 구현하여 VC 모델을 동작시키는 것입니다. 구현이 완료된 이후에는 다양한 소스/타겟 음성을 입력하여 음성 변조 결과를 확인할 수 있습니다.



In [2]:
# Mount Google Drive so the project folder under /content/drive is reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 모델 다운로드
먼저 미리 학습되어 있는 VC 모델과, 목소리 정보를 추출할 수 있는 모델을 다운로드합니다.

In [7]:
# Upgrade gdown (pre-releases allowed); it is used below to fetch the model
# files from Google Drive.
!pip install -U --no-cache-dir gdown --pre

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.4-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.4


In [8]:
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
# download config 
! gdown --id  19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1 -O $CONFIG_SE_PATH
# download checkpoint  
! gdown --id   17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X -O $CHECKPOINT_SE_PATH
# download checkpoint
! gdown --id 1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR -O best_model.pth.tar

Downloading...
From: https://drive.google.com/uc?id=19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1
To: /content/drive/MyDrive/SubPJT2_Voice_Conversion/config_se.json
100% 3.49k/3.49k [00:00<00:00, 5.46MB/s]
Downloading...
From: https://drive.google.com/uc?id=17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X
To: /content/drive/MyDrive/SubPJT2_Voice_Conversion/SE_checkpoint.pth.tar
100% 44.6M/44.6M [00:00<00:00, 172MB/s]
Downloading...
From: https://drive.google.com/uc?id=1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR
To: /content/drive/MyDrive/SubPJT2_Voice_Conversion/best_model.pth.tar
100% 380M/380M [00:05<00:00, 65.4MB/s]


### 라이브러리 import
필요한 라이브러리들을 import합니다.

In [3]:
# Install the dependencies listed in the project's requirements file
# (stored in the mounted Drive folder).
!pip install -r /content/drive/MyDrive/SubPJT2_Voice_Conversion/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting librosa==0.8.0
  Downloading librosa-0.8.0.tar.gz (183 kB)
     183.9/183.9 KB 9.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting numpy==1.19.5
  Downloading numpy-1.19.5-cp38-cp38-manylinux2010_x86_64.whl (14.9 MB)
     14.9/14.9 MB 86.5 MB/s eta 0:00:00
Collecting pypinyin
  Downloading pypinyin-0.48.0-py2.py3-none-any.whl (1.4 MB)
     1.4/1.4 MB 83.6 MB/s eta 0:00:00
Collecting pysbd
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
     71.1/71.1 KB 10.1 MB/s eta 0:00:00
Collecting pydub
  Downloading pydub-0.

In [9]:
import sys
TTS_PATH = "/content/drive/MyDrive/SubPJT2_Voice_Conversion/"

# Make the bundled TTS package importable (needed when TTS is not installed
# globally). The membership check keeps sys.path free of duplicate entries
# when this cell is executed more than once.
if TTS_PATH not in sys.path:
    sys.path.append(TTS_PATH)
print(sys.path)
import os
import string
import time
import argparse
import json

import numpy as np
import IPython
from IPython.display import Audio

import torch

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
# A single import is enough: the previous try/except retried the exact same
# import in its fallback branch, so it could never recover from a failure and
# its bare `except:` only obscured the original error.
from TTS.utils.audio import AudioProcessor


from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

['/content', '/env/python', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/usr/local/lib/python3.8/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.8/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/SubPJT2_Voice_Conversion/', '/content/drive/MyDrive/SubPJT2_Voice_Conversion/']


### Voice Conversion 모델 세팅
미리 학습된 Voice Conversion 모델을 동작시키기 위한 기본적인 세팅을 진행합니다.

In [5]:
# Work from the project directory so that the relative paths used in this
# notebook (config.json, speakers.json, best_model.pth.tar, ./jupyter/source/...)
# resolve.
# NOTE(review): the recorded execution counts and download destinations show
# this cell was run before the download cell; when executing top-to-bottom in
# a fresh runtime, run this cell first so the downloads land in this folder.
cd /content/drive/MyDrive/SubPJT2_Voice_Conversion

/content/drive/MyDrive/SubPJT2_Voice_Conversion


In [11]:
# Model file locations (all relative to the current working directory) and
# runtime settings.
MODEL_PATH = 'best_model.pth.tar'
CONFIG_PATH = 'config.json'
TTS_LANGUAGES = "language_ids.json"
TTS_SPEAKERS = "speakers.json"
SAMPLING_RATE=16000
USE_CUDA = torch.cuda.is_available()

# Load the model configuration.
C = load_config(CONFIG_PATH)
# Build the audio processor from the audio section of the config.
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Override config entries before the model is built: point the d-vector file
# at the local speakers.json and disable the speaker-encoder loss.
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)
# Load the checkpoint onto the CPU first; the model is moved to the GPU below
# when one is available.
# NOTE(review): torch.load unpickles the file, and this checkpoint is fetched
# from Google Drive earlier in the notebook -- only load it from a trusted source.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
# Drop every weight whose key mentions "speaker_encoder" before loading.
# Presumably required because the model is built with
# use_speaker_encoder_as_loss=False and has no such submodule -- TODO confirm.
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
  if "speaker_encoder" in key:
    del model_weights[key]

model.load_state_dict(model_weights)
# Switch to evaluation mode for inference.
model.eval()

if USE_CUDA:
    model = model.cuda()

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Speaker manager is loaded with 6 speakers: female-en-5, female-en-5
, female-pt-4
, male-en-2, male-en-2
, male-pt-3



### Speaker Encoder 모델 세팅
미리 학습된 Speaker Encoder 모델을 동작시키기 위한 기본적인 세팅을 진행합니다.

In [12]:
# Build the speaker-encoder wrapper from the downloaded checkpoint and config.
# It is used in the cells below to compute speaker embeddings from audio clips.
SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400


# 실습 진행하기

## Req. 2-2:	Spectrogram을 생성하는 compute_spec() 함수 구현

In [13]:
import librosa

def compute_spec(ref_file, sr=16000, n_fft=1024, hop_length=256, win_length=1024):
    """Compute the linear-magnitude STFT spectrogram of an audio file.

    The defaults match the audio settings reported when this notebook sets up
    the voice-conversion model (sample_rate 16000, fft_size 1024,
    hop_length 256, win_length 1024).

    Args:
        ref_file: Path of the audio file to load with ``librosa.load``.
        sr: Sampling rate the audio is resampled to while loading.
        n_fft: FFT size in samples; the result has ``n_fft // 2 + 1``
            frequency bins.
        hop_length: Number of samples between the starts of successive frames.
        win_length: Length of the Hann analysis window in samples.

    Returns:
        A ``torch.float32`` tensor of shape ``(1, n_fft // 2 + 1, n_frames)``
        holding the STFT magnitudes, with a leading batch dimension.
    """
    # Load the audio; the returned sampling rate equals `sr` and is not needed.
    sig, _ = librosa.load(ref_file, sr=sr)

    # Short-time Fourier transform with a Hann window and reflect padding.
    stft = librosa.stft(
        sig,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window="hann",
        pad_mode="reflect",
    )

    # Keep only the magnitude of the complex spectrogram.
    magnitude = np.abs(stft)

    # Convert to a float32 tensor and add the batch dimension.
    spec = torch.tensor(magnitude, dtype=torch.float32).unsqueeze(0)

    return spec

### Req. 2-2의 구현을 완료한 뒤 테스트 합니다.

In [14]:
#### Req. 2-2 test ####
# Run compute_spec on a sample clip and print the spectrogram's shape and
# value range.
test_audio = "/content/drive/MyDrive/SubPJT2_Voice_Conversion/jupyter/source/test.wav"
test_spec = compute_spec(test_audio)
print("shape of the test spectrogram: ", test_spec.shape)
print("max value of the test spectrogram: ", test_spec.max())
print("min value of the test spectrogram: ", test_spec.min())
#### Req. 2-2 test ####

shape of the test spectrogram:  torch.Size([1, 513, 376])
max value of the test spectrogram:  tensor(157.3934)
min value of the test spectrogram:  tensor(7.8661e-07)


### Req. 2-3의 구현을 완료한 뒤 테스트 합니다.

In [15]:
#### Req. 2-3 test ####
# Compute the speaker embedding of a single clip and print its shape and
# value range.
test_audio = "/content/drive/MyDrive/SubPJT2_Voice_Conversion/jupyter/source/test.wav"
test_emb = SE_speaker_manager.compute_speaker_embedding(test_audio)
print("shape of the test embedding: ", test_emb.shape)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### Req. 2-3 test ####

torch.Size([96161])
torch.Size([1, 96161])
shape of the test embedding:  torch.Size([1, 512])
max value of the test embedding:  tensor(0.2351, device='cuda:0')
min value of the test embedding:  tensor(-0.2167, device='cuda:0')


### Req. 2-4의 구현을 완료한 뒤 테스트 합니다.

In [16]:
#### Req. 2-4 test ####
# Compute a d-vector from a list of two clips, wrap it in a float tensor with
# a leading dimension, and print its value range.
test_audios = ["/content/drive/MyDrive/SubPJT2_Voice_Conversion/jupyter/source/test.wav", "/content/drive/MyDrive/SubPJT2_Voice_Conversion/jupyter/source/test2.wav"]
test_emb = SE_speaker_manager.compute_d_vector_from_clip(test_audios)
test_emb = torch.FloatTensor(test_emb).unsqueeze(0)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### Req. 2-4 test ####

torch.Size([96161])
torch.Size([1, 96161])
torch.Size([122561])
torch.Size([1, 122561])
max value of the test embedding:  tensor(0.1433)
min value of the test embedding:  tensor(-0.2144)


In [17]:
#### Req. 2-4 test ####
# Same check as the two-clip test, but with a list holding a single clip.
test_audios = ["/content/drive/MyDrive/SubPJT2_Voice_Conversion/jupyter/source/test2.wav"]
test_emb = SE_speaker_manager.compute_d_vector_from_clip(test_audios)
test_emb = torch.FloatTensor(test_emb).unsqueeze(0)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### Req. 2-4 test ####

torch.Size([122561])
torch.Size([1, 122561])
max value of the test embedding:  tensor(0.1345)
min value of the test embedding:  tensor(-0.2572)


# Voice Conversion 모델을 동작시킵니다.

In [73]:
print("Select target speaker reference audios files:")

# Reference clips of the target speaker (src1.m4a .. src3.m4a). The path is
# relative, so it resolves against the current working directory.
# NOTE(review): the target list is built from "src*" files while the driving
# list uses "tar*" files -- confirm the naming is not swapped.
target_files = [f"./jupyter/source/src{idx}.m4a" for idx in range(1, 4)]


Select target speaker reference audios files:


In [74]:
print("Select driving audio file:")

# Driving (source) clips tar1.m4a .. tar4.m4a, addressed by absolute path.
# NOTE(review): the driving list is built from "tar*" files while the target
# list uses "src*" files -- confirm the naming is not swapped.
driving_files = [
    f"/content/drive/MyDrive/SubPJT2_Voice_Conversion/jupyter/source/tar{idx}.m4a"
    for idx in range(1, 5)
]

Select driving audio file:


## Req. 2-5: 소스 음성과 타겟 음성의 embedding을 추출하는 함수 구현

In [75]:
################################################################################
# TODO: extract the embeddings of the source and target voices                 #
################################################################################

# NOTE(review): the Req. 2-4 test cells add a leading dimension with
# .unsqueeze(0) after this conversion; this cell does not -- confirm which
# shape model.voice_conversion expects.

# Target-speaker embedding, computed from the target reference clips.
target_emb = torch.FloatTensor(
    SE_speaker_manager.compute_d_vector_from_clip(target_files)
)

# Source-speaker embedding, computed from the driving clips.
source_emb = torch.FloatTensor(
    SE_speaker_manager.compute_d_vector_from_clip(driving_files)
)

################################################################################
# TODO: extract the embeddings of the source and target voices                 #
################################################################################



torch.Size([39382])
torch.Size([1, 39382])




torch.Size([40496])
torch.Size([1, 40496])




torch.Size([44211])
torch.Size([1, 44211])




torch.Size([23778])
torch.Size([1, 23778])




torch.Size([70961])
torch.Size([1, 70961])




torch.Size([119630])
torch.Size([1, 119630])




torch.Size([36409])
torch.Size([1, 36409])


In [76]:
# Convert the first driving clip into the target speaker's voice.
driving_file = driving_files[0]
driving_spec = compute_spec(driving_file)
# Number of spectrogram frames, passed to the model as the sequence length.
y_lengths = torch.tensor([driving_spec.size(-1)])

# One code path for both CPU and GPU instead of two duplicated branches.
device = torch.device("cuda" if USE_CUDA else "cpu")

# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    ref_wav_voc, _, _ = model.voice_conversion(
        driving_spec.to(device),
        y_lengths.to(device),
        source_emb.to(device),
        target_emb.to(device),
    )
# Bring the waveform back to the CPU as a NumPy array for playback.
ref_wav_voc = ref_wav_voc.squeeze().detach().cpu().numpy()

print("Target Speaker reference Audio")
IPython.display.display(Audio(target_files[2], rate=ap.sample_rate))

print("Source speaker reference Audio")
IPython.display.display(Audio(driving_files[0], rate=ap.sample_rate))

print("Play the converted audio:")
IPython.display.display(Audio(ref_wav_voc, rate=SAMPLING_RATE))



Target Speaker reference Audio


Source speaker reference Audio


Play the converted audio:


In [56]:
# Replay the most recently converted waveform at the model's sampling rate.
print("Play the converted audio:")
IPython.display.display(Audio(ref_wav_voc, rate=SAMPLING_RATE))

Play the converted audio:


In [33]:
# Inspect the shape of the driving spectrogram.
driving_spec.shape

torch.Size([1, 513, 164])

In [34]:
# Inspect the shape of the source-speaker embedding.
source_emb.shape

torch.Size([1, 1, 512])

In [35]:
# Inspect the shape of the target-speaker embedding.
target_emb.shape

torch.Size([1, 1, 512])