# Bark text-to-speech voice cloning.
Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark).

## Install packages

In [1]:
# Install the dependencies listed in requirements.txt into the running kernel's environment.
%pip install -r requirements.txt

Ignoring soundfile: markers 'platform_system == "Windows"' don't match your environment
Collecting sox
  Downloading sox-1.4.1-py2.py3-none-any.whl (39 kB)
Installing collected packages: sox
Successfully installed sox-1.4.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install --upgrade safetensors

Note: you may need to restart the kernel to use updated packages.


## Load models

In [1]:
from einops import pack, unpack

In [2]:
import numpy as np
import torch
import torchaudio
from encodec import EncodecModel
from encodec.utils import convert_audio

from transformers import HubertModel

from bark_hubert_quantizer.customtokenizer import CustomTokenizer

# Device that every model and tensor in this notebook is moved to.
# Defaults to the GPU when one is available and falls back to the CPU otherwise,
# so the notebook no longer crashes on machines without CUDA.
# Override manually (e.g. 'cpu' or 'cuda:0') to force a specific device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:

# Checkpoint locations, pulled out so they are easy to find and change.
hubert_checkpoint = "team-lucid/hubert-base-korean"
quantizer_checkpoint = "Literature/new_model_epoch_8.pth"  # TODO: point this at your own quantizer checkpoint

# HuBERT: produces the hidden states that are later quantized into semantic tokens.
print('Loading HuBERT...')
hubert_model = HubertModel.from_pretrained(hubert_checkpoint).to(device)

# Quantizer: maps HuBERT hidden states to semantic tokens (loaded directly onto `device`).
print('Loading Quantizer...')
quant_model = CustomTokenizer.load_from_checkpoint(quantizer_checkpoint, device)

# EnCodec (24 kHz variant): produces the codes used for the coarse/fine prompts.
print('Loading Encodec...')
encodec_model = EncodecModel.encodec_model_24khz().to(device)
encodec_model.set_target_bandwidth(6.0)  # presumably kbps; determines how many codebooks are emitted -- confirm in EnCodec docs

print('Downloaded and loaded models!')

Loading HuBERT...
Loading Quantizer...
1024 768 10000 1
Loading Encodec...
Downloaded and loaded models!


In [4]:
# Release cached GPU memory that PyTorch's allocator is holding but no longer using.
torch.cuda.empty_cache()

## Load wav and create speaker history prompt

In [161]:
wav_file = 'wav_data/여_아동_차분한/여_아동_차분한_중립.wav'  # Path of the speaker recording to clone. Edit this.
out_file = 'npz_data/여_아동_차분한/여_아동_차분한_중립'  # Path to save the cloned speaker prompt to. Edit this.

# `wav` keeps the original channels and sampling rate; it is converted for EnCodec later.
wav, sr = torchaudio.load(wav_file)
print(f'sampling rate : {sr}')

# Separate copy on the target device that is prepared for HuBERT.
wav_hubert = wav.to(device)

# Down-mix to mono by averaging channels. Checking `> 1` (instead of `== 2`)
# also covers recordings with more than two channels.
if wav_hubert.shape[0] > 1:
    wav_hubert = wav_hubert.mean(0, keepdim=True)

sampling rate : 48000


In [162]:
# The semantic model consumes 16 kHz audio, so bring the recording to that rate
# whatever its original sampling rate is.
resampler = torchaudio.transforms.Resample(
    orig_freq=sr,
    new_freq=16000).to(device)

# Resample whenever the rate differs from 16 kHz. The previous `sr > 16000`
# check silently skipped recordings sampled below 16 kHz, feeding the model
# audio at the wrong rate.
if sr != 16000:
    wav_hubert = resampler(wav_hubert)

In [163]:
print('Extracting semantics...')  # the input wav must already be at 16 kHz

# Inference only: disable autograd so no computation graph is built
# (saves GPU memory and removes the need to detach the result afterwards).
with torch.no_grad():
    semantic_vectors = hubert_model(wav_hubert).last_hidden_state

    # Pack into a single 2-D (N, d) tensor for the quantizer.
    embed, packed_shape = pack(semantic_vectors, '* d')
    # Already grad-free and on `device`, so no CPU/numpy round-trip is needed.
    semantic_vectors = embed

    print('Tokenizing semantics...')
    semantic_tokens = quant_model.get_token(semantic_vectors)

print('Creating coarse and fine prompts...')
# Convert the original recording to EnCodec's sampling rate with one channel,
# then add a leading batch dimension.
# NOTE: this rebinds `wav`, so re-run the wav loading cell before re-running this one.
wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)

wav = wav.to(device)



Extracting semantics...
Tokenizing semantics...
Creating coarse and fine prompts...


In [164]:
with torch.no_grad():
    encoded_frames = encodec_model.encode(wav)

# Each encoded frame is a tuple whose first element holds the codes. Concatenate
# them along the last (time) axis and drop only the leading batch dimension,
# e.g. [1, 8, 1702] -> [8, 1702]. `squeeze(0)` is used instead of a bare
# `squeeze()` so a very short clip (a single time step) keeps its 2-D shape.
codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze(0)

# Move everything to the CPU so it can be converted to numpy arrays.
codes = codes.cpu()
semantic_tokens = semantic_tokens.cpu()

# Write the speaker history prompt; np.savez appends ".npz" to `out_file` if missing.
# The coarse prompt is the first two codebooks of the fine prompt.
np.savez(out_file,
         semantic_prompt=semantic_tokens.squeeze().numpy(),
         fine_prompt=codes.numpy(),
         coarse_prompt=codes[:2, :].numpy()
         )

print('Done!')

Done!
