# Bark text-to-speech voice cloning.
Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark).

## Install packages

In [5]:
%pip install -r requirements.txt

Ignoring soundfile: markers 'platform_system == "Windows"' don't match your environment
Collecting audiolm-pytorch==1.1.4
  Downloading audiolm_pytorch-1.1.4-py3-none-any.whl (37 kB)
Collecting fairseq
  Downloading fairseq-0.12.2.tar.gz (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting sox
  Downloading sox-1.4.1-py2.py3-none-any.whl (39 kB)
Collecting local-attention>=1.8.4
  Downloading local_attention-1.8.6-py3-none-any.whl (8.1 kB)
Collecting ema-pytorch>=0.2.2
  Downloading ema_pytorch-0.2.3-py3-none-any.whl (4.4 kB)
Collecting beartype
  Downloading beartype-0.16.3-py3-none-any.whl (816 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8

In [2]:
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/cu117
Collecting pillow!=8.3.*,>=5.3.0
  Downloading https://download.pytorch.org/whl/Pillow-9.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 12.9 MB/s eta 0:00:01
[?25hInstalling collected packages: pillow
Successfully installed pillow-9.3.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
torch.cuda.empty_cache()

## Load models

In [8]:
device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')

import numpy as np
import torch
import torchaudio
from encodec import EncodecModel
from encodec.utils import convert_audio

from transformers import HubertModel

from bark_hubert_quantizer.customtokenizer import CustomTokenizer

print('Loading HuBERT...')
hubert_model = HubertModel.from_pretrained("team-lucid/hubert-base-korean")
hubert_model.to(device)

print('Loading Quantizer...')
quant_model = CustomTokenizer.load_from_checkpoint("tokenizer.pth", device)

print('Loading Encodec...')
encodec_model = EncodecModel.encodec_model_24khz()
encodec_model.set_target_bandwidth(6.0)
encodec_model.to(device)

print('Downloaded and loaded models!')

Loading HuBERT...
Loading Quantizer...
Loading Encodec...
Downloaded and loaded models!


## Load wav and create speaker history prompt

In [9]:
wav_file = 'file/wav/test.wav'  # Put the path of the speaker you want to use here.
out_file = 'file/npz'  # Put the path to save the cloned speaker to here.

wav, sr = torchaudio.load(wav_file)

wav_hubert = wav.to(device)

if wav_hubert.shape[0] == 2:  # Stereo to mono if needed
    wav_hubert = wav_hubert.mean(0, keepdim=True)

In [10]:
print('Extracting semantics...')
semantic_vectors = hubert_model.forward(wav_hubert).last_hidden_state # input wav는 16kHz # output은 [1, 3125, 768]

Extracting semantics...


In [6]:
print('Tokenizing semantics...')
semantic_tokens = quant_model.get_token(semantic_vectors) # [1, 10000]

In [13]:
print('Creating coarse and fine prompts...')
wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)

wav = wav.to(device)

Creating coarse and fine prompts...


In [19]:
with torch.no_grad():
    encoded_frames = encodec_model.encode(wav)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # 얘가 뭔 리스트 안에 튜플 안에 있어서 그거 꺼내주고 [1, 8, 1702]를 [8, 1702]로 unsqueeze

codes = codes.cpu()
semantic_tokens = semantic_tokens.cpu()

np.savez(out_file,
         semantic_prompt=semantic_tokens.squeeze(), # [10000]으로 squeeze 해줌
         fine_prompt=codes,
         coarse_prompt=codes[:2, :]
         )

print('Done!')

Done!


In [52]:
semantic_vectors.size()

torch.Size([1, 3125, 768])

In [15]:
codes.size()

torch.Size([8, 1702])

In [16]:
codes[:2, :].size()

torch.Size([2, 1702])

In [20]:
data = np.load('file/ko_speaker_1.npz')

In [33]:
data['fine_prompt'].shape

(8, 480)

In [34]:
encodec_model.sample_rate

24000