In [1]:
import torch
import torchaudio
from supervoice_gpt import SupervoiceGPT, Tokenizer, config
from IPython.display import Audio, display

In [2]:
# Model
# Prefer the GPU, but fall back to CPU so the notebook still runs on machines
# without CUDA (this used to be hard-coded to "cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = Tokenizer(config, "tokenizer_text.model")
model = SupervoiceGPT(config).to(device)

# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
checkpoint = torch.load('./output/exp-02.pt', map_location=device)
model.load_state_dict(checkpoint['model'])
model.eval()

# Value stored under 'step' in the checkpoint (presumably the training step
# at which it was saved -- confirm against the training script).
print(checkpoint['step'])

38000


In [3]:
# Generate tokens for a fixed text prompt, then show the result and how many
# rows it has.
tokens = model.generate(
    "What time is it?",
    tokenizer,
    max_new_tokens=1024,
    top_k=6,
    device=device,
)
print(tokens)
print(tokens.shape[0])

tensor([[  56,  663,  922,  830],
        [ 965,  297,  757,  693],
        [ 664,  963,   88,  609],
        ...,
        [ 676,  952,  356,  617],
        [1018,  949,  326,  401],
        [ 431,  678,  745,  822]])
1024


In [4]:
# Load the `facodec` entry point from the ex3ndr/facodec hub repository;
# trust_repo=True skips the interactive trust prompt.
facodec = torch.hub.load(
    repo_or_dir="ex3ndr/facodec",
    model="facodec",
    trust_repo=True,
)

def load_mono_audio(path, sample_rate=16000):
    """Load an audio file as a single-channel waveform at ``sample_rate`` Hz.

    Args:
        path: Location of the audio file; passed unchanged to
            ``torchaudio.load``.
        sample_rate: Target sampling rate in Hz. Defaults to 16000, the rate
            used by the rest of this notebook, so existing calls are
            unaffected.

    Returns:
        The waveform with the channel dimension removed: the first (and, after
        down-mixing, only) row of the loaded audio.
    """
    audio, sr = torchaudio.load(path)

    # Resample only when the file's native rate differs from the target.
    if sr != sample_rate:
        audio = torchaudio.transforms.Resample(sr, sample_rate)(audio)

    # Down-mix multi-channel audio by averaging across the channel axis.
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)

    # Drop the channel dimension.
    return audio[0]

# Reference clip; it is passed to facodec.speaker_embedding in the next cell.
source_style_audio = load_mono_audio("./eval_eval_0.wav")

Using cache found in /home/steve/.cache/torch/hub/ex3ndr_facodec_master


In [5]:
# Speaker embedding of the reference clip; reused as the `style` argument of
# every facodec.speech_convert call in the cells below.
style = facodec.speaker_embedding(source_style_audio)

In [6]:
# Put the code streams on the first axis: row 0 is passed as the prosody
# code, rows 1-2 as the content codes.
# NOTE(review): the printed `tokens` has four columns but only the first
# three are used here -- confirm the last one is meant to be ignored.
prosody_code = tokens.T[:1]
content_code = tokens.T[1:3]
print(content_code[0])

# Convert the generated codes to audio with the reference speaker embedding
# and play the result at 16 kHz.
waveform = facodec.speech_convert(prosody_code, content_code, style)
display(Audio(data=waveform, rate=16000))

tensor([663, 297, 963,  ..., 952, 949, 678])


In [7]:
# Load codec codes stored on disk for one dataset utterance and convert them
# with the same speaker embedding, for comparison with the generated audio.
src_codec = torch.load(
    "external_datasets/librilight-processed/16/342/canterburytales_24_chaucer_64kb_0000.codec.pt",
    map_location="cpu",
)
# Row 0 is passed as the prosody code, rows 1-2 as the content codes.
prosody_code = src_codec[:1]
content_code = src_codec[1:3]
waveform = facodec.speech_convert(prosody_code, content_code, style)
display(Audio(data=waveform, rate=16000))

In [8]:
from supervoice_gpt import Tokenizer, config
from utils.datasets import create_dataset_loader
# Recreate the tokenizer and build a dataset loader with batch size 1.
tokenizer = Tokenizer(config, "./tokenizer_text.model")
train_loader = create_dataset_loader(
    "./external_datasets/librilight-processed/files_all.txt",
    batch_size=1,
    input_length=640,
    output_length=2048,
    workers=1,
    tokenizer=tokenizer,
)

# Take the first batch; only `t` is consumed by the following cell.
x, y, t, x_l, y_l = next(iter(train_loader))

In [9]:
# Drop the leading axis of `t` (presumably the batch axis, since the loader
# uses batch_size=1 -- confirm), then put the code streams on the first axis:
# row 0 is the prosody code, rows 1-2 the content codes.
prosody_code = t.squeeze(0).T[:1]
content_code = t.squeeze(0).T[1:3]

# Convert the dataset sample to audio with the reference speaker embedding.
waveform = facodec.speech_convert(prosody_code, content_code, style)
display(Audio(data=waveform, rate=16000))
display(Audio(data=waveform, rate=16000))