In [None]:
from datasets import Audio
from transformers import EncodecModel, AutoProcessor

# Load the "facebook/encodec_24khz" EnCodec checkpoint and its matching processor.
# `model` and `processor` are read by the encoding/decoding cells below.
model = EncodecModel.from_pretrained("facebook/encodec_24khz")
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

In [None]:
from datasets import Dataset

# Wrap the local MP3 in a one-row dataset and cast the column to Audio with the
# processor's sampling rate, so the decoded waveform matches what the model expects.
dataset = Dataset.from_dict({"audio": ["./audio.mp3"]}).cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
# First (and only) row; the waveform is read later via data["audio"]["array"].
data = dataset[0]

In [None]:
import math

import torch
from tqdm import tqdm

audio_sample = data["audio"]["array"]
sample_size = len(audio_sample)
# Number of raw audio samples encoded per call (presumably sized to fit GPU memory).
block_size = 10000000
device = "cuda:7"

model.to(device)

# One array of audio codes per block, concatenated along the time axis in a later cell.
codes = []

# Encoding is pure inference: without no_grad every block would record an autograd
# graph on the GPU, which wastes memory on a long recording.
with torch.no_grad():
    for i in tqdm(range(math.ceil(sample_size / block_size))):

        # The final block may be shorter than block_size; slicing handles that.
        sample = audio_sample[i * block_size : (i + 1) * block_size]

        # preprocess the audio sample
        inputs = processor(raw_audio=sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

        # generate the codes
        encoder_outputs = model.encode(inputs["input_values"].to(device), inputs["padding_mask"].to(device), bandwidth=6)

        # Move the codes to host memory right away so GPU memory is freed per block.
        codes.append(encoder_outputs.audio_codes.detach().cpu().numpy())

In [None]:
import torch

# Ask PyTorch to release cached CUDA memory it is no longer using.
torch.cuda.empty_cache()

In [None]:
import numpy as np

# Join the per-block code arrays along the last axis, then keep index [0, 0].
# NOTE(review): assumes each array is (chunks, batch, codebooks, frames), which would
# leave a (codebooks, frames) array here — confirm against the EnCodec output shape.
encodes = np.concatenate(codes, axis=-1)[0, 0]
encodes

In [None]:
# Transpose, then flatten to 1-D, so values that shared a last-axis index end up adjacent.
# NOTE(review): if `encodes` is (codebooks, frames), this interleaves the codebooks
# frame by frame — confirm. This rebinds `encodes` in place of the 2-D array.
encodes = encodes.transpose().flatten()

In [None]:
# Inspect the smallest code value (the save cell below casts to an unsigned 16-bit type).
encodes.min()

In [None]:
# Persist the flattened codes as uint16; values outside 0..65535 would not survive the cast.
np.save("./audio_bw_6.npy", np.array(encodes, dtype=np.uint16))

In [None]:
# Split the waveform at its midpoint and preprocess only the first half.
idx = len(audio_sample)//2
sample_1 = audio_sample[:idx]
# Second half is kept but not used by any cell visible in this notebook.
sample_2 = audio_sample[idx:]
inputs = processor(raw_audio=sample_1, sampling_rate=processor.sampling_rate, return_tensors="pt")

In [None]:
# Encode the preprocessed first half in a single call on a different GPU.
# No `bandwidth` argument is passed here, unlike the blockwise loop above.
# NOTE(review): runs with autograd enabled; likely memory-hungry for a long input.
device = "cuda:6"
model.to(device)
encoder_outputs = model.encode(inputs["input_values"].to(device), inputs["padding_mask"].to(device))

In [None]:
import numpy as np
import torch

block_size = 3000
device = "cuda:7"
# NOTE(review): the save cell above writes ./audio_bw_6.npy, while this reads
# ./audio_bw_3.npy — presumably produced by an earlier run; confirm the file exists.
# This also rebinds `data`, which earlier held the dataset row.
data = np.load("./audio_bw_3.npy")
# Take the first 2 * block_size flat codes and widen them to int64 for torch.
codes = torch.from_numpy((data[:2 * block_size]).astype(np.int64))
# Regroup the flat stream into rows of 4, transpose to (4, N), and add two leading
# singleton dimensions, giving a (1, 1, 4, N) tensor.
codes = codes.view(-1, 4).transpose(0, 1)[None, None]

model.to(device)

# Decode back to audio; [None] is the second positional argument
# (presumably the per-chunk audio scales — TODO confirm). [0] selects the first output.
audio_values = model.decode(codes.to(device), [None])[0]

In [None]:
# Bring the decoded audio to host memory as NumPy and keep index [0, 0]
# (presumably first batch item, first channel — confirm).
values = audio_values.detach().cpu().numpy()[0, 0]

In [None]:
# Check the shape of the decoded waveform before writing it to disk.
values.shape

In [None]:
import soundfile as sf
# Write the decoded waveform at the processor's sampling rate; 'PCM_24' is passed as the subtype.
sf.write("./sample.wav", values, processor.sampling_rate, 'PCM_24')

In [None]:
# Inspect the length of the stored code stream.
np.load("./audio_bw_3.npy").shape

## Sampling Audio

In [None]:
import os
import torch
from datasets import Audio
from transformers import EncodecModel, AutoProcessor

# Load the EnCodec checkpoint under the name `audio_model`, so that `model` can be
# rebound to the generative model loaded below.
audio_model = EncodecModel.from_pretrained("facebook/encodec_24khz")
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

def load_model(file_path: str, latest: bool = True):
    """
    Load a serialized model from a checkpoint file or a directory of checkpoints.

    Args:
        file_path: Path to a checkpoint file, or to a directory holding checkpoints.
        latest: Only used when file_path is a directory. If True, load the last
            entry in natural sort order (so "gpt_1000.pt" ranks after "gpt_200.pt");
            if False, load the first entry.

    Returns:
        The object that torch.load deserializes from the selected file.

    Raises:
        FileNotFoundError: If file_path is a directory with no entries.
    """
    import re  # local import: only needed for the natural-sort key below

    def _natural_key(name: str):
        # re.split with a capturing group alternates text / digit runs, so odd
        # positions are always digit runs and can be compared as integers.
        return [int(part) if pos % 2 else part
                for pos, part in enumerate(re.split(r"(\d+)", name))]

    if os.path.isdir(file_path):
        # List the directory that was passed in rather than a hard-coded "./models".
        models = sorted(os.listdir(file_path), key=_natural_key)
        if not models:
            raise FileNotFoundError(f"No model files found in directory: {file_path}")
        model_path = os.path.join(file_path, models[-1] if latest else models[0])
    else:
        model_path = file_path

    print(f"Loading model at path: {model_path}")

    # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
    model = torch.load(model_path)
    return model

# Rebind `model` to the checkpoint that load_model picks from ./models (latest=True by default).
model = load_model("./models")

In [None]:
import soundfile as sf

# Single-token prompt with batch size 1.
prompt = torch.tensor([[512]])
device = "cuda:7"
# NOTE(review): 1 prompt token + 2047 new tokens = 2048, which divides evenly into the
# rows of 4 used below — assuming generate returns prompt + new tokens; confirm.
max_new_tokens = 2047

model.to(device)
audio_model.to(device)
tokens = model.generate(prompt.to(device), max_new_tokens=max_new_tokens)
# Regroup the flat token stream of the first sequence into rows of 4, transpose to
# (4, N), and add two leading singleton dimensions before decoding.
tokens = tokens[0].view(-1, 4).transpose(0, 1)[None, None]
sample = audio_model.decode(tokens, [None])
# Flatten the decoded audio into a 1-D NumPy array on the host.
sample = sample.audio_values.flatten().detach().cpu().numpy()

sf.write("./sample_3.wav", sample, processor.sampling_rate, 'PCM_24')

## Speech Tokenizer

In [47]:
from speechtokenizer import SpeechTokenizer

# Load SpeechTokenizer from its local config and checkpoint, and switch it to eval mode.
# This rebinds `model`, which previously held the generative model from the section above.
config_path = './SpeechTokenizer/config.json'
ckpt_path = './SpeechTokenizer/SpeechTokenizer.pt'
model = SpeechTokenizer.load_from_checkpoint(config_path, ckpt_path)
model.eval()

SpeechTokenizer(
  (encoder): SEANetEncoder(
    (model): Sequential(
      (0): SConv1d(
        (conv): NormConv1d(
          (conv): Conv1d(1, 64, kernel_size=(7,), stride=(1,))
          (norm): Identity()
        )
      )
      (1): SEANetResnetBlock(
        (block): Sequential(
          (0): ELU(alpha=1.0)
          (1): SConv1d(
            (conv): NormConv1d(
              (conv): Conv1d(64, 32, kernel_size=(3,), stride=(1,))
              (norm): Identity()
            )
          )
          (2): ELU(alpha=1.0)
          (3): SConv1d(
            (conv): NormConv1d(
              (conv): Conv1d(32, 64, kernel_size=(1,), stride=(1,))
              (norm): Identity()
            )
          )
        )
        (shortcut): SConv1d(
          (conv): NormConv1d(
            (conv): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
            (norm): Identity()
          )
        )
      )
      (2): ELU(alpha=1.0)
      (3): SConv1d(
        (conv): NormConv1d(
          (conv):

In [None]:
import torchaudio
import torch
device = "cuda:7"

# Load the waveform together with its native sample rate.
wav, sr = torchaudio.load('./audio.mp3')

# Keep only the first channel when the file has more than one.
if wav.shape[0] > 1:
    wav = wav[:1,:]

# Resample when the file's rate differs from the tokenizer's sample rate.
if sr != model.sample_rate:
    wav = torchaudio.functional.resample(wav, sr, model.sample_rate)

# Add a leading dimension; the encode loop below slices this tensor on its third axis.
# Re-running this cell on its own output would add yet another dimension.
wav = wav.unsqueeze(0)

In [None]:
import math
from tqdm import tqdm

# Number of audio samples along the last axis of the 3-D waveform tensor.
sample_size = wav.shape[2]
# Samples encoded per call; the final block may be shorter.
block_size = 30000000
# One NumPy array of codes per block, concatenated in the next cell.
codecs = []

model.to(device)
for i in tqdm(range(math.ceil(sample_size/block_size))):
    
    sample = wav[:, :, i * block_size : (i + 1) * block_size]
    
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        codes = model.encode(sample.to(device)) # codes: (n_q, B, T)
    
    # After the loop, `codes` still holds the output of the last block only.
    codecs.append(codes.detach().cpu().numpy())

In [None]:
import numpy as np

# Join the per-block codes along the last (time) axis into one array.
tokens = np.concatenate(codecs, axis=-1)
tokens.shape

In [None]:
# Repeat of the shape check from the previous cell.
tokens.shape

In [None]:
# Save tokens[0, 0] — the first quantizer layer of the first batch item, given the
# (n_q, B, T) layout noted in the encode loop — as uint16.
np.save("./semantic_tokens.npy", np.array(tokens[0, 0], dtype=np.uint16))

In [None]:
# Reload the saved first-layer tokens from disk.
st = np.load("./semantic_tokens.npy")

In [None]:
# Confirm the reloaded array is 1-D with one entry per frame.
st.shape

In [None]:
# Save the full token array (all quantizer layers) as uint16.
np.save("./tokens.npy", np.array(tokens, dtype=np.uint16))

In [None]:
# Split along axis 0: entry 0 is named semantic, the remaining entries acoustic.
# NOTE(review): `codes` is whatever the encode loop assigned last (the final block
# only), not the concatenated `tokens` array — confirm this is intended.
semantic_tokens = codes[0, :, :]
acoustic_tokens = codes[1:, :, :]

In [None]:
# Window length in frames.
bs = 5000
# Random start index such that the window fits inside the token array (unseeded).
idx = np.random.randint(tokens.shape[2] - bs)
# Use only the first 3 quantizer layers of the window, widened to int64 for torch.
audio_tokens = torch.from_numpy(tokens[:3, :, idx: idx + bs].astype(np.int64))
# NOTE: this rebinds `wav`, replacing the input waveform loaded earlier; re-running the
# encode cell after this one would encode the decoded sample instead.
wav = model.decode(audio_tokens.to(device))
torchaudio.save("./sample_4.mp3", wav.squeeze(0).detach().cpu(), model.sample_rate)

In [None]:
# Decode a fixed slice of the first-layer tokens alone, reshaped to (1, 1, T).
audio_tokens = torch.from_numpy(st[1400:7856].reshape(1, 1, -1).astype(np.int64))
wav = model.decode(audio_tokens.to(device))

In [None]:
# Drop the leading dimension, move to host memory, and write at the tokenizer's sample rate.
torchaudio.save("./reset_3.mp3", wav.squeeze(0).detach().cpu(), model.sample_rate)

In [49]:
import torch
import numpy as np
import torchaudio

tokens = np.load("./data/tokens.npy")
# Window length in frames.
bs = 3000
device = "cuda:0"
# `model` is the SpeechTokenizer loaded at the top of this section. It is the model
# moved to `device` here, so it is also the one used for decoding below. Read top to
# bottom, `audio_model` would still be the EnCodec model, which has a different
# decode signature and no `sample_rate`.
model.to(device)

# Random start index such that the window fits inside the token array.
idx = np.random.randint(tokens.shape[2] - bs)

# Use the first 2 quantizer layers of the window, widened to int64 for torch.
audio_tokens = torch.from_numpy(tokens[:2, :, idx: idx + bs].astype(np.int64))
# Decoding is inference only, so gradient tracking is disabled.
with torch.no_grad():
    wav = model.decode(audio_tokens.to(device))
torchaudio.save("./sample.mp3", wav.squeeze(0).detach().cpu(), model.sample_rate)

import IPython
IPython.display.Audio("./sample.mp3")

In [54]:
# Decode the same window again (idx and bs come from the previous cell).
audio_tokens = torch.from_numpy(tokens[:2, :, idx: idx + bs].astype(np.int64))
# Decode with `model`, the SpeechTokenizer that the previous cell moved to `device`.
# Read top to bottom, `audio_model` would still be the EnCodec model at this point.
with torch.no_grad():
    wav = model.decode(audio_tokens.to(device))
torchaudio.save("./sample.mp3", wav.squeeze(0).detach().cpu(), model.sample_rate)

import IPython
IPython.display.Audio("./sample.mp3")

In [24]:
import torch
import torchaudio
from models import GPT

# Load the checkpoint dict; it is read below via its "config" and "model" keys.
params = torch.load("./models/gpt_0.pt")
# Rebuild the GPT from the stored config. This rebinds `model`, which held the SpeechTokenizer.
model = GPT(params["config"])
# The model is compiled before the weights are loaded.
# NOTE(review): presumably the state dict was saved from a compiled model and its
# keys only match the compiled wrapper — confirm.
model = torch.compile(model)
model.load_state_dict(params["model"])
# params["model"]

Initialzed a new model with config:
     vocab_size: 1024
     context_length: 2048
     emb_dim: 384
     n_heads: 6
     n_layers: 6
     head_dim: 64
     drop_rate: 0.1
     qkv_bias: False
     mlp_hidd_dim: 1536
     verbose: True
     batch_size: 32
     device: cuda:6
     lr: 0.0001
     steps: 20000
     eval_iters: 100
Total parameters in the model: 12.21 Million
Total size of the model: 46.59 MB


<All keys matched successfully>

In [29]:
device = "cuda:0"

tokens = np.load("./data/tokens.npy")
# Prompt length in frames. It must be assigned before it bounds the random start
# index; otherwise the bound comes from a stale `bs` left over from another cell.
bs = 512
idx = np.random.randint(tokens.shape[2] - bs)

model.eval()
model.to(device)
# Prompt with `bs` frames of the first quantizer layer ([0] drops the layer axis),
# then generate 2048 tokens. Inference only, so gradient tracking is disabled.
with torch.no_grad():
    gens = model.generate(torch.from_numpy(tokens[:1, :, idx: idx + bs].astype(np.int64)).to(device)[0], 2048)

In [30]:
from speechtokenizer import SpeechTokenizer

# Load SpeechTokenizer again, this time as `audio_model`, since `model` now holds the GPT.
config_path = './SpeechTokenizer/config.json'
ckpt_path = './SpeechTokenizer/SpeechTokenizer.pt'
audio_model = SpeechTokenizer.load_from_checkpoint(config_path, ckpt_path)
audio_model.eval()
audio_model.to(device)

# Skip the first `bs` positions (the prompt length used for generation) and add a
# leading axis before decoding.
# NOTE(review): assumes `gens` is (batch, prompt + generated) — confirm.
wav = audio_model.decode(gens[:, bs:].unsqueeze(0))
torchaudio.save("./sample.mp3", wav.squeeze(0).detach().cpu(), audio_model.sample_rate)

import IPython
IPython.display.Audio("./sample.mp3")

## Create a Class

In [38]:
import IPython.display
import numpy as np
import torch
import torchaudio
from tqdm import tqdm
from models import GPT
from speechtokenizer import SpeechTokenizer

device = "cuda:0"
max_tokens = 3000
bs = 512 # prompt length, in frames, taken from the dataset to seed generation

# Load the audio dataset in tokenized form
tokens = np.load("./data/tokens.npy")
# Random start index such that a window of `bs` frames fits inside the array
idx = np.random.randint(tokens.shape[2] - bs)

# Load the audio tokenizer model
config_path = './SpeechTokenizer/config.json'
ckpt_path = './SpeechTokenizer/SpeechTokenizer.pt'
audio_model = SpeechTokenizer.load_from_checkpoint(config_path, ckpt_path)
audio_model.eval()
audio_model.to(device)

# Load the language model. Tensors are mapped to CPU first so loading does not depend
# on the GPU the checkpoint was saved from; the model is moved to `device` below.
params = torch.load("./models/gpt_1000.pt", map_location="cpu")
params["config"]["verbose"] = False # Set the verbosity to False to avoid unnecessary clutter
model = GPT(params["config"])
# Compile before loading the weights, as in the original cell
model = torch.compile(model)
model.load_state_dict(params["model"])
model.eval()
model.to(device)

# Prompt: `bs` frames of the first quantizer layer only. Indexing layer 0 directly
# replaces the original `[:2]` slice, whose second layer was dropped again by `[0]`.
prompt_tokens = torch.from_numpy(tokens[0, :, idx: idx + bs].astype(np.int64)).to(device)

# Generation and decoding are inference only, so gradient tracking is disabled
with torch.no_grad():
    # Generate the audio from the language model
    gens = model.generate(prompt_tokens, max_tokens)

    # Decode the generated tokens from the language model (the prompt is skipped)
    wav = audio_model.decode(gens[:, bs:].unsqueeze(0))

# Save the generated audio or optionally display it
torchaudio.save("./sample.mp3", wav.squeeze(0).detach().cpu(), audio_model.sample_rate)
IPython.display.Audio("./sample.mp3")

In [37]:
# Shape of the stored token array, as loaded from ./data/tokens.npy.
tokens.shape

(8, 1, 1952799)