In [172]:
# User configuration: edit both values before running the rest of the notebook.

# Hugging Face dataset repo to read from; later cells use its "audio" and "text" columns.
my_original_dataset_name = "canopylabs/zac-sample-dataset"


# Hugging Face dataset repo the tokenised result is pushed to.
# CHANGE TO YOUR NAMESPACE: replace the part before "/" with your own user or organisation.
name_to_push_dataset_to = "cubbk/zac_sample-dataset-tokenised"

In [173]:
!sudo apt-get update && sudo apt-get install -y ffmpeg

Hit:1 http://archive.ubuntu.com/ubuntu focal InRelease
Hit:2 http://security.ubuntu.com/ubuntu focal-security InRelease
Hit:3 http://ppa.launchpad.net/flexiondotorg/nvtop/ubuntu focal InRelease
Hit:4 http://archive.ubuntu.com/ubuntu focal-updates InRelease
Hit:5 http://archive.ubuntu.com/ubuntu focal-backports InRelease
Reading package lists... Done
Building dependency tree       
Reading state information... Done
All packages are up to date.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:4.2.7-0ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.


In [174]:
%pip install transformers datasets torchaudio snac torch torchaudio huggingface_hub torchcodec

Note: you may need to restart the kernel to use updated packages.


In [175]:
#@title Installation & Setup
import locale
locale.getpreferredencoding = lambda: "UTF-8"
import torch
from snac import SNAC
from huggingface_hub import snapshot_download
from datasets import load_dataset

dsn = my_original_dataset_name

# Download the dataset repository files (main revision) before loading the split.
snapshot_download(repo_id=dsn, repo_type="dataset", revision="main", max_workers=64)

ds = load_dataset(dsn, split="train")
# The sampling rate is read from the first row only and reused when resampling.
# NOTE(review): this assumes every row shares the first row's sampling rate - confirm.
ds_sample_rate = ds[0]["audio"]["sampling_rate"]

# Load the SNAC 24 kHz checkpoint and move it to the GPU.
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

In [176]:
#@title Tokenisation Function
import torchaudio.transforms as T
def tokenise_audio(waveform):
  """Encode one waveform with SNAC and flatten the codes into 7-id frames.

  The waveform is resampled from the module-level `ds_sample_rate` to 24000 Hz
  and encoded by the module-level `model`. The three code levels are then
  interleaved so that step i of the first level yields seven ids in the order
  [L0[i], L1[2i], L2[4i], L2[4i+1], L1[2i+1], L2[4i+2], L2[4i+3]].
  Slot k of a frame is shifted by 128266 + k * 4096.

  Args:
    waveform: NumPy array of audio samples (assumed 1-D mono - TODO confirm).

  Returns:
    A flat list of ids, seven per step of the first code level.
  """
  audio_token_base = 128266  # added to every code before the per-slot shift
  slot_width = 4096          # each of the 7 frame slots gets its own 4096-wide id band

  waveform = torch.from_numpy(waveform).unsqueeze(0)
  waveform = waveform.to(dtype=torch.float32)
  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
  waveform = resample_transform(waveform)
  waveform = waveform.unsqueeze(0).to("cuda")
  # generate the codes from snac
  with torch.inference_mode():
    codes = model.encode(waveform)

  # Convert each code level to a Python list once instead of calling `.item()`
  # seven times per frame (each `.item()` on a GPU tensor is a separate copy).
  level0 = codes[0][0].tolist()
  level1 = codes[1][0].tolist()
  level2 = codes[2][0].tolist()

  all_codes = []
  for i, first_code in enumerate(level0):
    frame = (
      first_code,
      level1[2 * i],
      level2[4 * i],
      level2[4 * i + 1],
      level1[2 * i + 1],
      level2[4 * i + 2],
      level2[4 * i + 3],
    )
    all_codes.extend(
      code + audio_token_base + slot * slot_width
      for slot, code in enumerate(frame)
    )

  return all_codes




In [177]:
#@title Map Tokenize
import random
def add_codes(example):
    """Attach the SNAC ids for a row's audio as `example["codes_list"]`.

    Rows whose `audio` entry is missing, or whose audio array is missing or
    empty, get `codes_list = None` instead of raising, so they can be filtered
    out afterwards.

    Args:
        example: Dataset row with an "audio" entry exposing an "array" item.

    Returns:
        The same row, with "codes_list" set to a list of ids or None.
    """
    # Always initialize codes_list to None
    codes_list = None

    answer_audio = example["audio"]
    # If there's a valid audio array, tokenise it
    if answer_audio is not None:
        audio_array = answer_audio["array"]
        if audio_array is not None and len(audio_array) > 0:
            codes_list = tokenise_audio(audio_array)

    example["codes_list"] = codes_list

    return example


# Tokenise every row's audio; the raw "audio" column is dropped from the result.
ds = ds.map(
    add_codes,
    remove_columns=["audio"],
)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [178]:
#@title Load Tokenizer
tokeniser_length = 128256
start_of_text, end_of_text = 128000, 128009

# Seven control ids placed directly after `tokeniser_length`
# (tokeniser_length + 1 through tokeniser_length + 7, in this order).
(
    start_of_speech,
    end_of_speech,
    start_of_human,
    end_of_human,
    start_of_ai,
    end_of_ai,
    pad_token,
) = range(tokeniser_length + 1, tokeniser_length + 8)

audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"

from transformers import AutoTokenizer
import os
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Leave two cores free but never drop below one worker: `os.cpu_count()` may
# return None, and `datasets` rejects `num_proc <= 0` (e.g. on a 2-core machine).
num_proc = max(1, (os.cpu_count() or 1) - 2)

# Debug dump: prints the text and the full code list of every row.
for example in ds:
    print(example["text"])
    print(example["codes_list"])

# Drop rows that have no codes (None or empty list) in a single pass.
ds = ds.filter(lambda x: x["codes_list"] is not None and len(x["codes_list"]) > 0)

Jake, have you started packing for your camping trip this weekend?
[130334, 135924, 137439, 141689, 147667, 152775, 153416, 131482, 134788, 138037, 141514, 145086, 149250, 154338, 132085, 133025, 139026, 141444, 144688, 151164, 154862, 129332, 132393, 136849, 141599, 145340, 150074, 155307, 131068, 132560, 137849, 144087, 146722, 149013, 156318, 132085, 134318, 138357, 144522, 146447, 151983, 155155, 129745, 132938, 137503, 140757, 146548, 151818, 156549, 129535, 133218, 139171, 144151, 145105, 150587, 154067, 130703, 133860, 138072, 144497, 148222, 152410, 153102, 128270, 136032, 139386, 140761, 145865, 149191, 156499, 131948, 133604, 137782, 142527, 146478, 150701, 153196, 131720, 134387, 139847, 143528, 147584, 152403, 153287, 129103, 132621, 137738, 142973, 145745, 148783, 155925, 131449, 133160, 137584, 140821, 148530, 149604, 156908, 131813, 134370, 137866, 144245, 144920, 150148, 153617, 130136, 132518, 140275, 144584, 148269, 149890, 156336, 130489, 136288, 136572, 140845, 1452

Filter:   0%|          | 0/20 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20 [00:00<?, ? examples/s]

In [179]:
#@title Create Input Ids
def remove_duplicate_frames(example):
    """Drop 7-id frames whose first id repeats the previously kept frame's first id.

    The first frame is always kept. Each later frame is compared with the most
    recently kept frame (not merely the preceding input frame); only the first
    id of each frame takes part in the comparison.

    Args:
        example: Row with a "codes_list" list whose length is a multiple of 7.

    Returns:
        The same row with "codes_list" replaced by the de-duplicated list.

    Raises:
        ValueError: If the list length is not divisible by 7.
    """
    codes = example["codes_list"]
    if len(codes) % 7:
        raise ValueError("Input list length must be divisible by 7")

    # Slicing copies, so the original list is never mutated.
    kept = codes[:7]
    for start in range(7, len(codes), 7):
        frame = codes[start:start + 7]
        # kept[-7] is the first id of the last frame that was kept.
        if frame[0] != kept[-7]:
            kept += frame

    example["codes_list"] = kept
    return example

# De-duplicate frames in every row, using `num_proc` worker processes.
ds = ds.map(
    remove_duplicate_frames,
    num_proc=num_proc,
)

# Hint shown to the user about customising the text prompt.
tok_info = '''*** HERE you can modify the text prompt
i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
f"{example["source"]}:  {example["text"]}", as is passed.
'''
print(tok_info)

def create_input_ids(example):
    """Build the training sequence for one row from its text and audio codes.

    Sequence layout: start_of_human, tokenised text, end_of_text, end_of_human,
    start_of_ai, start_of_speech, audio codes, end_of_speech, end_of_ai.

    Args:
        example: Row with a "text" string and a "codes_list" list of ids.

    Returns:
        The same row with "text_tokens", "input_ids", "labels" (the same list
        as "input_ids") and an all-ones "attention_mask" added.
    """
    text_tokens = tokenizer.encode(example["text"], add_special_tokens=True) + [end_of_text]
    example["text_tokens"] = text_tokens

    sequence = [
        start_of_human,
        *text_tokens,
        end_of_human,
        start_of_ai,
        start_of_speech,
        *example["codes_list"],
        end_of_speech,
        end_of_ai,
    ]

    example["input_ids"] = sequence
    example["labels"] = sequence
    example["attention_mask"] = [1] * len(sequence)
    return example

# Build the model inputs for every row; the source text and raw codes are dropped.
ds = ds.map(
    create_input_ids,
    num_proc=num_proc,
    remove_columns=["text", "codes_list"],
)


num_proc must be <= 20. Reducing num_proc to 20 for dataset of size 20.


Map (num_proc=20):   0%|          | 0/20 [00:00<?, ? examples/s]

num_proc must be <= 20. Reducing num_proc to 20 for dataset of size 20.


*** HERE you can modify the text prompt
i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
f"{example["source"]}:  {example["text"]}", as is passed.



Map (num_proc=20):   0%|          | 0/20 [00:00<?, ? examples/s]

In [180]:
#@title Remove unnecessary columns
# Only these columns are kept in the pushed dataset; everything else is dropped.
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [
    name
    for name in ds.column_names
    if name not in columns_to_keep
]

ds = ds.remove_columns(columns_to_remove)

In [181]:
# Upload the tokenised dataset to the configured Hub repository.
ds.push_to_hub(repo_id=name_to_push_dataset_to)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/cubbk/zac_sample-dataset-tokenised/commit/4cdcfa09f4c983b083f8b6fab1a4661131d28b31', commit_message='Upload dataset', commit_description='', oid='4cdcfa09f4c983b083f8b6fab1a4661131d28b31', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cubbk/zac_sample-dataset-tokenised', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cubbk/zac_sample-dataset-tokenised'), pr_revision=None, pr_num=None)