In [None]:
from datasets import load_dataset

# Hub repository id used when the finished dataset is uploaded.
name_to_push_dataset_to = "cubbk/rixvox-tokenised"


# Read the dev metadata parquet and keep only its first 10 rows.
df = load_dataset("parquet", data_files="../dataset/dev_metadata.parquet")["train"] # type: ignore
df = df.select(range(10)) # type: ignore

# Drop the listed metadata columns from the dataset.
df = df.remove_columns([
    "dokid",
    "anforande_nummer",
    "observation_nr",
    "speaker",
    "party",
    "gender",
    "debatedate",
    "electoral_district",
    "birth_year",
    "intressent_id",
    "speaker_from_id",
    "speaker_audio_meta",
    "start",
    "end",
    "duration",
    "bleu_score",
    "speaker_total_hours",
])

df

Dataset({
    features: ['text', 'filename'],
    num_rows: 10
})

In [44]:
import locale
# Replace locale.getpreferredencoding so it always reports "UTF-8".
# NOTE(review): presumably a workaround for hosts whose locale is not UTF-8 —
# confirm it is still required before keeping this monkeypatch.
locale.getpreferredencoding = lambda: "UTF-8"
import torch
from snac import SNAC


# Load the pretrained SNAC checkpoint "hubertsiuzdak/snac_24khz"; the
# tokenisation function below reads this module-level `model`.
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
# Place the model on the CPU. As the last expression of the cell this also
# displays the module's repr.
model.to("cpu")

SNAC(
  (encoder): Encoder(
    (block): Sequential(
      (0): ParametrizedConv1d(
        1, 48, kernel_size=(7,), stride=(1,), padding=(3,)
        (parametrizations): ModuleDict(
          (weight): ParametrizationList(
            (0): _WeightNorm()
          )
        )
      )
      (1): EncoderBlock(
        (block): Sequential(
          (0): ResidualUnit(
            (block): Sequential(
              (0): Snake1d()
              (1): ParametrizedConv1d(
                48, 48, kernel_size=(7,), stride=(1,), padding=(3,), groups=48
                (parametrizations): ModuleDict(
                  (weight): ParametrizationList(
                    (0): _WeightNorm()
                  )
                )
              )
              (2): Snake1d()
              (3): ParametrizedConv1d(
                48, 48, kernel_size=(1,), stride=(1,)
                (parametrizations): ModuleDict(
                  (weight): ParametrizationList(
                    (0): _WeightNorm()
    

In [45]:
#@title Tokenisation Function
import torchaudio.transforms as T
from datasets import Audio

def tokenise_audio(waveform, sp, audio_token_offset=128266, codebook_size=4096):
  """Encode a waveform into a flat list of offset SNAC token ids.

  The samples are resampled to 24 kHz and encoded with the module-level
  ``model``. The three code levels returned by ``model.encode`` are then
  interleaved into frames of seven ids, in this order per frame ``i``::

      level0[i], level1[2i], level2[4i], level2[4i+1],
      level1[2i+1], level2[4i+2], level2[4i+3]

  Slot ``k`` (0..6) of every frame is shifted by
  ``audio_token_offset + k * codebook_size`` so each slot occupies its own
  id range.

  Args:
    waveform: NumPy array of audio samples; expected to be 1-D (mono), since
      two leading axes are added before encoding. Converted to float32 here.
    sp: sampling rate of ``waveform`` in Hz.
    audio_token_offset: id added to every code. The default is the value
      that was previously hard-coded in this function.
    codebook_size: id stride between consecutive frame slots. The default is
      the value that was previously hard-coded in this function.

  Returns:
    A list of Python ints whose length is ``7 * n``, where ``n`` is the
    number of level-0 codes produced for the clip.
  """
  waveform = torch.from_numpy(waveform).unsqueeze(0).to(dtype=torch.float32)
  waveform = T.Resample(orig_freq=sp, new_freq=24000)(waveform)
  waveform = waveform.unsqueeze(0).to("cpu")

  # Everything that touches the encoder output stays inside inference mode.
  with torch.inference_mode():
    codes = model.encode(waveform)

    # assumes codes[0], codes[1], codes[2] are (1, n), (1, 2n), (1, 4n) —
    # this is what the 1:2:4 indexing scheme requires. Slicing to the
    # expected length ignores any surplus codes at the finer levels.
    n_frames = codes[0].shape[1]
    coarse = codes[0][0][:n_frames]
    mid = codes[1][0][: 2 * n_frames].reshape(n_frames, 2)
    fine = codes[2][0][: 4 * n_frames].reshape(n_frames, 4)

    # One row per frame, columns in the interleaving order documented above.
    frames = torch.stack(
      [coarse, mid[:, 0], fine[:, 0], fine[:, 1], mid[:, 1], fine[:, 2], fine[:, 3]],
      dim=1,
    )
    slot_offsets = audio_token_offset + codebook_size * torch.arange(
      7, dtype=frames.dtype, device=frames.device
    )
    return (frames + slot_offsets).flatten().tolist()

In [46]:
import os
import numpy as np
import soundfile as sf
from datasets import Dataset

# Add an "audio_path" column by joining each row's "filename" onto the
# ../dataset/dev/ directory.
ds = df.map(lambda ex: {"audio_path": os.path.join("../dataset/dev/", ex["filename"])}) # type: ignore

def load_audio(example):
    """Read the file at ``example["audio_path"]`` and attach its samples.

    Audio with more than one dimension is averaged over axis 1 to give a
    single channel. The samples are stored as float32 under ``"waveform"``
    and the file's sampling rate under ``"sampling_rate"``.

    Returns the same ``example`` mapping with the two keys added.
    """
    samples, sample_rate = sf.read(example["audio_path"])
    # Collapse multi-channel audio to mono; 1-D input passes through as is.
    mono = samples if samples.ndim <= 1 else np.mean(samples, axis=1)
    example["waveform"] = mono.astype("float32")
    example["sampling_rate"] = sample_rate
    return example

# Decode every referenced audio file; the samples and sampling rate become
# dataset columns.
ds = ds.map(load_audio)

# Show the sampling rate of the first row as a quick sanity check.
ds[0]["sampling_rate"] # type: ignore

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

16000

In [47]:
def add_codes(example):
    """Tokenise the row's waveform and store the ids under ``"codes_list"``.

    The stored waveform is first coerced to a float32 NumPy array, then
    passed with the row's sampling rate to ``tokenise_audio``.

    Returns the same ``example`` mapping with ``"codes_list"`` added.
    """
    samples = np.asarray(example["waveform"], dtype=np.float32)
    sample_rate = example["sampling_rate"]
    example["codes_list"] = tokenise_audio(samples, sample_rate)
    return example

# Tokenise every clip, then drop the raw audio columns that are no longer needed.
ds = ds.map(add_codes, remove_columns=["waveform","sampling_rate","audio_path"])

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [48]:
#@title Load Tokenizer
# Special-token ids. The ids below 128256 are fixed literals; the rest are
# defined relative to tokeniser_length.
# NOTE(review): presumably 128256 is the size of the text tokenizer's
# vocabulary — confirm against the tokenizer loaded below.
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2

start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4

start_of_ai = tokeniser_length + 5
end_of_ai =  tokeniser_length + 6
pad_token = tokeniser_length + 7

# 128256 + 10 = 128266, the same base offset tokenise_audio adds to every code.
audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"

from transformers import AutoTokenizer
import os
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# os.cpu_count() may return None, and "count - 2" is zero or negative on
# machines with two or fewer CPUs; clamp so the worker count passed to
# Dataset.map is always at least 1.
num_proc = max(1, (os.cpu_count() or 1) - 2)

# Print each transcript followed by its complete token list.
for example in ds:
    print(example["text"])
    print(example["codes_list"])

# Drop rows whose tokenisation is missing or empty, in a single pass.
ds = ds.filter(lambda x: x["codes_list"] is not None and len(x["codes_list"]) > 0)

Vänsterpartiet står som ett av tre samarbetspartier bakom föreliggande betänkande vad gäller medelsfördelningen på olika anslag inom ramen för utgiftsområde 17. Det innebär inte att vi till 100 % är nöjda med allt. Det finns vissa saker, som vi i samband med budgetförhandlingarna valt att prioritera i våra inlägg och skrivelser till regeringen, som inte fått genomslag, medan andra förslag gått igenom.
[131479, 134033, 139086, 142149, 145152, 149263, 155111, 129724, 132639, 137205, 144195, 147276, 151053, 156558, 129724, 135625, 140142, 143219, 145271, 150939, 156689, 129843, 136074, 139074, 141571, 145395, 152339, 156485, 128714, 134659, 139699, 142566, 145990, 152262, 156061, 128761, 134391, 139987, 143033, 148743, 151967, 153176, 131536, 134085, 139711, 141494, 145918, 150231, 152966, 129156, 132629, 138641, 142039, 148651, 149296, 155961, 130313, 133137, 138819, 143708, 145369, 152286, 156382, 128842, 136447, 138612, 144336, 145894, 151020, 155149, 129144, 133081, 137572, 143725, 14

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

In [49]:
#@title Create Input Ids
def remove_duplicate_frames(example):
    """Drop 7-token frames that repeat the first code of the last kept frame.

    ``example["codes_list"]`` is read as consecutive frames of seven ids.
    The first frame is always kept. Each later frame is kept only if its
    first id differs from the first id of the most recently kept frame;
    the other six ids of a frame are not compared.

    Args:
        example: mapping with a ``"codes_list"`` list whose length is a
            multiple of 7.

    Returns:
        The same ``example`` mapping, with ``"codes_list"`` replaced by a new
        filtered list (the input list itself is not modified).

    Raises:
        ValueError: if the list length is not divisible by 7.
    """
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")

    # Slicing copies, so the caller's list is left untouched.
    result = vals[:7]

    for i in range(7, len(vals), 7):
        # result[-7] is the first id of the most recently kept frame.
        if vals[i] != result[-7]:
            result.extend(vals[i:i + 7])

    example["codes_list"] = result

    return example

# Remove repeated frames from every row, using num_proc worker processes.
ds = ds.map(remove_duplicate_frames, num_proc=num_proc)

# Informational note printed for the notebook reader; it has no effect on the data.
tok_info = '''*** HERE you can modify the text prompt
i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
f"{example["source"]}:  {example["text"]}", as is passed.
'''
print(tok_info)

def create_input_ids(example):
    """Assemble the training sequence for one row.

    The transcript is encoded with the module-level ``tokenizer`` and
    ``end_of_text`` is appended; the result is stored under ``"text_tokens"``.
    The final sequence is laid out as::

        start_of_human, <text tokens>, end_of_human,
        start_of_ai, start_of_speech, <audio codes>, end_of_speech, end_of_ai

    ``"input_ids"`` and ``"labels"`` both refer to that same list object, and
    ``"attention_mask"`` is a list of ones of equal length.

    Returns the same ``example`` mapping with the four keys added.
    """
    prompt_tokens = tokenizer.encode(example["text"], add_special_tokens=True)
    prompt_tokens.append(end_of_text)
    example["text_tokens"] = prompt_tokens

    # Human turn: the transcript wrapped in its delimiters.
    human_turn = [start_of_human] + example["text_tokens"] + [end_of_human]
    # AI turn: the audio codes wrapped in the AI and speech delimiters.
    ai_turn = (
        [start_of_ai, start_of_speech]
        + example["codes_list"]
        + [end_of_speech, end_of_ai]
    )
    sequence = human_turn + ai_turn

    example["input_ids"] = sequence
    example["labels"] = sequence
    example["attention_mask"] = [1] * len(sequence)
    return example

# Build the model inputs for every row; the raw text and audio codes are
# dropped once they have been folded into input_ids.
ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])

# Display the resulting dataset summary.
ds


Map (num_proc=8):   0%|          | 0/10 [00:00<?, ? examples/s]

*** HERE you can modify the text prompt
i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
f"{example["source"]}:  {example["text"]}", as is passed.



Map (num_proc=8):   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['filename', 'text_tokens', 'input_ids', 'labels', 'attention_mask'],
    num_rows: 10
})

In [53]:
# Restrict the dataset to the three columns named here; every other column
# currently present is dropped.
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [
    name
    for name in ds.column_names
    if name not in columns_to_keep
]

ds = ds.remove_columns(columns_to_remove)


In [54]:
# Upload the dataset to the Hub repository named by name_to_push_dataset_to.
# NOTE(review): presumably relies on Hub credentials configured outside this
# notebook — confirm before running on a fresh machine.
ds.push_to_hub(name_to_push_dataset_to)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  135kB /  135kB            

CommitInfo(commit_url='https://huggingface.co/datasets/cubbk/rixvox-tokenised/commit/6aacf5aa5aab29eb8097112600d3ab5ca598536b', commit_message='Upload dataset', commit_description='', oid='6aacf5aa5aab29eb8097112600d3ab5ca598536b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cubbk/rixvox-tokenised', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cubbk/rixvox-tokenised'), pr_revision=None, pr_num=None)