In [1]:
%pip install datasets transformers 'accelerate>=0.26.0'

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting accelerate>=0.26.0
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Co

In [1]:
from datasets import load_dataset   
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import numpy as np
import torch

# ---------------------------------------------------------------------------
# Fine-tuning run: load the pretrained checkpoint and the tokenised dataset
# from the Hugging Face Hub, hold out 5% of the rows for evaluation, and
# train for `epochs` epochs with the Hugging Face Trainer.
# ---------------------------------------------------------------------------

# Hub identifiers for the dataset and the base checkpoint.
dsn = "cubbk/zac_sample-dataset-tokenised"
model_name = "canopylabs/orpheus-tts-0.1-pretrained"

# Training Args
epochs = 1
batch_size = 1
number_processes = 1  # not referenced in this cell
pad_token = 128263  # fallback pad id, applied only if the tokenizer defines none
save_steps = 5000
learning_rate = 5.0e-5

# Naming and paths
save_folder = "checkpoints"
project_name = "tuning-orpheus"  # not referenced in this cell
run_name = "5e5-0"  # not referenced in this cell

# Ensure bf16 only when supported
bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
dtype = torch.bfloat16 if bf16_supported else torch.float32
if not bf16_supported:
    print("bfloat16 not supported on this device; using float32.")

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = pad_token

# `dtype=` replaces the deprecated `torch_dtype=` keyword; the transformers
# release this notebook runs against logs a deprecation warning for the old
# spelling.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=dtype,
)

# Fixed seed keeps the train/eval partition identical between runs.
raw_ds = load_dataset(dsn, split="train")
split = raw_ds.train_test_split(test_size=0.05, seed=42)
train_ds, eval_ds = split["train"], split["test"]

training_args = TrainingArguments(
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    logging_steps=1,
    bf16=bf16_supported,
    output_dir=f"./{save_folder}",
    # report_to="wandb",
    save_steps=save_steps,
    remove_unused_columns=True,
    learning_rate=learning_rate,
)

# The eval split is attached here so a later `trainer.evaluate()` call can
# run without arguments.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds, # type: ignore
    eval_dataset=eval_ds, # type: ignore
)

# Kept as the final expression so the notebook displays the returned
# TrainOutput.
trainer.train()


tokenizer_config.json:   0%|          | 0.00/5.41M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.32G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/344 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/96.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20 [00:00<?, ? examples/s]

Step,Training Loss
1,4.9225
2,5.1167
3,5.0632
4,4.7545
5,4.742
6,4.5981
7,4.71
8,4.8189
9,4.7815
10,4.7565


TrainOutput(global_step=19, training_loss=4.779965551275956, metrics={'train_runtime': 36.7189, 'train_samples_per_second': 0.517, 'train_steps_per_second': 0.517, 'total_flos': 174198588825600.0, 'train_loss': 4.779965551275956, 'epoch': 1.0})

In [2]:
import math

# Evaluate on the eval dataset the Trainer was constructed with, then report
# the loss together with its exponential (perplexity).
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)

print("Eval loss:", eval_loss)
print("Perplexity:", perplexity)

Eval loss: 4.690624237060547
Perplexity: 108.921151209337


In [3]:
# Write the trained weights and the tokenizer files into one directory so it
# can be reloaded with `from_pretrained(save_path)`.
save_path = "./final-model"
model.save_pretrained(save_directory=save_path)
# Left as the last expression so the cell displays the written tokenizer files.
tokenizer.save_pretrained(save_directory=save_path)

('./final-model/tokenizer_config.json',
 './final-model/special_tokens_map.json',
 './final-model/chat_template.jinja',
 './final-model/tokenizer.json')

In [4]:
# Smoke test: reload the checkpoint written to `save_path` and generate a few
# tokens to confirm the saved files are loadable and usable.
# `dtype=` replaces the deprecated `torch_dtype=` keyword.
reloaded = AutoModelForCausalLM.from_pretrained(save_path, dtype=dtype)

# Put the inputs on whichever device the reloaded model lives on.
smoke_inputs = tokenizer("Test:", return_tensors="pt").to(reloaded.device)

# Passing pad_token_id explicitly avoids generate() silently falling back to
# eos_token_id (and logging a warning about it).
smoke_out = reloaded.generate(
    **smoke_inputs,
    max_new_tokens=5,
    pad_token_id=tokenizer.pad_token_id,
)
print("Reload OK.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Reload OK.


In [6]:
# Sample a continuation from the fine-tuned model for a fixed text prompt.
prompt = "Hey, how are you doing today?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Switch to inference mode and skip gradient tracking while generating.
model.eval()
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=50,
        temperature=0.8,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Show the generated token ids; without this trailing expression the result
# is computed and then discarded with nothing displayed.
out

tensor([[128000,  19182,     11,   1268,    527,    499,   3815,   3432,     30,
         128009]], device='cuda:0')


In [7]:
# Publish the reloaded model and the tokenizer to the Hugging Face Hub.
# `huggingface_hub` is already installed as a dependency of `transformers`,
# so no extra install step is needed here.
# If this environment is not yet authenticated, run `huggingface-cli login`
# in a terminal first.
hub_repo_id = "cubbk/orpheus-swedish"
reloaded.push_to_hub(hub_repo_id)
# Left as the last expression so the cell displays the resulting CommitInfo.
tokenizer.push_to_hub(hub_repo_id)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.61G [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cubbk/orpheus-swedish/commit/5121475167c2f3bb37e34448a302846b826d8e77', commit_message='Upload tokenizer', commit_description='', oid='5121475167c2f3bb37e34448a302846b826d8e77', pr_url=None, repo_url=RepoUrl('https://huggingface.co/cubbk/orpheus-swedish', endpoint='https://huggingface.co', repo_type='model', repo_id='cubbk/orpheus-swedish'), pr_revision=None, pr_num=None)