In [None]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): accelerate is not imported directly below; presumably it is
# required by the transformers Trainer — confirm before changing the pin.
%pip install datasets transformers 'accelerate>=0.26.0'

In [None]:
from datasets import load_dataset   
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import numpy as np
import torch

# Fine-tuning pipeline: configure the run, load tokenizer/model, carve a small
# train/eval split out of the dataset, and train with the Hugging Face Trainer.

# Dataset and base checkpoint (Hugging Face Hub identifiers)
dsn = "cubbk/rixvox-tokenised"
model_name = "canopylabs/orpheus-tts-0.1-pretrained"

# Training Args
epochs = 1
batch_size = 1
number_processes = 1  # not referenced by the visible cells; kept for compatibility
pad_token = 128263  # fallback pad id, applied only if the tokenizer defines none
save_steps = 5000
learning_rate = 5.0e-5
max_samples = 100  # number of rows taken from the head of the train split
seed = 42  # shared by the train/eval split and the Trainer

# Naming and paths
save_folder = "checkpoints"
project_name = "tuning-orpheus"  # not referenced by the visible cells
run_name = "5e5-0"

# Ensure bf16 only when supported
bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
dtype = torch.bfloat16 if bf16_supported else torch.float32
if not bf16_supported:
    print("bfloat16 not supported on this device; using float32.")

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = pad_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=dtype,
)

# Keep only the first `max_samples` rows. The bound is clamped to the split
# length so a split shorter than `max_samples` is used whole instead of
# raising on out-of-range indices; `select` is also available on older
# `datasets` releases.
full_ds = load_dataset(dsn, split="train")
raw_ds = full_ds.select(range(min(max_samples, len(full_ds))))
split = raw_ds.train_test_split(test_size=0.05, seed=seed)
train_ds, eval_ds = split["train"], split["test"]

training_args = TrainingArguments(
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    logging_steps=1,
    bf16=bf16_supported,
    output_dir=f"./{save_folder}",
    # Reporting is disabled explicitly: leaving `report_to` unset lets some
    # transformers versions auto-enable every installed integration (e.g. a
    # wandb login prompt), which blocks an unattended "Run All".
    # Switch to report_to="wandb" to log the run under `run_name`.
    report_to="none",
    run_name=run_name,
    save_steps=save_steps,
    remove_unused_columns=True,
    learning_rate=learning_rate,
    seed=seed,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds, # type: ignore
    eval_dataset=eval_ds, # type: ignore
)

trainer.train()


In [None]:
import math

# Evaluate on the dataset that was passed to the Trainer as `eval_dataset`.
eval_results = trainer.evaluate()

# Perplexity is exp(eval_loss). math.exp raises OverflowError for very large
# inputs, so report infinity instead of crashing the cell on a diverged run.
try:
    perplexity = math.exp(eval_results["eval_loss"])
except OverflowError:
    perplexity = float("inf")
print("Eval loss:", eval_results["eval_loss"])
print("Perplexity:", perplexity)

In [None]:
# Save final model & tokenizer
# Both are written to the same directory (`save_path`), which the later
# cells pass to `from_pretrained` to reload the model.
save_path = "./final-model"
# `model` is the same object that was handed to the Trainer above.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
# Smoke test: read the checkpoint back from `save_path` and generate a few
# tokens to confirm that the saved files load and run.
reloaded = AutoModelForCausalLM.from_pretrained(
    save_path,
    torch_dtype=dtype,
)
smoke_inputs = tokenizer("Test:", return_tensors="pt")
# The generated ids are discarded; only a successful call matters here.
_ = reloaded.generate(
    **smoke_inputs,
    max_new_tokens=5,
)
print("Reload OK.")

In [None]:
# Sample a short continuation from the fine-tuned model for a Swedish prompt.
prompt = "Hej, hur mår du idag?"
# Move the encoded prompt to whichever device the model currently lives on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
model.eval()
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=50,
        temperature=0.8,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Previously the sampled ids were computed and then dropped, so the cell showed
# nothing. Strip the prompt tokens and print only the newly generated part.
# NOTE(review): the checkpoint name suggests a TTS model, so the decoded text
# is presumably audio-code tokens rather than natural language — confirm.
prompt_len = inputs["input_ids"].shape[-1]
generated_ids = out[0, prompt_len:]
print(tokenizer.decode(generated_ids))

In [None]:
# NOTE(review): huggingface_hub is presumably already installed as a dependency
# of transformers/datasets, so this install may be redundant — confirm.
%pip install huggingface_hub
# Uploading needs Hub credentials; authenticate beforehand, e.g. by running
# `huggingface-cli login` in a terminal.
# Push the reloaded model (from the reload cell) and the tokenizer to the same
# Hub repository.
reloaded.push_to_hub("cubbk/orpheus-swedish")
tokenizer.push_to_hub("cubbk/orpheus-swedish")