# **Installation**

In [1]:
%%capture
import os, re

if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install --no-deps trl
!pip install omegaconf einx torchcodec "datasets>=3.4.1,<4.0.0"
!pip install transformers==4.56.2
!pip install git+https://github.com/ysharma3501/FastBiCodec.git onnxruntime-gpu git+https://github.com/ysharma3501/FlashSR.git
!pip install accelerate==1.8.1

# **Load model and settings**

In [2]:
import os
os.environ['UNSLOTH_FORCE_FLOAT32'] = '1'
!export ACCELERATE_MIXED_PRECISION=no

from unsloth import FastModel
import torch
from huggingface_hub import snapshot_download

max_seq_length = 30 * 50 ## is 30 seconds of audio, increase if your audio is longer

# Download model and code
model_path = snapshot_download("YatharthS/MiraTTS")

model, tokenizer = FastModel.from_pretrained(
    model_name = f"YatharthS/MiraTTS",
    max_seq_length = max_seq_length,
    dtype = torch.float32, # Only works in float32 or bfloat16, NaNs in fp16
    full_finetuning = True,
    load_in_4bit = False,
    torch_dtype='float32',
    #float32_mixed_precision=True
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


.gitattributes: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

decoders/detokenizer.onnx:   0%|          | 0.00/193M [00:00<?, ?B/s]

decoders/detokenizer.safetensors:   0%|          | 0.00/105M [00:00<?, ?B/s]

decoders/processer.onnx:   0%|          | 0.00/87.5M [00:00<?, ?B/s]

decoders/q_encoder.onnx:   0%|          | 0.00/122M [00:00<?, ?B/s]

decoders/s_encoder.onnx:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

decoders/upsampler.pth:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

example1.wav:   0%|          | 0.00/541k [00:00<?, ?B/s]

example2.wav:   0%|          | 0.00/943k [00:00<?, ?B/s]

example3.wav:   0%|          | 0.00/879k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

==((====))==  Unsloth 2026.1.3: Fast Qwen2 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.


`torch_dtype` is deprecated! Use `dtype` instead!


# **Load dataset and audio codec**

In [3]:
from datasets import load_dataset
split = 'test' ## change this according to dataset, could be train or something else
text_column = 'text' ## change this according to dataset, could be reference_text or something else
audio_column = 'audio' ## change this according to dataset, could be audio or something else

dataset = load_dataset("WpythonW/elevenlabs_multilingual_v2-technical-speech", split = split)
dataset = dataset.rename_columns({text_column: "text", audio_column: "audio"})

from ncodec.codec import TTSCodec
import librosa
from ncodec.encoder.model import audio_volume_normalize
import numpy as np
tts_codec = TTSCodec()

@torch.inference_mode()
def encode(audio, encode_semantic=True, duration=8):

        """encodes audio file into speech tokens and context tokens"""
        self = tts_codec.audio_encoder
        #audio, sr = librosa.load(audio, duration=duration, sr=16000)

        audio = audio_volume_normalize(audio)

        ref_clip = self.get_ref_clip(audio)
        wav_ref = torch.from_numpy(ref_clip).unsqueeze(0).float()

        mel = self.mel_transformer(wav_ref).squeeze(1)
        new_arr = np.array(mel.transpose(1, 2).cpu())

        global_tokens = self.s_encoder.run(["global_tokens"], {"mel_spectrogram": new_arr})
        context_tokens = "".join([f"<|context_token_{i}|>" for i in global_tokens[0].squeeze()])
        if encode_semantic:
            feat = self.extract_wav2vec2_features(audio)
            speech_tokens = self.q_encoder.run(["semantic_tokens"], {"features": feat.cpu().detach().numpy()})
            speech_tokens = "".join([f"<|speech_token_{i}|>" for i in speech_tokens[0][0]])
            return speech_tokens, context_tokens
        else:
            return context_tokens

tts_codec.audio_encoder.encode = encode
tts_codec.audio_encoder.feature_extractor.config.output_hidden_states = True

README.md: 0.00B [00:00, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/24 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

# **Process audio dataset**

In [4]:
from datasets import load_dataset, Audio

num_samples = 20 ## change this to how many samples you want to train!!
small_dataset = dataset.select(range(num_samples))
small_dataset = small_dataset.cast_column("audio", Audio(sampling_rate=16_000))

example = small_dataset[0]["audio"]
print(example["sampling_rate"])

def process_wavs(example):
  audio_array = example["audio"]["array"]
  sr = example["audio"]["sampling_rate"]

  #audio_array = librosa.resample(y, orig_sr=sr_orig, target_sr=sr_target)
  text = example['text']
  print(audio_array.shape)
  semantic_tokens, global_tokens = tts_codec.audio_encoder.encode(audio_array, True, duration=30.0)
  prompt = f"<|task_tts|><|start_text|>{text}<|end_text|><|context_audio_start|>{global_tokens}<|context_audio_end|><|prompt_speech_start|>{semantic_tokens}"
  return {'text': prompt}

small_dataset = small_dataset.map(process_wavs, remove_columns=["audio"])


16000




Map:   0%|          | 0/20 [00:00<?, ? examples/s]

(214414,)
(231550,)
(204800,)
(196441,)
(219429,)
(226116,)
(177215,)
(210652,)
(216503,)
(224027,)
(206054,)
(238237,)
(222355,)
(215667,)
(211488,)
(233640,)
(220265,)
(219429,)
(214832,)
(210652,)


# **Define SFT trainer and check vram**

In [5]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = False, # We're doing full float32 s disable mixed precision
        bf16 = False, # We're doing full float32 s disable mixed precision
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/24 [00:00<?, ? examples/s]

🦥 Unsloth: Padding-free auto-enabled, enabling faster training.
GPU = Tesla T4. Max memory = 14.741 GB.
2.826 GB of memory reserved.


# **Train the model**

In [6]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 24 | Num Epochs = 20 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 506,634,112 of 506,634,112 (100.00% trained)


Step,Training Loss
1,24.7054
2,24.685
3,18.9832
4,24.6159
5,12.079
6,11.0622
7,9.8022
8,9.0497
9,9.1442
10,8.0759


# **Test out the finetuned model**

In [7]:
import librosa

device = 'cuda:0'
def infer(text, audio_file, top_k=50, top_p=1.0, temperature=0.8, repetition_penalty=1.2, max_new_audio_tokens=1024):

    audio_file, sr = librosa.load(audio_file, sr=16000)
    context_tokens = tts_codec.encode(audio_file)
    formatted_prompt = tts_codec.format_prompt(text, context_tokens, None)

    model_inputs = tokenizer([formatted_prompt], return_tensors="pt").to(device)

    print("Generating token sequence...")
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_audio_tokens, # Limit generation length
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        eos_token_id=tokenizer.eos_token_id, # Stop token
        pad_token_id=tokenizer.pad_token_id # Use models pad token id

    )
    print("Token sequence generated.")


    generated_ids_trimmed = generated_ids[:, model_inputs.input_ids.shape[1]:]
    predicts_text = tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=False)[0]
    audio = tts_codec.decode(predicts_text, context_tokens)
    return audio

!wget https://huggingface.co/YatharthS/MiraTTS/resolve/main/example2.wav
audio = infer("Wow, that's really cool! What is that?", "example2.wav", 50, 1.0)

from IPython.display import Audio
display(Audio(audio, rate=48000))


--2026-01-15 19:16:22--  https://huggingface.co/YatharthS/MiraTTS/resolve/main/example2.wav
Resolving huggingface.co (huggingface.co)... 3.168.132.91, 3.168.132.126, 3.168.132.38, ...
Connecting to huggingface.co (huggingface.co)|3.168.132.91|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://us.gcp.cdn.hf.co/xet-bridge-us/694331b1f70011224d04e18b/10c70f1550f4db2616d9b591494ea18489df777be4b037c435fd92dcd8764e90?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27example2.wav%3B+filename%3D%22example2.wav%22%3B&response-content-type=audio%2Fwave&Expires=1768508182&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiRXBvY2hUaW1lIjoxNzY4NTA4MTgyfX0sIlJlc291cmNlIjoiaHR0cHM6Ly91cy5nY3AuY2RuLmhmLmNvL3hldC1icmlkZ2UtdXMvNjk0MzMxYjFmNzAwMTEyMjRkMDRlMThiLzEwYzcwZjE1NTBmNGRiMjYxNmQ5YjU5MTQ5NGVhMTg0ODlkZjc3N2JlNGIwMzdjNDM1ZmQ5MmRjZDg3NjRlOTBcXD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=V%7EXi

# **Upload finetuned model to the hub**

In [None]:
from huggingface_hub import login
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import snapshot_download, create_repo, upload_folder

model_repo = "username/my-awesome-finetuned-model" ## change this to your username and place you want to upload
token = "" ## change this to your write token

login(token)


model.push_to_hub(model_repo)
tokenizer.push_to_hub(model_repo)

# **Example code to use optimized library for finetuned model**

In [None]:
## will not work in T4 gpu, please install the library and then run it on any gpu 30xx or above that supports bfloat16

from mira.model import MiraTTS
from IPython.display import Audio
mira_tts = MiraTTS(model_repo) ## downloads model from huggingface

file = "example2.wav" ## can be mp3/wav/ogg or anything that librosa supports
text = "Alright, so have you ever heard of a little thing named text to speech? Well, it allows you to convert text into speech! I know, that's super cool, isn't it?"

context_tokens = mira_tts.encode_audio(file)
audio = mira_tts.generate(text, context_tokens)

Audio(audio, rate=48000)