In [1]:
import transformers
import time
import librosa
from hf_pipe.ultravox_model import UltravoxModel
from hf_pipe.ultravox_pipeline import UltravoxPipeline

# Build the Ultravox pipeline from the Hub checkpoint.
# NOTE(review): trust_remote_code=True runs Python code shipped with the model repo — only use
# it with a repository (ideally a pinned revision) you trust.
# NOTE(review): the device is hard-coded to "mps"; presumably this needs changing on machines
# without that backend — confirm before sharing the notebook.
pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0_3-llama-3_2-1b",
    trust_remote_code=True,
    device="mps",
)

ultravox-pipeline is already registered. Overwriting pipeline for task ultravox-pipeline...
Some weights of UltravoxModel were not initialized from the model checkpoint at fixie-ai/ultravox-v0_3-llama-3_2-1b and are newly initialized: ['language_model.lm_head.weight', 'language_model.model.embed_tokens.weight', 'language_model.model.layers.0.input_layernorm.weight', 'language_model.model.layers.0.mlp.down_proj.weight', 'language_model.model.layers.0.mlp.gate_proj.weight', 'language_model.model.layers.0.mlp.up_proj.weight', 'language_model.model.layers.0.post_attention_layernorm.weight', 'language_model.model.layers.0.self_attn.k_proj.weight', 'language_model.model.layers.0.self_attn.o_proj.weight', 'language_model.model.layers.0.self_attn.q_proj.weight', 'language_model.model.layers.0.self_attn.v_proj.weight', 'language_model.model.layers.1.input_layernorm.weight', 'language_model.model.layers.1.mlp.down_proj.weight', 'language_model.model.layers.1.mlp.gate_proj.weight', 'language_mode

In [1]:
import torch
from IPython.display import Audio
from ultravox.inference.ultravox_infer import UltravoxInference
from ultravox.data.data_sample import VoiceSample

# Create the inference wrapper from the local "./hf_pipe" directory, in float16 on "mps".
# conversation_mode=True is presumably what makes `ultravox.past_messages` (inspected in a
# later cell) accumulate turns across infer() calls — TODO confirm against UltravoxInference.
ultravox = UltravoxInference(
    model_path="./hf_pipe",
    conversation_mode=True,
    device="mps",
    data_type="float16"
)

UltravoxModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


UltravoxConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "./hf_pipe",
  "architectures": [
    "UltravoxModel"
  ],
  "audio_latency_block_size": 100,
  "audio_model_id": "openai/whisper-small",
  "audio_model_lora_config": {
    "lora_alpha": 8,
    "r": 0,
    "target_modules": [
      "k_proj",
      "q_proj",
      "linear_k",
      "linear_q"
    ]
  },
  "auto_map": {
    "AutoConfig": "ultravox_config.UltravoxConfig",
    "AutoModel": "ultravox_model.UltravoxModel"
  },
  "custom_pipelines": {
    "ultravox-pipeline": {
      "impl": "ultravox_pipeline.UltravoxPipeline",
      "pt": [
        "AutoModel"
      ],
      "tf": [],
      "type": "multimodal"
    }
  },
  "hidden_size": 4096,
  "ignore_index": -100,
  "initializer_range": 0.02,
  "model_type": "ultravox",
  "norm_init": 0.4,
  "pad_token_id": 128009,
  "projector_act": "swiglu",
  "stack_factor": 8,
  "text_model_id": "meta-llama/Llama-3.2-1B-Instruct",
  "text_model_lora_config": {
    "lora_alph

In [2]:
# First turn: build a voice sample from ./part1.wav whose prompt is only the "<|audio|>" marker.
sample = VoiceSample.from_prompt_and_file(
    path="./part1.wav",
    prompt="<|audio|>",
)

In [13]:
# Show the sample's message list.
# NOTE(review): this cell's execution count is higher than the inference cells', so the
# displayed content may reflect state after inference rather than the freshly built sample.
sample.messages

[{'role': 'user',
  'content': '<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|>'}]

In [22]:
# Run the processor by hand on the first message of the sample to inspect the tensors it
# produces (audio features, token ids, attention mask).
# The sampling rate is taken from the sample itself instead of a hard-coded 16000 so that it
# always describes the audio actually being passed in.
inputs = ultravox.processor(
    audio=sample.audio,
    text=sample.messages[0]["content"],
    return_tensors="pt",
    sampling_rate=sample.sample_rate,
)
inputs

{'audio_token_len': tensor([10]), 'audio_values': tensor([[[-0.8673, -0.8673, -0.8673,  ..., -0.1982, -0.2331, -0.1076],
         [-0.8673, -0.8673, -0.8673,  ...,  0.4952,  0.5072,  0.5260],
         [-0.8673, -0.8673, -0.8673,  ...,  0.6264,  0.6277,  0.6493],
         ...,
         [-0.8673, -0.8673, -0.8673,  ..., -0.8673, -0.8673, -0.8673],
         [-0.8673, -0.8673, -0.8673,  ..., -0.8673, -0.8673, -0.8673],
         [-0.8673, -0.8673, -0.8673,  ..., -0.8673, -0.8673, -0.8673]]]), 'audio_len': tensor([148]), 'input_ids': tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [3]:
# Render an audio player for the sample, using the sample's own sample rate.
Audio(sample.audio, rate=sample.sample_rate)

In [4]:
# Run inference on the current sample and keep the result for the display cells below.
voice_output = ultravox.infer(
    sample=sample,
)

inputs {'audio_token_len': tensor([10]), 'audio_values': tensor([[[-0.8673, -0.8673, -0.8673,  ..., -0.1982, -0.2331, -0.1076],
         [-0.8673, -0.8673, -0.8673,  ...,  0.4952,  0.5072,  0.5260],
         [-0.8673, -0.8673, -0.8673,  ...,  0.6264,  0.6277,  0.6493],
         ...,
         [-0.8673, -0.8673, -0.8673,  ..., -0.8673, -0.8673, -0.8673],
         [-0.8673, -0.8673, -0.8673,  ..., -0.8673, -0.8673, -0.8673],
         [-0.8673, -0.8673, -0.8673,  ..., -0.8673, -0.8673, -0.8673]]]), 'audio_len': tensor([148]), 'audio_token_start_idx': tensor([30]), 'input_ids': tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1313,   4723,    220,   2366,     19,    271, 128009, 128006,
            882, 128007,    271, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128006,  78191, 128007,    271]]), 'attention_m

From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


In [25]:
# Display the full result object (text plus input/output token counts).
voice_output

VoiceOutput(text="Hello Julian. It's nice to meet you. Is there something I can help you with or would you like to chat?", input_tokens=45, output_tokens=26)

In [31]:
# Print only the generated text of the last inference.
print(voice_output.text)

Hello Julian. It's nice to meet you. Is there something I can help you with or would you like to chat?


In [32]:
# Second turn: rebind `sample` to a new voice sample built from ./part2.wav (same
# "<|audio|>"-only prompt as the first turn).
sample = VoiceSample.from_prompt_and_file(
    path="./part2.wav",
    prompt="<|audio|>",
)

In [35]:
# Render an audio player for the second sample, using the sample's own sample rate.
Audio(sample.audio, rate=sample.sample_rate)

In [43]:
from pprint import pprint

# Pretty-print the message history stored on the inference wrapper.
# NOTE(review): this cell was executed after the second infer() call (see execution counts),
# which is why the recorded output already contains both turns.
pprint(ultravox.past_messages)

[{'content': '<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|>',
  'role': 'user'},
 {'content': "Hello Julian. It's nice to meet you. Is there something I can "
             'help you with or would you like to chat?',
  'role': 'assistant'},
 {'content': '<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|>',
  'role': 'user'},
 {'content': "Bonjour Julian. I'm an artificial intelligence language model, "
             "so I don't have a nationality, but I'm happy to chat with you in "
             "French. How's your day going so far?",
  'role': 'assistant'}]


In [33]:
# Run inference on the second sample with the same wrapper, rebinding `voice_output`.
voice_output = ultravox.infer(
    sample=sample,
)

In [34]:
# Print the generated text for the second turn.
print(voice_output.text)

Bonjour Julian. I'm an artificial intelligence language model, so I don't have a nationality, but I'm happy to chat with you in French. How's your day going so far?


In [24]:
from pathlib import Path

import numpy as np
import soundfile as sf
import torch
import torchaudio

# Placeholder set of file suffixes (with leading dot) that process_audio treats as raw audio
# input; any other suffix must be ".npy" or is rejected.
AUDIO_EXTENSIONS = {".wav", ".mp3"}

def load_model(checkpoint_path, device="cpu"):
    """Load a checkpoint into a (placeholder) model and return it in eval mode.

    Args:
        checkpoint_path: Path to a file readable by ``torch.load``. Either a plain
            state dict or a dict that wraps one under the ``"state_dict"`` key.
        device: Device used both as ``map_location`` for loading and as the target
            of ``model.to``. Defaults to ``"cpu"``.

    Returns:
        The ``torch.nn.Module`` with the state dict applied non-strictly, switched
        to eval mode and moved to ``device``.

    Notes:
        If any key starts with ``"generator."``, only those keys are kept and that
        leading prefix is removed. Only the *leading* prefix is stripped, so a key
        such as ``"generator.sub_generator.weight"`` becomes
        ``"sub_generator.weight"`` rather than being mangled.
    """
    print("Loading model...")
    model = torch.nn.Module()  # Replace with actual model instantiation
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary code when
    # weights_only is not enabled. Only load checkpoints from a trusted source, or pass
    # weights_only=True if the checkpoint contains nothing but tensors.
    state_dict = torch.load(checkpoint_path, map_location=device)

    # Unwrap checkpoints that nest the weights under a "state_dict" entry.
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]

    # Keep only the generator weights and drop their leading "generator." prefix.
    # The same predicate drives both the check and the filter, so the filter can never
    # silently produce an empty dict after the check has succeeded.
    prefix = "generator."
    if any(key.startswith(prefix) for key in state_dict):
        state_dict = {
            key[len(prefix):]: value
            for key, value in state_dict.items()
            if key.startswith(prefix)
        }

    # strict=False: missing/unexpected keys are tolerated rather than raising.
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    model.to(device)

    print(f"Loaded model with state dict keys: {list(state_dict.keys())[:5]}... (truncated)")
    return model


@torch.no_grad()
def process_audio(input_path, output_path, checkpoint_path, device="cpu"):
    """Reconstruct audio through the model, from a raw audio file or saved indices.

    The model is loaded with ``load_model`` on every call. Depending on the suffix of
    ``input_path`` (compared case-insensitively, so ``.WAV`` works like ``.wav``):

    * a suffix in ``AUDIO_EXTENSIONS``: the file is loaded with torchaudio, averaged
      across its first dimension when that has more than one entry, resampled to
      ``model.spec_transform.sample_rate``, encoded with ``model.encode``, and the
      resulting indices are also saved next to ``output_path`` with a ``.npy`` suffix;
    * ``.npy``: the indices are loaded directly and must be 2-dimensional.

    The indices are then decoded with ``model.decode`` and the first waveform of the
    result is written to ``output_path`` with soundfile.

    Args:
        input_path: Audio file or ``.npy`` file of indices (str or path-like).
        output_path: Destination audio file (str or path-like).
        checkpoint_path: Checkpoint passed to ``load_model``.
        device: Device for the model and tensors. Defaults to ``"cpu"``.

    Raises:
        ValueError: If the suffix of ``input_path`` is neither a known audio
            extension nor ``.npy``.
        AssertionError: If indices loaded from ``.npy`` are not 2-dimensional.
    """
    model = load_model(checkpoint_path, device=device)

    input_path = Path(input_path)
    output_path = Path(output_path)
    # Normalise case once so "clip.WAV" / "codes.NPY" are handled like their lowercase forms.
    suffix = input_path.suffix.lower()

    if suffix in AUDIO_EXTENSIONS:
        print(f"Processing in-place reconstruction of {input_path}")

        # Load audio
        audio, sr = torchaudio.load(str(input_path))
        # More than one entry along dim 0: collapse to one by averaging (keeps the dim).
        if audio.shape[0] > 1:
            audio = audio.mean(0, keepdim=True)
        audio = torchaudio.functional.resample(
            audio, sr, model.spec_transform.sample_rate
        )

        # Add a leading batch dimension before moving to the target device.
        audios = audio[None].to(device)
        print(f"Loaded audio with {audios.shape[2] / model.spec_transform.sample_rate:.2f} seconds")

        # VQ Encoder
        audio_lengths = torch.tensor([audios.shape[2]], device=device, dtype=torch.long)
        indices = model.encode(audios, audio_lengths)[0][0]

        print(f"Generated indices of shape {indices.shape}")

        # Save indices
        np.save(output_path.with_suffix(".npy"), indices.cpu().numpy())
    elif suffix == ".npy":
        print(f"Processing precomputed indices from {input_path}")
        indices = np.load(input_path)
        indices = torch.from_numpy(indices).to(device).long()
        assert indices.ndim == 2, f"Expected 2D indices, got {indices.ndim}"
    else:
        raise ValueError(f"Unknown input type: {input_path}")

    # Restore
    feature_lengths = torch.tensor([indices.shape[1]], device=device)
    fake_audios, _ = model.decode(
        indices=indices[None], feature_lengths=feature_lengths
    )
    audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate

    print(
        f"Generated audio of shape {fake_audios.shape}, equivalent to {audio_time:.2f} seconds "
        f"from {indices.shape[1]} features, features/second: {indices.shape[1] / audio_time:.2f}"
    )

    # Save audio
    fake_audio = fake_audios[0, 0].float().cpu().numpy()
    sf.write(output_path, fake_audio, model.spec_transform.sample_rate)
    print(f"Saved audio to {output_path}")

# Example usage in a notebook cell
# You can replace the paths and checkpoint with actual files
checkpoint_path = "path/to/checkpoint.pth"
input_audio = "path/to/input.wav"
output_audio = "path/to/output.wav"
device = "cpu"

# Run the function only once real files are configured, so the placeholder paths above do
# not make a full "Run All" stop with a file-not-found error.
if Path(checkpoint_path).is_file() and Path(input_audio).is_file():
    process_audio(input_audio, output_audio, checkpoint_path, device)
else:
    print(
        "Skipping process_audio: set checkpoint_path and input_audio to existing files first."
    )


VoiceOutput(text="This appears to be a description of Apple's M1 chip, a system-on-chip (SoC) designed for Apple's Mac computers. The M1 chip is a custom-designed processor that was announced by Apple in 2020 and released in 2020.\n\nThe M1 chip is designed to provide a more efficient and flexible way of running machine learning (ML) models on Apple's Mac computers. It is based on a 5-nanometer process node and features a range of innovative technologies, including:\n\n* Neural Engine: a dedicated neural processing unit (NPU) designed to accelerate ML workloads\n* Apple Silicon: a custom-designed processor architecture that provides improved performance and power efficiency\n* Integrated Graphics: a dedicated graphics processing unit (GPU) that provides improved graphics performance\n\nThe M1 chip is designed to provide a more flexible and efficient way of running machine learning models on Apple's Mac computers, allowing developers to create more complex and powerful ML models without

In [25]:
# FIXME: this cell held an incomplete statement (`from fish.tools.llama.generate` with no
# `import` clause), which is a SyntaxError. Name the symbols needed from that module before
# re-enabling it.

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ValueError: The checkpoint you are trying to load has model type `dual_ar` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

In [16]:
# Second inference wrapper on the same local "./hf_pipe" directory, this time created without
# conversation_mode or data_type, and run on whatever `sample` currently holds.
local_inference = UltravoxInference(
    model_path="./hf_pipe",
    device="mps",
)

voice_output = local_inference.infer(
    sample=sample,
)
voice_output

UltravoxConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "./hf_pipe",
  "architectures": [
    "UltravoxModel"
  ],
  "audio_latency_block_size": 100,
  "audio_model_id": "openai/whisper-small",
  "audio_model_lora_config": {
    "lora_alpha": 8,
    "r": 0,
    "target_modules": [
      "k_proj",
      "q_proj",
      "linear_k",
      "linear_q"
    ]
  },
  "auto_map": {
    "AutoConfig": "ultravox_config.UltravoxConfig",
    "AutoModel": "ultravox_model.UltravoxModel"
  },
  "custom_pipelines": {
    "ultravox-pipeline": {
      "impl": "ultravox_pipeline.UltravoxPipeline",
      "pt": [
        "AutoModel"
      ],
      "tf": [],
      "type": "multimodal"
    }
  },
  "hidden_size": 4096,
  "ignore_index": -100,
  "initializer_range": 0.02,
  "model_type": "ultravox",
  "norm_init": 0.4,
  "pad_token_id": 128009,
  "projector_act": "swiglu",
  "stack_factor": 8,
  "text_model_id": "meta-llama/Llama-3.2-1B-Instruct",
  "text_model_lora_config": {
    "lora_alph

VoiceOutput(text="There is no text provided for me to summarize or interpret. Please provide the text you'd like me to work with, and I'll do my best to assist you.", input_tokens=40, output_tokens=35)

In [None]:
# Single pipeline call capped at 30 new tokens.
# NOTE(review): `audio`, `turns` and `sr` are first assigned in a later cell of this notebook,
# so this cell depends on out-of-order execution and will not run on a fresh kernel.
x = pipe({"audio": audio, "turns": turns, "sampling_rate": sr}, max_new_tokens=30)

In [None]:
# FIXME: this cell held an incomplete statement (`from ultravox.inference.infer import` with
# nothing after `import`), which is a SyntaxError. Name the symbols needed from that module
# before re-enabling it.

In [None]:
import transformers
import time
import librosa

from hf_pipe.ultravox_pipeline import UltravoxPipeline

# Instantiate the pipeline class directly rather than through transformers.pipeline(...).
# This rebinds the `pipe` name created in the first cell.
# NOTE(review): `model` is a Hub id string here; confirm UltravoxPipeline accepts an id rather
# than an already loaded model object.
pipe = UltravoxPipeline(
    model="fixie-ai/ultravox-v0_3-llama-3_2-1b",
    trust_remote_code=True,
    device="mps",
)

# Load the clip with librosa, requesting a 16 kHz sample rate; `sr` is the rate returned.
path = "./sample.wav"  # TODO: pass the audio here
audio, sr = librosa.load(path, sr=16000)


# Conversation context handed to the pipeline: a single system turn.
turns = [
    {
        "role": "system",
        "content": "You are a friendly and helpful character. You love to answer questions for people.",
    },
]
# Benchmark: average latency of the pipeline over N_RUNS identical calls.
N_RUNS = 10  # single source of truth for both the loop and the average below

# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() can jump if the system clock is adjusted while the loop runs.
start_time = time.perf_counter()

for run_idx in range(N_RUNS):
    # Benchmark the pipeline
    x = pipe({"audio": audio, "turns": turns, "sampling_rate": sr}, max_new_tokens=30)
end_time = time.perf_counter()

# `x` is the output of the last run only.
print(x)
print(f"Execution Time: {(end_time - start_time) / N_RUNS:.2f} seconds")


In [None]:
from ultravox.inference.ultravox_infer import UltravoxInference

# Inference wrapper built from the Hub model id (the earlier cells used the local "./hf_pipe").
inference = UltravoxInference(model_path="fixie-ai/ultravox-v0_3-llama-3_2-1b", device="mps")


In [None]:

# Text-only check: build a sample from a plain prompt and run it through the wrapper.
# `VoiceSample` is the class imported from ultravox.data.data_sample earlier in this notebook;
# the `datasets` name this cell referenced before is never imported anywhere in the notebook.
sample = VoiceSample.from_prompt("Hello?")
inference.infer(sample=sample)