In [1]:
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

model_path = "Qwen/Qwen2.5-Omni-7B"
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

`torch_dtype` is deprecated! Use `dtype` instead!
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
Qwen2_5OmniToken2WavModel must inference with fp32, but flash_attention_2 only supports fp16 and bf16, attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [3]:
from qwen_omni_utils import process_mm_info

# @title inference function


def inference(audio_path):
    messages = [
        {"role": "system", "content": [
            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
        ], },
        {"role": "user", "content": [
            {"type": "audio", "audio": audio_path},
        ]
        },
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)
    audios, images, videos = process_mm_info(messages, use_audio_in_video=True)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=True)
    inputs = inputs.to(model.device).to(model.dtype)

    output = model.generate(
        **inputs,
        use_audio_in_video=True,
        return_audio=True)

    text = processor.batch_decode(
        output[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False)
    audio = output[1]
    return text, audio

In [2]:
import librosa

from io import BytesIO
from urllib.request import urlopen

from IPython.display import Audio

In [5]:
audio_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"

audio = librosa.load(BytesIO(urlopen(audio_path).read()), sr=16000)[0]
display(Audio(audio, rate=16000))

# Use a local HuggingFace model to inference.
response = inference(audio_path)
print(response[0][0])
display(Audio(response[1], rate=24000))

	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting `pad_token_id` to `eos_token_id`:8292 for open-end generation.


system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.
user

assistant
Well, I can't really guess your age and gender just from your voice. There are so many factors that can affect how a voice sounds, like the environment you're in, how you're feeling at the moment, and even the microphone you're using. But if you want to share more about your voice, like if it's high - pitched or low - pitched, that might give me a bit of an idea. So, what can you tell me about your voice?


In [6]:
audio_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"

audio = librosa.load(BytesIO(urlopen(audio_path).read()), sr=16000)[0]
display(Audio(audio, rate=16000))

# Use a local HuggingFace model to inference.
response = inference(audio_path)
print(response[0][0])
display(Audio(response[1], rate=24000))

	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting `pad_token_id` to `eos_token_id`:8292 for open-end generation.


system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.
user

assistant
每个人都想被欣赏，所以如果你欣赏某人，就别让它成为秘密。如果还有其他翻译相关的问题或者别的事，都可以跟我说哦。


# Image audio Query

In [3]:
from qwen_omni_utils import process_mm_info


def audio_image_inference(audio_path, image_path):
    messages = [
        {"role": "system", "content": [
            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
        ], },
        {"role": "user", "content": [
            {"type": "image", "image": image_path},
            {"type": "audio", "audio": audio_path},
        ]
        },
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)
    audios, images, videos = process_mm_info(messages, use_audio_in_video=True)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=True)
    inputs = inputs.to(model.device).to(model.dtype)

    output = model.generate(
        **inputs,
        use_audio_in_video=True,
        return_audio=True)

    text = processor.batch_decode(
        output[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False)
    audio = output[1]
    return text, audio

In [6]:
audio_path = "file:///home/ubuntu/wc_simd/demos/imagetalk/backend/how_many_horses.wav"
image_path = "file:///home/ubuntu/wc_simd/demos/imagetalk/backend/drawing_horses.jpg"

audio = librosa.load(BytesIO(urlopen(audio_path).read()), sr=16000)[0]
display(Audio(audio, rate=16000))

# Use a local HuggingFace model to inference.
response = audio_image_inference(audio_path, image_path)
print(response[0][0])
display(Audio(response[1], rate=24000))

Setting `pad_token_id` to `eos_token_id`:8292 for open-end generation.


system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.
user

assistant
I see three horses in the picture. What do you think about the drawing?
