In [1]:
from pathlib import Path

from transformers.models.qwen2_5_omni import Qwen2_5OmniProcessor

from data_util import iter_audio_samples
from model import QwenOmniWithMimiForConditionalGeneration

In [2]:
# Absolute locations of the TTS webdataset shards and the extracted audio features.
tts_data_path = Path("/") / "mnt/efs/fs1/wbl/webdataset/webdataset/train/tts_en"
embedding_data_path = Path("/") / "mnt/efs/fs1/extracted_audio_features/"

# Every regular ".tar" file directly inside the TTS directory, in sorted path order.
data_files = sorted(
    entry
    for entry in tts_data_path.iterdir()
    if entry.is_file() and entry.suffix == ".tar"
)

In [3]:
# Build the model from the Qwen2.5-Omni-3B checkpoint together with the adaptor
# config and saved adaptor weights, then place it on the second CUDA device.
model = QwenOmniWithMimiForConditionalGeneration(
    adaptor_state_dict_path="saves/adaptor_lag_2.pt",
    adaptor_config_path="configs/adaptor_config.json",
    text_model_name_or_path="Qwen/Qwen2.5-Omni-3B",
)
model = model.to("cuda:1")  # type: ignore

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Processor for the same checkpoint name that the model cell passes as
# `text_model_name_or_path`; used below to render the chat template.
processor = Qwen2_5OmniProcessor.from_pretrained(  # type: ignore
    "Qwen/Qwen2.5-Omni-3B"
)

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [7]:
# Take the first sample yielded from the last archive in the sorted file list.
sample = next(iter_audio_samples(data_files[-1]))

# System turn: a single text segment carrying the system prompt.
system_turn = {
    "role": "system",
    "content": [
        {
            "type": "text",
            "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
        }
    ],
}

# User turn: the transcription instruction followed by an audio entry. The audio
# value is None here; the actual waveform is passed separately at generation time.
user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Transcribe the English audio into text without any punctuation marks."},
        {"type": "audio", "audio": None},
    ],
}

conversation = [system_turn, user_turn]

# Render the prompt as a plain string (no tokenization), ending with the
# generation prompt for the assistant turn.
text = processor.apply_chat_template(  # type: ignore
    conversation,
    tokenize=False,
    add_generation_prompt=True,
)

# Show the rendered prompt, a 100-character separator, and the sample's reference text.
print(text, "=" * 100, sample.text, sep="\n")

<|im_start|>system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.<|im_end|>
<|im_start|>user
Transcribe the English audio into text without any punctuation marks.<|audio_bos|><|AUDIO|><|audio_eos|><|im_end|>
<|im_start|>assistant

I can't say anything about any figure.


In [8]:
# Greedy generation from the rendered prompt plus the sample's audio,
# requesting the result as text, then print it.
output_text = model.generate_greedy(
    text,
    audio=sample.audio,
    audio_sample_rate=sample.audio_sample_rate,
    return_text=True,
)
print(output_text)

system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.
user
Transcribe the English audio into text without any punctuation marks.
assistant
I can't say anything about any figure.
