In [8]:
from pathlib import Path

import torch

from data_util import iter_audio_samples
from model import (
    QwenWithCausalAudioEncoderConfig,
    QwenWithCausalAudioEncoderForConditionalGeneration,
    process_qwen_with_causal_audio_encoder_inputs,
)

In [9]:
# Directory that is scanned for .tar files (judging by the path, the English
# TTS training split of a webdataset export).
tts_data_path = Path("/mnt/efs/fs1/wbl/webdataset/webdataset/train/tts_en")

# Collect the candidates first so that an empty directory produces a clear
# error instead of max()'s generic "empty iterable" ValueError.
tar_files = [p for p in tts_data_path.iterdir() if p.is_file() and p.suffix == ".tar"]
if not tar_files:
    raise FileNotFoundError(f"No .tar files found in {tts_data_path}")

# Path objects are ordered, so max() selects the file whose path sorts last.
audio_data_file = max(tar_files)

In [10]:
from transformers.models.qwen3_omni_moe import Qwen3OmniMoeProcessor

# Checkpoint that provides both the model weights and the matching config.
MODEL_ID = "eewer/Qwen3WithMimi-30B-A3B"
# The whole model is placed on this single device.
MODEL_DEVICE = "cuda:4"

# Load the custom config explicitly and hand it to the model loader.
model_config = QwenWithCausalAudioEncoderConfig.from_pretrained(MODEL_ID)
model = QwenWithCausalAudioEncoderForConditionalGeneration.from_pretrained(
    MODEL_ID,
    config=model_config,
    dtype=torch.bfloat16,
    device_map=MODEL_DEVICE,
)

# The processor is taken from the Qwen3-Omni instruct repo rather than from
# the checkpoint above.
processor = Qwen3OmniMoeProcessor.from_pretrained("Qwen/Qwen3-Omni-30B-A3B-Instruct")  # type: ignore

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_interleaved', 'interleaved', 'mrope_section'}


Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

In [11]:
# Disabled alternative to the Qwen3 loading cell: uncommenting these lines
# loads the "eewer/Qwen2.5WithMimi-3B" checkpoint on "cuda:2" together with the
# Qwen2.5-Omni processor, rebinding the same `model` / `processor` names.
# from transformers.models.qwen2_5_omni import Qwen2_5OmniProcessor

# model = QwenWithCausalAudioEncoderForConditionalGeneration.from_pretrained(
#     "eewer/Qwen2.5WithMimi-3B",
#     config=QwenWithCausalAudioEncoderConfig.from_pretrained("eewer/Qwen2.5WithMimi-3B"),
#     dtype=torch.bfloat16,
#     device_map="cuda:2",
# )
# processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-3B")  # type: ignore

In [12]:
# Take the first two samples from the selected .tar file.
sample_iter = iter_audio_samples(audio_data_file)
samples = [next(sample_iter) for _ in range(2)]

# One conversation per sample: a system prompt followed by a user turn that
# asks for a punctuation-free transcription and carries an audio placeholder
# (`"audio": None`).
#
# The conversations are built with a comprehension rather than `[...] * n`:
# list multiplication would repeat references to the *same* inner list and
# dicts, so an in-place change to one conversation would silently show up in
# every other one. Here each sample gets its own independent objects.
conversations = [
    [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Transcribe the English audio into text without any punctuation marks."},
                {"type": "audio", "audio": None},
            ],
        },
    ]
    for _ in samples
]

In [None]:
# Gather the per-sample audio and sample rates, in the same order as
# `conversations`.
audio_batch = [s.audio for s in samples]
sample_rate_batch = [s.audio_sample_rate for s in samples]

# Turn conversations + audio into keyword arguments for generation.
inputs = process_qwen_with_causal_audio_encoder_inputs(
    model=model,
    processor=processor,
    conversation=conversations,
    audio=audio_batch,
    audio_sample_rate=sample_rate_batch,
    add_generation_prompt=True,
)

# Greedy generation, then decode every sequence back to text.
# NOTE(review): the recorded output of this cell shows the system/user/assistant
# prompt text inside each decoded string (and a trailing "user" line for the
# second sample), so `sequences` apparently includes the prompt tokens and may
# run past the end of the answer - confirm before comparing against `sample.text`.
sequences = model.generate_greedy(**inputs)
tokenizer = processor.tokenizer  # type: ignore
outputs = tokenizer.batch_decode(sequences, skip_special_tokens=True)

# Show the reference transcript next to the decoded model output.
for sample, output in zip(samples, outputs):
    report = f"<true_text>\n{sample.text}\n</true_text>\n<model_output>\n{output}\n</model_output>\n"
    print(report)

<true_text>
I can't say anything about any figure.
</true_text>
<model_output>
system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.
user
Transcribe the English audio into text without any punctuation marks.
assistant
I can't say anything about any figure.
</model_output>

<true_text>
A counter reaction was.
</true_text>
<model_output>
system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.
user
Transcribe the English audio into text without any punctuation marks.
assistant
A counter reaction was
user

</model_output>



: 