In [None]:
!pip install librosa
!pip install -U transformers
!pip install -U accelerate
!pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
from io import BytesIO
from urllib.request import urlopen
import librosa
import accelerate

from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

In [None]:
# Hugging Face checkpoint shared by the processor and the model.
MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"

# The processor bundles the tokenizer/chat template and the audio feature extractor.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# device_map="auto" delegates weight placement to accelerate.
model = Qwen2AudioForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto")
model.eval()  # inference mode; eval() returns the module itself, so `model` is unchanged

In [None]:
# Transcribe a single WAV file with the loaded Qwen2-Audio model.
audio_path = "../asr_bundestag_clean/wavs/7534776_0_218_18.wav"
# Resample while loading to the rate the processor's feature extractor is configured for.
audio, sr = librosa.load(audio_path, sr=processor.feature_extractor.sampling_rate)

instruction = "Transkribiere das Audio:"

# Single user turn: the text instruction plus an audio placeholder. The actual
# waveform is handed to the processor below, so no URL is needed here.
conversation = [
    {"role": "user", "content": [
        {"type": "text", "text": instruction},
        {"type": "audio", "audio_url": None}
    ]}
]

# Build the prompt for the model. add_generation_prompt=True appends the
# assistant-turn header; without it the prompt ends after the user turn and the
# model is not cued to produce an answer.
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

# --- Build the input tensors ---
inputs = processor(
    text=text_prompt,
    audios=[audio],
    return_tensors="pt",
    padding=True
)
inputs = inputs.to(model.device)

# --- Generate the answer ---
generate_ids = model.generate(**inputs, max_new_tokens=256)
# generate() returns the prompt tokens followed by the continuation; keep only
# the newly generated part so the decoded text does not echo the instruction.
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]

print("Antwort:", response)