In [None]:
!pip install -q --upgrade bitsandbytes accelerate

In [None]:
import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [None]:
# Hugging Face model id passed to both the tokenizer and the causal-LM loader
# further down in this notebook.
Qwen = "Qwen/Qwen3-4B-Instruct-2507"

In [None]:
# Mount Google Drive under /content/drive so files stored in MyDrive are
# reachable from the Colab filesystem.
drive.mount("/content/drive")
# Path of the audio recording that the transcription cells below read.
audio_filename = "/content/drive/MyDrive/llms/denver_extract.mp3"

## Option 1: Use Open Source for Transcription - Hugging Face Pipelines

In [None]:
# Authenticate with the Hugging Face Hub using the token stored in Colab's
# secrets under the name 'HuggingFace'; the token is also saved as a git
# credential.
hf_token = userdata.get('HuggingFace')
login(token=hf_token, add_to_git_credential=True)

# Binary read handle on the recording, kept open at notebook level.
audio_file = open(audio_filename, mode="rb")

In [None]:
from transformers import pipeline

# Speech-recognition pipeline backed by the English-only Whisper "medium"
# checkpoint, run in float16 on the GPU with timestamps requested.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-medium.en",
    device='cuda',
    dtype=torch.float16,
    return_timestamps=True,
)

# Transcribe straight from the file path and keep only the plain text.
result = pipe(audio_filename)
transcription = result["text"]
print(transcription)

In [None]:
# Keep the Hugging Face pipeline transcript under its own name: the OpenAI
# cell below reassigns `transcription`.
open_source_transcription = transcription

## Option 2: Use OpenAI for Transcription

In [None]:
# Name of the OpenAI model used for speech-to-text.
AUDIO_MODEL = "gpt-4o-mini-transcribe"

# Build the OpenAI client from the API key stored in Colab's secrets.
openai_api_key = userdata.get('OPENAI_API_KEY')
openai = OpenAI(api_key=openai_api_key)

# Open the recording inside this cell so the upload always starts at byte 0 and
# the handle is closed afterwards. Reusing a handle opened in another cell
# leaves it at end-of-file after the first request, so re-running this cell
# would send no audio.
with open(audio_filename, "rb") as audio_stream:
    transcription = openai.audio.transcriptions.create(
        model=AUDIO_MODEL,
        file=audio_stream,
        response_format="text",
    )
print(transcription)

In [None]:
# Render both transcripts one after the other for a visual comparison.
# When the notebook is run top to bottom, `transcription` is the OpenAI result
# at this point and `open_source_transcription` is the Whisper pipeline result.
open_source_view = Markdown(open_source_transcription)
latest_view = Markdown(transcription)

display(open_source_view)
print("\n\n")
display(latest_view)

# STEP 2: Analyze & Report

In [None]:
# System prompt: fixes the assistant's role and the expected output format.
system_message = """
You produce minutes of meetings from transcripts, with summary, key discussion points,
takeaways and action items with owners, in markdown format without code blocks.
"""

# User prompt: the concrete request, with the current value of `transcription`
# interpolated at the end.
user_prompt = f"""
Below is an extract transcript of a Denver council meeting.
Please write minutes in markdown without code blocks, including:
- a summary with attendees, location and date
- discussion points
- takeaways
- action items with owners

Transcription:
{transcription}
"""

# Chat-format conversation: one system turn followed by one user turn.
messages = [
    dict(role="system", content=system_message),
    dict(role="user", content=user_prompt),
]

In [None]:
# bitsandbytes settings for loading the model: 4-bit weights using the "nf4"
# quantization type with double quantization enabled, and bfloat16 as the
# compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the tokenizer; fall back to the EOS token for padding if the checkpoint
# does not define a pad token.
tokenizer = AutoTokenizer.from_pretrained(Qwen)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the quantized model first so the prompt tensors can be placed on the
# device that `device_map="auto"` actually selected, instead of guessing.
model = AutoModelForCausalLM.from_pretrained(Qwen, device_map="auto", quantization_config=quant_config)
device = model.device

# Render the conversation with the chat template and append the generation
# marker. Asking for a dict explicitly means the result always carries both
# `input_ids` and `attention_mask` as PyTorch tensors, so no type or shape
# normalization of the return value is needed.
enc = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(device)
input_ids = enc["input_ids"]
attention_mask = enc["attention_mask"]

# Stream tokens to the cell output as they are produced, hiding the prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True)
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=512,
    streamer=streamer,
)

# The generated sequence starts with the prompt tokens; slice them off so
# `response` holds only the model's minutes. It is not printed again here
# because the streamer has already written the completion to the output.
prompt_length = input_ids.shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


In [None]:
# `outputs[0]` contains the prompt tokens followed by the newly generated ones.
# Decode only the part after the prompt, and drop special tokens, so the
# rendered minutes do not include the system/user prompt, the transcript, or
# chat-template markers.
response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

In [None]:
# Render the decoded model output as formatted Markdown in the cell output.
minutes_view = Markdown(response)
display(minutes_view)