Meeting minutes creator in Gradio, from day 5 of week 3.
A couple of points to note:


*   My access to llama hasn't been approved on Hugging Face and so I've experimented with some of the other models.
*   There is a fair bit of debugging code in the main function, as I was getting an error and couldn't find its cause.  I've left it in, in case it's useful for others trying to debug their code.
*   I was debugging with the help of Claude.  It suggested wrapping the generation call in `with torch.no_grad():` when producing the minutes.  The rationale is that this disables gradient computation, which isn't needed for inference, and I found it did speed things up.



In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2

In [None]:
import os
import requests
from openai import OpenAI
from IPython.display import Markdown, display, update_display
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gradio as gr

In [None]:
# Credentials are read from the Colab secret store (google.colab.userdata)

# OpenAI: client used for the audio transcription request
openai_api_key = userdata.get("OPENAI_API_KEY")
openai = OpenAI(api_key=openai_api_key)

# Hugging Face: log in with the stored token
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Model identifiers used in this notebook

AUDIO_MODEL = "gpt-4o-transcribe"  # OpenAI model used for transcription
OPENAI_MODEL = "gpt-4o-mini"
QWEN2_MODEL = "Qwen/Qwen2.5-7B-Instruct"  # slow whatever the GPU size, and kept running out of RAM
GEMMA2_MODEL = "google/gemma-2-2b-it"  # does not use a system prompt
PHI3 = "microsoft/Phi-3-mini-4k-instruct"

In [None]:
# convert audio to text

def transcribe_audio(audio_file_path):
    """Transcribe the audio file at ``audio_file_path`` using AUDIO_MODEL.

    The file is opened in binary mode and sent to the OpenAI transcription
    endpoint with ``response_format="text"``.

    Returns:
        The value returned by the transcription call, or, if anything goes
        wrong (including the file not being readable), a string starting
        with "An error occurred: " followed by the exception message.
    """
    try:
        with open(audio_file_path, "rb") as audio_handle:
            return openai.audio.transcriptions.create(
                model=AUDIO_MODEL,
                file=audio_handle,
                response_format="text",
            )
    except Exception as err:
        # Errors are reported as text so the caller can show them in the UI
        return f"An error occurred: {err}"

In [None]:
# use transcript to create minutes
# use open source model

# Tokeniser/model pairs already loaded in this session, keyed by model name.
# Loading the weights is the slowest step, so each model is loaded once and
# then reused by later calls instead of being reloaded for every request.
_LOADED_MODELS = {}


def _load_tokenizer_and_model(model_name, quantization_config):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _LOADED_MODELS:
        print("Loading tokeniser....")  # for debugging
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token

        print("Loading model.....")  # for debugging
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map='auto',
            quantization_config=quantization_config,
        )
        print(f"Model loaded on device {model.device}")  # for debugging

        # Cache only after both loads succeeded, so a failed load is retried next call
        _LOADED_MODELS[model_name] = (tokenizer, model)
    return _LOADED_MODELS[model_name]


def create_minutes(transcript, max_new_tokens=2000):
    """Generate Markdown meeting minutes from a transcript with the PHI3 model.

    Args:
        transcript: Text of the meeting. It is checked for emptiness and then
            interpolated into the user prompt.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Defaults to 2000, the value previously hard-coded.

    Returns:
        The generated minutes with surrounding whitespace stripped. Failures
        are not raised: a string describing the problem is returned instead
        (empty transcript, setup error, import error, CUDA out of memory,
        other runtime error, or any other exception).
    """
    # Setup has its own try block so that failures here are reported separately
    try:
        print(f"Starting to create minutes with transcript length: {len(str(transcript))}")

        if not transcript or not str(transcript).strip():
            return "Error: Empty or invalid transcript"

        # messages
        system_prompt = "You are an expert creator of meeting minutes.  Based on a meeting transcript you can summarise the meeting title and date, attendees, key discussion points, key outcomes, actions and owners and next steps.  Respond in Markdown."
        user_prompt = f"Create meeting minutes from the transcript provided.  The minutes should be clear but succint and should include title and date, attendees, key discussion points, key outcomes, actions and owners, and next steps. {transcript}"

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        print("Messages prepared successfully")  # for debugging

        # 4-bit NF4 quantisation (with double quantisation), computing in bfloat16
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    except Exception as e:
        return f"An error occurred in setup: {str(e)}"

    try:
        # model & tokeniser (loaded on the first call, reused afterwards)
        tokenizer, model = _load_tokenizer_and_model(PHI3, quantization_config)

        # Render the chat template to text, then tokenise it onto the model's device
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Gradients are not needed for inference; disabling them saves memory and time
        print("Generating text....")  # for debugging
        with torch.no_grad():
            outputs = model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
            )
        print(f"Generation complete. Output shape: {outputs.shape}")  # for debugging

        # The output sequence starts with the prompt tokens; keep only what follows
        print("Starting text decoding...")  # for debugging
        input_length = len(model_inputs['input_ids'][0])
        print(f"Input length: {input_length}, Output length: {len(outputs[0])}")  # for debugging

        if len(outputs[0]) <= input_length:
            return "Error: Model didn't generate any new tokens. Try reducing input length or increasing max_new_tokens."

        generated_tokens = outputs[0][input_length:]
        print(f"Generated tokens length: {len(generated_tokens)}")  # for debugging

        # decode generated text
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        print(f"Decoded text length: {len(generated_text)}")

        return generated_text.strip()

    except ImportError as e:
        return f"Import error - missing library: {str(e)}. Please install required packages."
    # Must come before RuntimeError, which would otherwise catch it first
    except torch.cuda.OutOfMemoryError as e:
        return f"CUDA out of memory: {str(e)}. Try reducing max_new_tokens to 500 or use CPU."
    except RuntimeError as e:
        return f"Runtime error: {str(e)}. This might be a CUDA/device issue."
    except Exception as e:
        return f"Unexpected error during text generation: {type(e).__name__}: {str(e)}"


In [None]:
# create process for gradio

def gr_process(audio_file, progress=gr.Progress()):
    """Gradio handler: transcribe an audio file, then turn the transcript into minutes.

    Args:
        audio_file: Value supplied by the audio input; ``None`` when nothing
            was provided.
        progress: Gradio progress tracker, updated at each stage.

    Returns:
        The text from ``create_minutes``, or a message string when no file
        was given, transcription reported an error, or an exception occurred.
    """
    if audio_file is None:
        return "Please provide an audio file"

    try:
        progress(0, desc="Analysing file")
        meeting_text = transcribe_audio(audio_file)

        # transcribe_audio reports failures as text with this prefix
        if meeting_text.startswith("An error occurred"):
            return meeting_text

        progress(0.5, desc="File analysed, generating minutes")
        result = create_minutes(meeting_text)
        progress(0.9, desc="Nearly there")
    except Exception as err:
        return f"An error occurred: {err}"

    return result

In [None]:
# gradio interface

# One audio input (passed to gr_process as a file path) and one Markdown output
demo = gr.Interface(
    fn=gr_process,
    inputs=gr.Audio(type="filepath", label="Upload MP3 file"),
    outputs=gr.Markdown(label="Meeting minutes"),
    title="Meeting minute creator",
    description="Upload an mp3 audio file for a meeting and I will provide the minutes!",
)

# Launch the app when this is run as the main module
if __name__ == "__main__":
    demo.launch(debug=True)