In [None]:
# install dependencies
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai anthropic google

In [None]:
# Import libraries
from dotenv import load_dotenv
import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import anthropic
from google import genai
# from google.colab import drive
from huggingface_hub import login
#from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, TextIteratorStreamer
import torch
import gradio as gr
import threading

### Environment Setup
I prefer to group all of my API key setup at the beginning of the notebook, which makes it easier to reuse.

In [None]:
# Set up API keys and sign in to services if they exist.
# Comment out the ones you're not using.

# override=True lets values from the .env file replace variables already set in the environment.
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
weather_api_key = os.getenv('WEATHER_API_KEY')
hf_api_key = os.getenv('HF_API_KEY')

# Each client is only created when its key is present, so later cells that use a
# client whose key is missing will fail with a NameError.
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
    openai = OpenAI(api_key=openai_api_key)
else:
    print("OpenAI API Key not set")

if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
    # Pass the key explicitly, consistent with the other clients.
    claude = anthropic.Anthropic(api_key=anthropic_api_key)
else:
    print("Anthropic API Key not set")

if google_api_key:
    print(f"Google API Key exists and begins {google_api_key[:8]}")
    gemini = genai.Client(api_key=google_api_key)
    #ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key="ollama")
else:
    print("Google API Key not set")

if weather_api_key:
    print(f"Weather API Key exists and begins {weather_api_key[:7]}")
else:
    print("Weather API Key not set")

if hf_api_key:
    # Slice the HuggingFace key itself. Slicing weather_api_key here printed the wrong
    # key prefix and raised TypeError whenever WEATHER_API_KEY was not set.
    print(f"HuggingFace API Key exists and begins {hf_api_key[:7]}")
    login(hf_api_key, add_to_git_credential=True)
else:
    print("HuggingFace API Key not set")

In [None]:
# Model identifiers

LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # Hugging Face model id given to from_pretrained
AUDIO_MODEL = "whisper-1"  # model name given to the OpenAI transcription call

In [None]:
# 4-bit NF4 quantization with double quantization enabled; computation runs in bfloat16.
quant_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Gradio UI and handler functions

def process_file(file_path):
    """Transcribe the selected audio file with the OpenAI transcription API.

    Args:
        file_path: Path of the file to transcribe, or None when no file is selected.

    Returns:
        tuple: ``(status_message, transcription)``. ``transcription`` is None
        when no file was selected.
    """
    if file_path is None:
        return "No file selected", None

    print(f"Selected file: {file_path}")
    # The context manager closes the handle even if the API call raises;
    # previously the file was opened and never closed.
    with open(file_path, "rb") as audio_file:
        transcription = openai.audio.transcriptions.create(
            model=AUDIO_MODEL, file=audio_file, response_format="text"
        )
    print(f"File processed: {file_path}")
    return "File processed successfully", transcription

def process_transcription(transcription):
    """Stream markdown meeting minutes generated from a meeting transcript.

    This is a generator: every yielded value is the complete text produced so
    far, so a UI component bound to it can re-render the growing result.

    Args:
        transcription: Transcript text, or None/empty when nothing has been
            transcribed yet.

    Yields:
        str: The accumulated model output after each streamed chunk, or the
        single string "No data" when there is no transcript.
    """
    # Because this function contains `yield`, a plain `return "No data"` is
    # discarded by the caller; the placeholder has to be yielded instead.
    if not transcription:
        yield "No data"
        return

    # Set up prompts
    system_message = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."
    user_prompt = (
        "Below is an extract transcript of a meeting. Please write minutes in markdown, including a summary with attendees, \n"
        "        location and date; discussion points; takeaways; and action items with owners.\n"
        f"{transcription}"
    )
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
    ]

    # Tokenize the input, pass it to the model and stream the model response.
    tokenizer = AutoTokenizer.from_pretrained(LLAMA)
    tokenizer.pad_token = tokenizer.eos_token
    # add_generation_prompt=True appends the assistant header to the prompt so the
    # model starts writing the reply directly instead of first emitting the
    # "assistant" role tag into the visible output.
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    def generate():
        model.generate(
            inputs,
            max_new_tokens=2000,
            streamer=streamer,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Generation runs in a background thread so this generator can consume the
    # streamer and yield partial results while tokens are still being produced.
    thread = threading.Thread(target=generate)
    thread.start()
    result = ""
    for new_text in streamer:
        result += new_text or ""
        yield result
    thread.join()

# Two-step UI: the first button runs process_file, the second streams process_transcription.
# NOTE(review): the first button is labelled "Select a file" but triggers process_file, and the
# button labelled "Transcribe" triggers process_transcription — confirm the labels are intended.
with gr.Blocks() as interface:
    # Hidden state that carries the transcription from step one to step two
    intermediate_data = gr.State()

    file_input = gr.File(type='filepath', label="Select a file")
    status_output = gr.Textbox(label="Status")
    final_output = gr.Markdown(label="Final Result")

    process_btn = gr.Button("Select a file")
    continue_btn = gr.Button("Transcribe")

    # Step 1: the status text is displayed; the transcription goes into the hidden state
    process_btn.click(fn=process_file, inputs=file_input, outputs=[status_output, intermediate_data])

    # Step 2: feed the stored transcription to process_transcription and render its output
    continue_btn.click(fn=process_transcription, inputs=intermediate_data, outputs=final_output)

interface.launch(debug=True)