# Create Meeting Minutes from an Audio File with a Gradio UI

- Start by installing the dependencies needed for audio transcription, model loading, and the Gradio interface.


In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.4/321.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# imports
import os
import torch
import gradio as gr
from openai import OpenAI
from google.colab import userdata, drive
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from pathlib import Path

## Mount Google Drive to the Colab Environment

- Mount Google Drive to enable access to files stored in your Drive directly from the Colab environment.  
- This allows saving and loading models, files, and other resources persistently across sessions.


In [None]:
# Mount Google Drive at /content/drive so files stored in Drive can be accessed from this Colab runtime.
drive.mount("/content/drive")

Mounted at /content/drive


## Define Required Constants

- Set up the necessary constants that will be used throughout the application, such as model-specific markers, token limits, and configuration values.


In [None]:
# --- Model identifiers -------------------------------------------------------
# OpenAI speech-to-text model used to transcribe the uploaded audio.
GPT_STT_MODEL = "whisper-1"
# Hugging Face Hub identifier of the LLaMA model that writes the meeting minutes.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# --- Storage locations (optional, adjust to taste) ---------------------------
# To keep the model only in the temporary runtime session (non-persistent storage),
# use the next line instead of the Google Drive path below:
# DRIVE_DIR = "/content"
DRIVE_DIR = "/content/drive/MyDrive"  # Google Drive root, persists across sessions.
DRIVE_MODELS_DIR = f"{DRIVE_DIR}/my_models"  # Folder under DRIVE_DIR where saved models live.

# --- Chat-template markers ---------------------------------------------------
# Marker string that can be used to cut decoded model output down to the final section.
MODEL_SPECIAL_MARKER = "<|end_header_id|>"
# End-of-sequence marker string that can be stripped from decoded model output.
MODEL_EOS = "<|eot_id|>"

# --- Prompts -----------------------------------------------------------------
# System message: fixes the assistant's role and the expected shape of the output.
SYSTEM_PROMPT = (
    "You are an AI assistant designed to generate detailed meeting minutes "
    "from transcripts in markdown format. Your output should include a summary, "
    "key discussion points, takeaways, and action items with assigned owners."
)

# User message prefix: the transcript text is appended directly after the blank line.
USER_PROMPT = (
    "The following is an excerpt from a council meeting transcript. Please generate "
    "well-structured meeting minutes in markdown format, including: a summary with "
    "attendees, location, and date; key discussion points; takeaways; and action "
    "items with designated owners.\n\n"
)

# --- Generation limits -------------------------------------------------------
# Upper bound on the number of newly generated tokens per request.
MAX_TOKENS = 2000


## Add Secrets to the Colab Notebook

- Add your Hugging Face Hub credentials to sign in and access models.  
- Provide your OpenAI private API key to enable access to the OpenAI services.

In [None]:
# Sign in to the Hugging Face Hub using the token stored in Colab Secrets under the name HF_TOKEN
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [None]:
# Create the OpenAI client using the API key stored in Colab Secrets under the name OPENAI_API_KEY
openai_api_key = userdata.get('OPENAI_API_KEY')
# `openai` is the client instance that transcribe_audio() calls for speech-to-text
openai = OpenAI(api_key=openai_api_key)

## Define Model and Tokenizer Variables

- Initialize the `model` and `tokenizer` variables to `None`.  
- They act as placeholders: the model and tokenizer are loaded lazily the first time meeting minutes are generated, then reused for later requests.

In [None]:
# Module-level placeholders for the tokenizer and model.
# generate_meeting_minutes() fills them in via load_model() the first time either is still None.
tokenizer = None
model = None

## `Whisper` OpenAI Model for Audio-to-Text Conversion

- Use the `Whisper` model by OpenAI to accurately transcribe the uploaded audio file into text.  
- This transcribed text will serve as the input for generating the meeting minutes.


In [None]:
def transcribe_audio(audio_filename, progress=gr.Progress()):
    """Transcribe an audio file to text with the OpenAI speech-to-text model.

    Args:
        audio_filename: Path of the audio file to read.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        The transcription result returned by the OpenAI API
        (requested with response_format="text").

    Raises:
        Exception: If reading the file or the API call fails; the original
            error is chained as the cause.
    """
    progress(0.3, desc="Generating meeting transcript...")

    try:
        # Binary mode: the raw bytes are uploaded to the API as-is.
        with open(audio_filename, "rb") as recording:
            transcript_text = openai.audio.transcriptions.create(
                model=GPT_STT_MODEL,
                file=recording,
                response_format="text",
            )
    except Exception as e:
        # Re-raise with context so the caller gets a readable message.
        raise Exception(f"An error occurred while transcribing audio: {e}") from e

    return transcript_text


## Load the Model and Tokenizer

- If this is the first time using the runtime, load the model from the Hugging Face Hub and save it to the drive for future use (this ensures the model persists even after the runtime disconnects).  
- Alternatively, the model can be saved in the current temporary runtime session location, but note that it will not persist after the session ends or disconnects.  
- If the model is already saved on the drive, it will be loaded directly from there to save time.


In [None]:
def load_model(model_name, local_dir=DRIVE_MODELS_DIR):
    """Load the tokenizer and model, using a local copy when one is available.

    On the first run the model is downloaded from the Hugging Face Hub with
    4-bit quantization and then saved under ``local_dir`` so later sessions
    can load it from there instead of downloading it again.

    Args:
        model_name: Hugging Face Hub identifier of the model.
        local_dir: Directory under which a per-model subdirectory is kept.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # One subdirectory per model; '/' in the Hub id is not usable in a folder name.
    model_dir = Path(local_dir) / model_name.replace("/", "_")

    # A bare directory is not proof of a usable copy (e.g. the session may have
    # been interrupted while saving). The tokenizer is saved last below, so its
    # config file doubles as a "save finished" marker.
    cache_is_complete = (
        (model_dir / "config.json").exists()
        and (model_dir / "tokenizer_config.json").exists()
    )

    if cache_is_complete:
        # Load the tokenizer and model from the existing directory.
        # NOTE(review): assumes the saved checkpoint carries its own quantization
        # settings in config.json, so none are passed here - confirm.
        tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
        model = AutoModelForCausalLM.from_pretrained(str(model_dir), device_map="auto")

    else:
        # Quantization settings for loading the model in 4-bit precision
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,                      # Load the weights in 4-bit precision to save memory
            bnb_4bit_use_double_quant=True,         # Also quantize the quantization constants
            bnb_4bit_compute_dtype=torch.bfloat16,  # Run computations in bfloat16
            bnb_4bit_quant_type="nf4"               # Use the NF4 quantization type
        )

        # Download the tokenizer with remote code support enabled
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        # Reuse the end-of-sequence token as the padding token
        tokenizer.pad_token = tokenizer.eos_token

        # The quantization settings must go in `quantization_config`; `config` is
        # the slot for the model architecture config, not for BitsAndBytesConfig.
        # device_map="auto" lets accelerate place the weights on the available device(s).
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quant_config,
            device_map="auto",
        )

        # Save for future sessions: model first, tokenizer last (see marker note above).
        model_dir.mkdir(parents=True, exist_ok=True)
        model.save_pretrained(model_dir)
        tokenizer.save_pretrained(model_dir)

    # Return the loaded tokenizer and model for further use
    return tokenizer, model



## Generate Meeting Minutes from Recorded Audio

- Use the loaded model to process the transcript and produce a well-structured meeting minutes summary.


In [None]:
def generate_meeting_minutes(transcription, progress=gr.Progress()):
    """Generate markdown meeting minutes from a transcript with the LLaMA model.

    The tokenizer and model are loaded on first use and kept in the
    module-level ``tokenizer`` / ``model`` variables for later calls.

    Args:
        transcription: Transcript text of the meeting.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        The generated meeting minutes as a string.

    Raises:
        Exception: If the transcript is empty or if loading, tokenizing or
            generating fails; the original error is chained as the cause.
    """
    # Declared global because they are assigned here on first use
    global tokenizer, model

    progress(0.5, desc="Preparing to generate meeting minutes...")

    try:
        # Without any transcript text the model could only invent content.
        if not transcription or not str(transcription).strip():
            raise ValueError("The transcript is empty; there is nothing to summarize.")

        # Lazily load (or download) the tokenizer and model
        if tokenizer is None or model is None:
            tokenizer, model = load_model(LLAMA)

        # Message sequence in chat-template format
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},  # Defines the assistant's task
            {"role": "user", "content": USER_PROMPT + transcription}  # Instructions + transcript
        ]

        # return_dict=True yields input_ids together with the attention mask, so
        # generate() does not have to guess which positions are padding.
        # The tensors follow the model's device instead of a hard-coded "cuda".
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,  # End the prompt with the assistant header
            return_tensors="pt",         # PyTorch tensors
            return_dict=True,
        ).to(model.device)

        progress(0.75, desc="Generating and decoding the result...")
        outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)

        # generate() returns prompt + continuation; keep only the newly generated
        # tokens and let the tokenizer drop special tokens while decoding, rather
        # than splitting the decoded text on marker strings.
        prompt_length = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        progress(0.9, desc="Finalizing and formatting meeting minutes...")
        return response.strip()

    except Exception as e:
        # Re-raise with context so the caller gets a readable message
        raise Exception(f"Error generating the meeting minutes summary: {str(e)}") from e


## Process Uploaded Audio to Generate a Summarized Meeting Minutes Recap

- Convert the uploaded audio file into text using the `Whisper` model.  
- Generate structured meeting minutes from the transcript using the LLaMA model.

In [None]:
def process_audio(audio_file, progress=gr.Progress()):
    """Turn an uploaded MP3 file into meeting minutes.

    Validates the upload, transcribes it and then generates the minutes.
    Problems are reported as a message string instead of being raised, so
    the text can be shown directly in the output component.

    Args:
        audio_file: Path of the uploaded audio file, or None if nothing was uploaded.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        The generated meeting minutes, or a message describing what went wrong.
    """
    progress(0.1, desc="Start Audio Processing ...")

    # Guard: nothing was uploaded
    if audio_file is None:
        return "No audio file detected. Please upload a valid audio file to proceed."

    # Guard: only files whose name ends in .mp3 (case-insensitive) are accepted
    is_mp3 = str(audio_file).lower().endswith(".mp3")
    if not is_mp3:
        return "Unsupported file format. Please upload a valid MP3 file."

    try:
        # Stage 1: speech-to-text, stage 2: minutes from the transcript
        transcript = transcribe_audio(audio_file)
        minutes = generate_meeting_minutes(transcript)
        progress(1.0, desc="Meeting Minutes Complete!")
    except Exception as e:
        # Report the failure as text rather than raising
        return f"Error processing the audio file: {str(e)}"
    else:
        return minutes


## Create the User Interface (UI) with Gradio

- Design a simple and intuitive Gradio interface for uploading audio files and displaying the generated meeting minutes.
- Ensure the UI supports real-time progress updates and displays the final output in markdown format.


In [None]:
# Intro text shown under the title to explain how to use the app
description = """<div style="text-align: center;">
Upload an MP3 recording of your meeting and let MeetingRecap handle the rest.<br>
Our AI assistant will generate a clear and accurate set of meeting minutes, including key discussions, action items, and decisions.<br>
In just a few minutes, you'll receive a well-organized summary that saves you time and effort.
</div><br>"""

# Input widget: hands the upload to process_audio as a file path, in MP3 format
audio_input = gr.Audio(type="filepath", label="Upload Recorded MP3 File", format="mp3")

# Output widget: renders the returned minutes as Markdown
minutes_output = gr.Markdown(label="Meeting Minutes", min_height=60)

ui = gr.Interface(
    fn=process_audio,        # Processes the uploaded audio and returns the meeting minutes
    inputs=audio_input,
    outputs=minutes_output,
    title="MeetingRecap",    # Title of the Gradio interface
    description=description, # Shown below the title
    flagging_mode="never",   # Disables the flagging feature in the UI
)

# Start the Gradio app; inbrowser=True asks Gradio to open it in a browser
ui.launch(inbrowser=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7c9a827bef3c6564dc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7c9a827bef3c6564dc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## Contributing
Contributions are welcome! Here are some ways you can contribute to the project:
- Report bugs and issues.
- Suggest new features or improvements.
- Submit pull requests with bug fixes or enhancements.

You can contribute to this project by visiting the [GitHub repository](https://github.com/emads22/MeetingRecap).

## Author
- **Emad**  
  [<img src="https://img.shields.io/badge/GitHub-Profile-blue?logo=github" width="150">](https://github.com/emads22)

## License
This project is licensed under the MIT License, which grants permission for free use, modification, distribution, and sublicense of the code, provided that the copyright notice (attributed to [emads22](https://github.com/emads22)) and permission notice are included in all copies or substantial portions of the software. This license is permissive and allows users to utilize the code for both commercial and non-commercial purposes.

Please see the [LICENSE](https://github.com/emads22/MeetingRecap/blob/main/LICENSE) file for more details.
