<a href="https://colab.research.google.com/github/damola936/AI-ML-LLM/blob/main/LLM_Audio_to_minutes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Create meeting minutes from an audio file

Dataset: https://huggingface.co/datasets/huuuyeah/meetingbank

In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai datasets

In [None]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from datasets import load_dataset
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

In [None]:
# Model identifiers used throughout the notebook

# Hugging Face model id loaded with AutoTokenizer / AutoModelForCausalLM
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Model name passed to the OpenAI audio transcription endpoint
AUDIO_MODEL = "whisper-1"

In [None]:
!wget https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/resolve/main/Alameda/mp3/alameda-9.zip
import zipfile

# Path to the downloaded zip file
zip_path = "/content/alameda-9.zip"

# Where you want to extract the contents
extract_to = "alameda-9"

# Open and extract
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print(f"Extracted files to: {extract_to}")

In [None]:
# Absolute path of one MP3 recording inside the /content/alameda-9 folder
audio_path = "/content/alameda-9/alameda_de1abff5-e7fd-45af-aa26-29e663bd40b6.mp3"

In [None]:
!pip install pydub

In [None]:
from pydub import AudioSegment

# Recording to shorten
audio_path = "/content/alameda-9/alameda_de1abff5-e7fd-45af-aa26-29e663bd40b6.mp3"

# pydub slices audio by milliseconds: 10 min * 60 s * 1000 ms = 600,000 ms
ten_minutes = 10 * 60 * 1000

# Decode the MP3, then keep everything up to the ten-minute mark
audio = AudioSegment.from_mp3(audio_path)
trimmed_audio = audio[:ten_minutes]

# Write the shortened clip next to the original recording
trimmed_audio_path = "/content/alameda-9/trimmed_alameda_10min.mp3"
trimmed_audio.export(trimmed_audio_path, format="mp3")

print(f"Trimmed audio saved to: {trimmed_audio_path}")


In [None]:
# Authenticate with the Hugging Face Hub using the HF_TOKEN entry from Colab's userdata

hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Build the OpenAI client from the OPENAI_API_KEY entry in Colab's userdata

openai_token = userdata.get("OPENAI_API_KEY")
openai = OpenAI(
    api_key=openai_token,
)

In [None]:
# Use the whisper OpenAI model to convert the Audio to text

# Open the clip in a context manager so the file handle is closed as soon as the
# request completes, instead of staying open for the life of the kernel.
with open(trimmed_audio_path, "rb") as audio_file:
    transcription = openai.audio.transcriptions.create(
        model=AUDIO_MODEL,
        file=audio_file,
        response_format="text",
    )
print(transcription)

In [None]:
# Prompt text: the system message sets the assistant's job, the user prompt carries the transcript
system_message = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."
user_prompt = f"Below is an extract transcript of a council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\n{transcription}"

# Role/content pairs in conversation order, as expected by the tokenizer's chat template
messages = [
    {"role": role, "content": content}
    for role, content in (("system", system_message), ("user", user_prompt))
]


In [None]:
import torch

# 4-bit NF4 quantization with double quantization.
# The option is spelled `bnb_4bit_compute_dtype`; under any other spelling the
# bfloat16 compute dtype is not applied.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
# Reuse the end-of-sequence token for padding (the attribute is `pad_token`)
tokenizer.pad_token = tokenizer.eos_token
# add_generation_prompt=True appends the assistant header, so generation starts
# directly with the model's reply
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
# Stream tokens to the cell output as they are generated
streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)

In [None]:
# generate() returns the prompt followed by the completion, so decoding outputs[0]
# as-is would put the system message and the whole transcript into `response`.
# Keep only the text after the last header marker (the assistant's reply) and drop
# the end-of-turn marker so the rendered minutes are clean.
response = tokenizer.decode(outputs[0]).split("<|end_header_id|>")[-1].replace("<|eot_id|>", "").strip()

In [None]:
# Render the response string as formatted Markdown in the cell output
display(Markdown(response))

## Turning this into a Gradio app

In [None]:
!pip install gradio
import gradio as gr

In [None]:
import random
# Folder to sample a recording from
folder_audio_path = "/content/alameda-1"
# Names of the entries in that folder
folder_audio_path_dirs = os.listdir(folder_audio_path)
# Pick one entry at random. NOTE: this is a bare entry name from listdir, not a full path.
audio_path = random.choice(folder_audio_path_dirs)
print(audio_path)

In [None]:
# FUNCTIONS--------

# Commented out to avoid the download time. Uncomment to download from a new link; the data is already downloaded, so just set the audio directory instead.
# def unpack_dataset(link):
#     import zipfile

#     # Use the link to get the zip file name
#     zip_filename = link.split("/")[-1]
#     zip_path = f"/content/{zip_filename}"
#     extract_to = zip_filename.split(".")[0]

#     # Download the file
#     !wget -O "$zip_path" "$link"

#     # Extract the zip file
#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#         zip_ref.extractall(f"/content/{extract_to}")

#     print(f"Extracted files to: /content/{extract_to}")
#     return f"/content/{extract_to}"



def trim_audio(link):
    """Export the first 10 minutes of a randomly chosen MP3 from a folder.

    Args:
        link: Path of a directory containing ``.mp3`` recordings.

    Returns:
        Path of the exported clip: ``/content/<folder>/trimmed_<folder>_10min.mp3``,
        where ``<folder>`` is the last path component of ``link``.

    Raises:
        FileNotFoundError: If the directory contains no source ``.mp3`` file.
    """
    import os
    import random

    folder_audio_path = link
    # normpath drops a trailing slash so the folder name is never empty
    folder_name = os.path.basename(os.path.normpath(folder_audio_path))

    # Only source recordings are candidates: skip non-MP3 entries and "trimmed_*"
    # clips, which this function itself writes into the same folder when `link`
    # lives under /content and which would otherwise be picked up on a later call.
    candidates = sorted(
        name
        for name in os.listdir(folder_audio_path)
        if name.lower().endswith(".mp3") and not name.startswith("trimmed_")
    )
    if not candidates:
        raise FileNotFoundError(f"No source .mp3 files found in {folder_audio_path!r}")
    audio_path = os.path.join(folder_audio_path, random.choice(candidates))

    # Imported only once there is a file to decode
    from pydub import AudioSegment

    # Load the audio file (pydub uses milliseconds)
    audio = AudioSegment.from_mp3(audio_path)

    # Define the first 10 minutes in milliseconds
    ten_minutes = 10 * 60 * 1000  # 600,000 ms

    # Trim the audio
    trimmed_audio = audio[:ten_minutes]

    # Save the trimmed version, creating the output folder if it does not exist yet
    trimmed_dir = f"/content/{folder_name}"
    os.makedirs(trimmed_dir, exist_ok=True)
    trimmed_audio_path = f"{trimmed_dir}/trimmed_{folder_name}_10min.mp3"
    trimmed_audio.export(trimmed_audio_path, format="mp3")

    print(f"Trimmed audio saved to: {trimmed_audio_path}")
    return trimmed_audio_path


def generate_text_from_audio_link(link):
    """Transcribe a 10-minute clip of a meeting recording and write minutes for it.

    Args:
        link: Folder of MP3 recordings to sample from. Any value that is not an
            existing directory (for example a dataset URL) falls back to the
            already-downloaded "/content/alameda-1" folder.

    Returns:
        The model's reply: the decoded text after the last "<|end_header_id|>"
        marker, with "<|eot_id|>" markers and surrounding whitespace removed.
    """
    # Audio files from Hugging face datasets are usually long so we will have to trim it, let's load the dataset first
    print("Unpacking dataset...")
    # audio_link = unpack_dataset(link) use this when you uncomment  unpack_dataset function
    if isinstance(link, str) and os.path.isdir(link):
        # normpath removes a trailing slash so the folder name can be derived from the path
        audio_link = os.path.normpath(link)
    else:
        audio_link = "/content/alameda-1"
    print("Trimming audio... this will take a while...")
    trimmed_audio_path = trim_audio(audio_link)

    print("Extracting audio...")
    # Transcribe audio; the context manager closes the file handle after the request
    with open(trimmed_audio_path, "rb") as audio_file:
        transcription = openai.audio.transcriptions.create(
            model=AUDIO_MODEL,
            file=audio_file,
            response_format="text",
        )

    # Build the conversation from THIS transcription. Using the notebook-level
    # `messages` would summarise the transcript captured when that cell ran,
    # not the clip transcribed above.
    user_prompt = f"Below is an extract transcript of a council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\n{transcription}"
    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
    ]

    # Set the quantization object (the option is spelled `bnb_4bit_compute_dtype`)
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    )

    print("Generating summary...")
    # Get the tokenizer, model and generate summary and minutes
    # NOTE(review): the model is loaded again on every call — consider reusing one instance.
    tokenizer = AutoTokenizer.from_pretrained(LLAMA)
    tokenizer.pad_token = tokenizer.eos_token
    # add_generation_prompt=True appends the assistant header so generation starts with the reply
    inputs = tokenizer.apply_chat_template(
        chat_messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    streamer = TextStreamer(tokenizer)
    model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
    outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)
    decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    # Everything after the last header marker is the assistant's reply; drop the end-of-turn marker
    response = decoded_text.split("<|end_header_id|>")[-1]
    return response.replace("<|eot_id|>", "").strip()

In [None]:
# Web UI: one text box in, the generated minutes in a text box out
demo = gr.Interface(
    fn=generate_text_from_audio_link,
    inputs="textbox",
    outputs="textbox",
)
demo.launch(inbrowser=True, debug=True)