## Comprehensive Notes Generator

In [None]:
import os
from groq import Groq
import moviepy

The video file can be mp4, mkv, or many other formats (refer to the Groq Whisper API documentation for the full list). The file is first converted into an audio (.wav) file.

In [None]:
from moviepy.editor import VideoFileClip

video = "test_video.mp4"

# Open the clip as a context manager so the resources moviepy holds for the
# video are released once the audio has been written.
with VideoFileClip(video) as clip:
    if clip.audio is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"{video} has no audio track to extract")
    # Write the audio track to audio.wav using the 16-bit PCM codec.
    clip.audio.write_audiofile("audio.wav", codec='pcm_s16le')

This is the Groq API key, stored in a text file. A key can be obtained via https://groq.com/

In [None]:
# Load the Groq API key from key.txt, dropping surrounding whitespace
# (for example a trailing newline).
with open("key.txt", "r") as key_file:
    key = key_file.read().strip()

The first client is used for transcription of the audio chunks.

In [None]:
# Groq client authenticated with the API key; used for the audio
# transcription requests.
client = Groq(api_key=key)

The code below segments the audio .wav file into small 1-minute chunks, which makes it possible to send each chunk to the OpenAI Whisper model via the Groq API.

In [None]:
from pydub import AudioSegment
import os

# Load the extracted audio track.
audio = AudioSegment.from_wav("audio.wav")

# Length of every chunk in milliseconds (one minute).
chunk_length_ms = 60 * 1000

# Directory that receives the exported chunk files.
output_dir = "chunks"
os.makedirs(output_dir, exist_ok=True)

# Slice the audio into consecutive windows and export them as
# chunk_1.wav, chunk_2.wav, ... (numbering starts at 1).
chunk_starts = range(0, len(audio), chunk_length_ms)
for chunk_index, start_ms in enumerate(chunk_starts, start=1):
    segment = audio[start_ms:start_ms + chunk_length_ms]
    chunk_name = os.path.join(output_dir, f"chunk_{chunk_index}.wav")
    segment.export(chunk_name, format="wav")
    print(f"Exported {chunk_name}")


In [None]:
# Build the path with os.path.join: the literal "chunks\chunk_1.wav" only
# resolves on Windows and contains the invalid escape sequence "\c".
filename_audio = os.path.join("chunks", "chunk_1.wav")

# Smoke test: transcribe the first chunk and print the resulting text.
with open(filename_audio, "rb") as file:
    transcription = client.audio.transcriptions.create(
        file=(filename_audio, file.read()),
        model="whisper-large-v3",
        response_format="verbose_json",
    )
    print(transcription.text)



The below code completes the transcription.

In [None]:
import os
from tqdm import tqdm

chunk_dir = "chunks"
transcript_dict = {}

# Number of bytes read from disk per progress-bar update (1 MiB).
read_size = 1024 * 1024

# Only .wav files are transcribed; names are visited in sorted order.
wav_files = [name for name in sorted(os.listdir(chunk_dir)) if name.endswith(".wav")]

for chunk_file in wav_files:
    filename_audio = os.path.join(chunk_dir, chunk_file)
    print(f"\n🔊 Transcribing {chunk_file}...")

    # Read the whole chunk into memory piece by piece, advancing the
    # progress bar after every piece read from the local file.
    pieces = []
    with open(filename_audio, "rb") as file:
        total_size = os.path.getsize(filename_audio)

        with tqdm(total=total_size, unit='B', unit_scale=True, desc=f"Uploading {chunk_file}") as pbar:
            for piece in iter(lambda: file.read(read_size), b""):
                pieces.append(piece)
                pbar.update(len(piece))
            content = b"".join(pieces)

    # Key the transcript by the digits found in the file name; fall back to
    # the file name itself when they do not form an integer.
    try:
        chunk_num = int(''.join(filter(str.isdigit, chunk_file)))
    except ValueError:
        chunk_num = chunk_file

    # Send the chunk to the Groq API; a failure is reported and the loop
    # moves on to the next chunk.
    try:
        transcription = client.audio.transcriptions.create(
            file=(chunk_file, content),
            model="whisper-large-v3",
            response_format="verbose_json",
        )
    except Exception as e:
        print(f"❌ Failed to transcribe {chunk_file}: {e}")
    else:
        transcript_dict[chunk_num] = transcription.text
        print("📝 Transcription:", transcription.text)

# (Optional) print final dict keys
print("\n📚 Final transcript dictionary keys:", list(transcript_dict.keys()))


In [None]:
# Display the chunk-number -> transcript text mapping.
transcript_dict

In [None]:
# Rebuild the mapping with its keys in ascending order.
sorted_transcript_dict = dict(
    sorted(transcript_dict.items(), key=lambda pair: pair[0])
)

sorted_transcript_dict

In [None]:
import json
# Pretty-printed JSON text of the sorted transcripts. ensure_ascii=False keeps
# non-ASCII characters readable instead of emitting \uXXXX escapes, matching
# the way the transcripts are written to sorted_transcript.json.
json_data = json.dumps(sorted_transcript_dict, indent=4, ensure_ascii=False)

In [None]:
# Write the sorted transcripts to disk as UTF-8 encoded JSON.
with open("sorted_transcript.json", "w", encoding="utf-8") as json_out:
    json.dump(sorted_transcript_dict, json_out, indent=4, ensure_ascii=False)

print("✅ JSON saved as sorted_transcript.json")

In [None]:
# Show the serialized JSON text of the sorted transcripts.
print(json_data)


The transcripts generated from the video are stored as JSON. They will be injected into the LangChain prompt later via ChromaDB.

Populate the vector database.

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

First, generate vector embeddings using HuggingFaceEmbeddings.

In [None]:
# Wrap every transcript chunk in a Document, recording its chunk id as metadata.
docs = []
for chunk_id, chunk_text in sorted_transcript_dict.items():
    docs.append(Document(page_content=chunk_text, metadata={"chunk_id": chunk_id}))

In [None]:

# Embedding model used to vectorize the transcript documents.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Build a Chroma vector store from the documents, using ./chroma_lecture_db
# as its persistence directory.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory="./chroma_lecture_db"
)

# NOTE(review): recent Chroma releases may persist automatically when
# persist_directory is set — confirm this explicit call is still needed.
vectordb.persist()

### PPT processing

It's not that complicated. The goal is to extract the title, heading, and slide content into a JSON object.

In [None]:
import json
import os
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from PIL import Image

def extract_pptx_content(pptx_path, output_dir=r'<project directory>/extracted', json_file='slides_data.json'):
    """Extract per-slide text and pictures from a .pptx file.

    For every slide a record is built with the slide number, the text of the
    title placeholder ("heading"), the first other non-empty text frame
    ("subheading"), the remaining text frames joined by newlines ("text") and
    the paths of the pictures saved from that slide ("images").

    Args:
        pptx_path: Path of the presentation to read.
        output_dir: Directory that receives the extracted picture files;
            created if it does not exist.
        json_file: Path of the JSON file the slide records are written to.

    Returns:
        None. The records are written to ``json_file`` and a summary line is
        printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    deck = Presentation(pptx_path)
    records = []

    for number, slide in enumerate(deck.slides, start=1):
        heading = ""
        body_texts = []
        image_paths = []

        for shape in slide.shapes:
            if shape.has_text_frame:
                # The placeholder with index 0 is taken as the slide title;
                # every other non-empty text frame is collected in order.
                if shape.is_placeholder and shape.placeholder_format.idx == 0:
                    heading = shape.text.strip()
                else:
                    stripped = shape.text.strip()
                    if stripped:
                        body_texts.append(stripped)

            # Pictures are saved independently of the text handling above.
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                picture = shape.image
                blob, ext = picture.blob, picture.ext
                target = f"{output_dir}/slide_{number}_image_{len(image_paths)+1}.{ext}"
                with open(target, 'wb') as image_out:
                    image_out.write(blob)
                image_paths.append(target)

        # The first collected text becomes the subheading, the rest the body.
        records.append({
            "slide_number": number,
            "heading": heading,
            "subheading": body_texts[0] if body_texts else "",
            "text": "\n".join(body_texts[1:]),
            "images": image_paths,
        })

    # Persist all slide records as UTF-8 JSON.
    with open(json_file, 'w', encoding='utf-8') as json_out:
        json.dump(records, json_out, indent=4, ensure_ascii=False)

    print(f"Extraction complete. Data saved to {json_file} and images in '{output_dir}' folder.")

# Run the extraction on the lecture deck (replace the placeholder path first).
extract_pptx_content(r"<your lecture ppt>.pptx")


Get all the unique topics from the slides. We will later iterate over these topics to generate a topic-wise summary.

In [None]:
# get all the unique topics from the json file

import json

def extract_unique_topics(json_file):
    """Return the unique slide topics in the order they first appear.

    A slide's topic is the first line of its "heading" field, stripped of
    surrounding whitespace. Slides whose heading is missing, null or blank
    are skipped.

    Args:
        json_file: Path to the JSON file holding the list of slide records.

    Returns:
        list[str]: Unique topics ordered by first appearance in the file, so
        anything generated per topic follows the slide order and is
        reproducible between runs.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        slides_data = json.load(f)

    # A dict de-duplicates while keeping insertion order; a set would return
    # the topics in an arbitrary order.
    topics = {}
    for slide in slides_data:
        heading = (slide.get("heading") or "").strip()
        if heading:
            # Take only the first line if the heading is multi-line.
            topics[heading.split('\n')[0].strip()] = None

    return list(topics)

# Example usage
# Collect the unique slide topics from the extracted slide data and show them.
topics_vector = extract_unique_topics('slides_data.json')
print(topics_vector)


Build a topic content mapping where the unique topics are mapped to the concatenated slides content. 

In [None]:
import json
from collections import defaultdict
def build_topic_content_mapping(json_file):
    """Map every topic to the concatenated "text" of its slides.

    The topic of a slide is the first line of its "heading" field. Only the
    "text" field contributes content; slides with an empty heading or empty
    text are ignored, so a topic without any text does not appear in the
    result.

    Args:
        json_file: Path to the JSON file holding the list of slide records.

    Returns:
        dict: topic -> slide texts in file order, each followed by a newline.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        slides = json.load(handle)

    pieces_by_topic = {}
    for entry in slides:
        heading = entry.get("heading", "").strip()
        if not heading:
            continue
        body = entry.get("text", "").strip()
        if not body:
            continue
        # Normalize the topic name to the first line of the heading.
        topic = heading.split('\n')[0].strip()
        pieces_by_topic.setdefault(topic, []).append(body + "\n")

    return {topic: "".join(parts) for topic, parts in pieces_by_topic.items()}

# Usage
# Map each topic to the concatenated text of its slides.
topic_content_dict = build_topic_content_mapping("slides_data.json")


In [None]:
# Rebuild the mapping, then print every topic followed by its content.
topic_content_dict = build_topic_content_mapping("slides_data.json")
for topic in topic_content_dict:
    content = topic_content_dict[topic]
    print(f"\n=== {topic} ===\n{content}")

In [None]:
# Display the topic -> slide text mapping.
topic_content_dict

## Langchain to generate the pdf

In [None]:
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain_core.prompts import PromptTemplate

Use the same key.txt as used earlier.

In [None]:
from langchain_groq import ChatGroq
# Read the API key inside a context manager so the file handle is closed, and
# strip surrounding whitespace so a trailing newline in key.txt does not end
# up in the key sent to the API.
with open("key.txt") as f:
    key = f.read().strip()

# Groq-hosted chat model used to generate the notes.
llm = ChatGroq(
    model="llama3-70b-8192",
    temperature=0,
    groq_api_key=key,
)

Define a clear prompt template.

In [None]:
# Prompt with three placeholders: {lecture_transcript}, {slides_content} and
# {unique_topic}. It asks the model to merge the transcript and the slide
# content into notes for the topic, without a preamble.
prompt_temp = PromptTemplate.from_template(
    '''
    ### UNIVERSITY LECTURE TRANSCRIPT:
    {lecture_transcript}

    ### LECTURE SLIDE CONTENT:
    {slides_content}

    ###TOPIC:
    {unique_topic}

    ###INSTRUCTIONS:
    You are John, An expert at making comprehensive academic notes. 
    You are required do exactly what you are good at. Given the lecture transcripts and the lecture slide content for a particular topic, generate
    comprehensive lecture notes for the same covering all important details, merging the information from the slides and the transcripts.
    ensure the text is correctly formatted.

    DO NOT provide a preamble.
    ### ANSWER (NO PREAMBLE):

    '''
)

Chain the prompt into the LLM.

In [None]:
# Pipe the prompt template into the chat model.
chain = prompt_temp | llm

In [None]:
from markdown2 import markdown
from xhtml2pdf import pisa

# Markdown sections, one per topic; joined into a single document below.
sections = []

for topic in topics_vector:
    # Topics without slide text are absent from the mapping. Skip them
    # explicitly instead of hiding every possible error behind a bare except.
    slide_content = topic_content_dict.get(topic)
    if slide_content is None:
        continue

    # Retrieve the five transcript chunks most similar to the topic name and
    # concatenate them, one per line.
    results = vectordb.similarity_search(topic, k=5)
    transcript_content = "".join(doc.page_content + "\n" for doc in results)

    res = chain.invoke({
        "lecture_transcript": transcript_content,
        "slides_content": slide_content,
        "unique_topic": topic
    })

    # Each topic becomes its own markdown section followed by a rule.
    sections.append(f"## {topic}\n\n{res.content}\n\n---\n\n")

final_md = "".join(sections)

# Convert markdown to HTML
html_content = markdown(final_md)

# Write HTML to PDF
with open("qlora_lecture_notes.pdf", "wb") as pdf_file:
    pisa_status = pisa.CreatePDF(html_content, dest=pdf_file)

# Report success only when xhtml2pdf signals no conversion errors.
if pisa_status.err:
    print("❌ PDF generation failed: qlora_lecture_notes.pdf")
else:
    print("✅ PDF generated: qlora_lecture_notes.pdf")
