In [9]:
import re

import fitz  # PyMuPDF

# Leading section number of a title: "3." (main heading) or "3.2" (subheading).
# Group 1 is the major number; group 2 is the minor number and is None for a
# main heading. The minor number may not start with 0, so "1.0 Foo" is still
# treated as a main heading.
_SECTION_NUMBER_RE = re.compile(r"(\d+)\.([1-9]\d*)?")

# Bit 4 of a PyMuPDF span's "flags" value marks a bold font.
_BOLD_FLAG = 1 << 4


def _is_bold(span):
    """Return True if the span's font name or its font flags mark it as bold."""
    if "bold" in span["font"].lower():
        return True
    # Some fonts are bold without saying so in their name; fall back to flags.
    return bool(span.get("flags", 0) & _BOLD_FLAG)


def _iter_bold_texts(page):
    """Yield the cleaned text of every bold span on *page*, in reading order."""
    for block in page.get_text("dict")["blocks"]:
        # Image blocks carry no "lines" key, hence the defaults.
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                if _is_bold(span):
                    # Drop colons first, then strip, so "Title :" does not
                    # keep a trailing space.
                    yield span["text"].replace(":", "").strip()


def extract_headings_page_by_page(pdf_path):
    """Extract numbered bold headings and subheadings from a PDF, page by page.

    A bold span whose text starts with "<n>." is a main heading; one that
    starts with "<n>.<m>" is a subheading and is attached to the most recent
    main heading on the same page, provided their major numbers agree.
    Section numbers may have any number of digits.

    Heading tracking restarts on every page, so a subheading that appears
    before any main heading on its page is dropped.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A dict mapping ``"Page <k>"`` (1-based) to a dict that maps each main
        heading found on that page to the list of its subheadings.
    """
    page_data = {}

    with fitz.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf, start=1):
            organized_text = {}
            current_heading = None
            current_major = None

            for text in _iter_bold_texts(page):
                numbering = _SECTION_NUMBER_RE.match(text)
                if numbering is None:
                    continue  # bold, but not a numbered title

                major, minor = numbering.groups()
                if minor is None:
                    # Main heading, e.g. "1. GEN AI Overview".
                    current_heading, current_major = text, major
                    organized_text.setdefault(current_heading, [])
                elif major == current_major:
                    # Subheading of the current heading, e.g. "1.1 Intro".
                    organized_text[current_heading].append(text)

            page_data[f"Page {page_number}"] = organized_text

    return page_data

# Location of the PDF to analyse.
pdf_path = "rk3.pdf"  # Replace with your file path
page_wise_data = extract_headings_page_by_page(pdf_path)

# Print an indented outline: the page label, then each heading on that page,
# then the subheadings collected under it.
for page_label in page_wise_data:
    print(page_label)
    for title, children in page_wise_data[page_label].items():
        print(f"  Heading: {title}")
        for child in children:
            print(f"    Subheading: {child}")


Page 1
  Heading: 1.GEN AI Overview
    Subheading: 1.1 Intro to gen ai 
    Subheading: 1.2 History about Gen AI 
    Subheading: 1.3 Basic Fundamentals of Gen AI 
    Subheading: 1.4 Prerequists for gen AI 
    Subheading: 1.5 NLP for Gen AI
  Heading: 2.NLP Overview 
    Subheading: 2.1 NLP 
    Subheading: 2.2 Basics of NLP
    Subheading: 2.3 Text Preprocessing techniques
Page 2
  Heading: 3.Word Embedding
    Subheading: 3.1 Word 2 Vec
    Subheading: 3.2 One Hot Encoding 
    Subheading: 3.3 Bag Of Words 
    Subheading: 3.4 TF-IDF 
    Subheading: 3.5 Avg Bag Of Words 
Page 3
  Heading: 4.MODEL’S
    Subheading: 4.1 MODELS 
    Subheading: 4.2 RNN Model 
    Subheading: 4.3 LSTM Model 
    Subheading: 4.4 GRU Model 
    Subheading: 4.5 Bidirectional LSTM
  Heading: 5.Advanced Learning
    Subheading: 5.1 ENCODER AND DECODER 
    Subheading: 5.2 ENCODER AND DECODER with Attention 
    Subheading: 5.3 INTRO TO TRANSFORMERS 
