In [12]:
import re
import os
import pdfplumber
import json
from collections import defaultdict

def extract_pdf_sections(pdf_path, max_pages=None):
    """
    Extract sections and tables from a PDF file.
    Sections are identified by patterns like (#.#.#)* or similar numbering formats.
    
    Args:
        pdf_path: Path to the PDF file
        max_pages: Maximum number of pages to process (None for all pages)
    """
    # Store all extracted content with page info
    all_content = []
    
    # Open the PDF with pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        # Determine how many pages to process
        total_pages = len(pdf.pages)
        pages_to_process = min(total_pages, max_pages) if max_pages is not None else total_pages
        
        print(f"Processing {pages_to_process} of {total_pages} pages")
        
        for page_num, page in enumerate(pdf.pages[:pages_to_process], 1):
            # Extract text and tables from the page
            text = page.extract_text()
            tables = extract_tables_from_page(page)
            
            # Remove footer (assuming footer is in last line)
            lines = text.split('\n')
            if len(lines) > 1:
                text = '\n'.join(lines[:-1])
            
            # Store page content
            all_content.append({
                'page_num': page_num,
                'text': text,
                'tables': tables
            })
    
    # Process content to identify sections and their tables
    return process_content_into_sections(all_content)

def extract_tables_from_page(page):
    """
    Extract tables from a page with improved cell structure detection.
    """
    try:
        # Try using built-in table extraction first
        tables = page.extract_tables()
        
        # If no tables found or tables appear malformed, try with explicit settings
        if not tables or any(len(row) == 0 for table in tables for row in table if table):
            # Try with explicit table settings
            tables = page.extract_tables(
                table_settings={
                    "vertical_strategy": "text", 
                    "horizontal_strategy": "text",
                    "keep_blank_chars": True,
                    "text_tolerance": 3,
                    "text_x_tolerance": 3,
                    "text_y_tolerance": 3,
                    "intersection_tolerance": 3
                }
            )
        
        return tables
    except Exception as e:
        print(f"Error extracting tables: {e}")
        return []

def process_content_into_sections(all_content):
    """
    Process extracted content to identify sections based on numbering pattern (#.#.#)*
    and associate them with tables.
    """
    sections = []
    # Enhanced section pattern to match numbering formats like (1.2.3) or (1.2) or (1)
    section_pattern = r'(\(\d+(?:\.\d+)*\))(.*?)(?=\(\d+(?:\.\d+)*\)|$)'
    
    # Concatenate all text first to properly detect sections that span multiple pages
    all_text = '\n'.join([page['text'] for page in all_content])
    
    # Find all section matches in the combined text
    section_matches = list(re.finditer(section_pattern, all_text, re.DOTALL))
    
    if not section_matches:
        # No sections found with the pattern, create a single section with all content
        return [{
            'description': 'Entire Document',
            'content': all_text,
            'tables': [process_table(table) for page in all_content for table in page['tables']]
        }]
    
    # Create sections based on matches
    for i, match in enumerate(section_matches):
        section_number = match.group(1).strip()
        section_desc = match.group(2).strip()
        
        # Determine section end position
        next_pos = match.end()
        if i < len(section_matches) - 1:
            next_pos = section_matches[i+1].start()
        
        # Create section
        section = {
            'number': section_number,
            'description': section_desc,
            'content': match.group(0).strip(),  # Full section text
            'tables': []
        }
        
        # Find start and end positions in the original text
        start_pos = match.start()
        end_pos = next_pos
        
        # Now iterate through pages to find tables that belong to this section
        current_pos = 0
        for page_content in all_content:
            page_text = page_content['text']
            page_tables = page_content['tables']
            
            # Calculate the position range for this page in the combined text
            page_start = current_pos
            page_end = current_pos + len(page_text)
            current_pos = page_end + 1  # +1 for the added newline when joining
            
            # Check if section overlaps with this page
            if not (end_pos < page_start or start_pos > page_end):
                # This page contains content for the current section
                for table in page_tables:
                    # Process and add table
                    processed_table = process_table(table)
                    if processed_table:  # Only add non-empty tables
                        section['tables'].append(processed_table)
        
        sections.append(section)
    
    return sections

def process_table(table):
    """
    Process and clean table data from pdfplumber.
    Returns a list of rows, where each row is a list of cell values.
    Handles multi-line text within cells.
    """
    if not table or len(table) == 0:
        return []
        
    processed_rows = []
    
    for row in table:
        # Clean row data
        processed_row = []
        for cell in row:
            if cell is None:
                processed_row.append("")
            else:
                # Replace newlines with spaces within cells
                cell_text = str(cell).strip().replace('\n', ' ')
                # Remove extra spaces
                cell_text = ' '.join(cell_text.split())
                processed_row.append(cell_text)
        
        # Skip completely empty rows
        if any(cell != "" for cell in processed_row):
            processed_rows.append(processed_row)
    
    return processed_rows

def save_output_to_json(sections, output_file):
    """Save the structured output to a JSON file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sections, f, indent=2, ensure_ascii=False)

def print_sections_summary(sections):
    """Print a summary of the extracted sections and tables."""
    print(f"Found {len(sections)} sections:")
    for i, section in enumerate(sections, 1):
        section_num = section.get('number', '')
        section_desc = section.get('description', '')
        section_desc_preview = section_desc[:50] + '...' if len(section_desc) > 50 else section_desc
        
        print(f"  {i}. {section_num} {section_desc_preview}")
        print(f"     Tables: {len(section['tables'])}")
        
        # Print table previews
        for j, table in enumerate(section['tables'], 1):
            if table and len(table) > 0:
                print(f"     Table {j}: {len(table)} rows x {len(table[0]) if table else 0} columns")
                if len(table) > 0:
                    first_row_preview = str(table[0])[:100] + '...' if len(str(table[0])) > 100 else table[0]
                    print(f"       First row: {first_row_preview}")

def debug_table_extraction(pdf_path, page_num, output_dir='debug_output'):
    """
    Debug function to visualize table extraction for a specific page.
    Creates visual output of how tables are being detected.
    """
    os.makedirs(output_dir, exist_ok=True)
    
    with pdfplumber.open(pdf_path) as pdf:
        if page_num < 1 or page_num > len(pdf.pages):
            print(f"Page number {page_num} out of range (1-{len(pdf.pages)})")
            return
        
        page = pdf.pages[page_num - 1]
        
        # Extract tables with default settings
        tables = page.extract_tables()
        
        # Create debug image
        img = page.to_image(resolution=150)
        img.draw_rects(page.extract_words())
        
        # Draw table boundaries
        for table in page.find_tables():
            rect = (table.bbox[0], table.bbox[1], table.bbox[2], table.bbox[3])
            img.draw_rect(rect, stroke='red', fill=None, stroke_width=3)
            
            # Draw cell boundaries
            for cell in table.cells:
                rect = (cell[0], cell[1], cell[2], cell[3])
                img.draw_rect(rect, stroke='blue', fill=None)
        
        # Save debug image
        output_path = os.path.join(output_dir, f"debug_page_{page_num}.png")
        img.save(output_path)
        print(f"Debug image saved to {output_path}")
        
        # Also save extracted text for comparison
        text_output_path = os.path.join(output_dir, f"text_page_{page_num}.txt")
        with open(text_output_path, 'w', encoding='utf-8') as f:
            f.write(page.extract_text())
        print(f"Page text saved to {text_output_path}")

def main():
    pdf_path = 'pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf'  # Replace with your PDF file path
    output_file = 'output_sections.json'
    max_pages = 10  # Process only the first 10 pages, set to None for all pages
    
    # Uncomment this line to debug table extraction on a specific page
    # debug_table_extraction(pdf_path, page_num=1)
    
    # Process the PDF
    sections = extract_pdf_sections(pdf_path, max_pages)
    save_output_to_json(sections, output_file)
    
    # Print summary to console
    print_sections_summary(sections)

if __name__ == "__main__":
    main()

  pdf_path = 'pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf'  # Replace with your PDF file path
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing 10 of 368 pages
Error extracting tables: TableSettings.__init__() got an unexpected keyword argument 'keep_blank_chars'
Error extracting tables: TableSettings.__init__() got an unexpected keyword argument 'keep_blank_chars'
Error extracting tables: TableSettings.__init__() got an unexpected keyword argument 'keep_blank_chars'
Error extracting tables: TableSettings.__init__() got an unexpected keyword argument 'keep_blank_chars'
Error extracting tables: TableSettings.__init__() got an unexpected keyword argument 'keep_blank_chars'
Error extracting tables: TableSettings.__init__() got an unexpected keyword argument 'keep_blank_chars'


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Found 124 sections:
  1. (1.1) In which language are you submitting your response...
     Tables: 0
  2. (1.2) Select the currency used for all financial informa...
     Tables: 0
  3. (1.3) Provide an overview and introduction to your organ...
     Tables: 0
  4. (1.4) State the end date of the year for which you are r...
     Tables: 0
  5. (1.5) Provide details on your reporting boundary. .........
     Tables: 0
  6. (1.6) Does your organization have an ISIN code or anothe...
     Tables: 0
  7. (1.7) Select the countries/areas in which you operate. ....
     Tables: 0
  8. (1.8) Are you able to provide geolocation data for your ...
     Tables: 0
  9. (1.8.1) Please provide all available geolocation data for ...
     Tables: 0
  10. (1.9) What was the size of your organization based on to...
     Tables: 0
  11. (1.10) Which activities does your organization undertake,...
     Tables: 0
  12. (1.11) Are greenhouse gas emissions and/or water-related ...
     Tables: 0
  13. (1.12) 