In [6]:
from pdf_utils import *
from pathlib import Path

# =============================================================================
# PDF TEXT CLEANING DEMONSTRATION
# =============================================================================

# Define PDF file paths for testing
pdf_files = [
    "datafile/codice etico fittizio_Salute e sicurezza dei lavoratori.pdf",
    "datafile/BIS - Regolamento Aziendale.pdf",
    "datafile/ccnl_commercio.pdf",
]

# Preview settings for the before/after comparison printed for each file.
PREVIEW_PAGES = 8    # maximum number of leading pages shown per file
PREVIEW_CHARS = 100  # characters shown from each end of a page; must be > 0,
                     # because text[-0:] would return the whole page

print("PDF TEXT CLEANING DEMONSTRATION")
print("=" * 60)

# =============================================================================
# PROCESS EACH PDF FILE
# =============================================================================

for file_idx, pdf_path in enumerate(pdf_files, 1):
    pdf_name = Path(pdf_path).name
    print(f"\nProcessing file {file_idx}/{len(pdf_files)}: {pdf_name}")
    print("-" * 60)

    # Extract pages from current PDF
    pages = extract_pages(pdf_path)
    print(f"Extracted {len(pages)} pages from PDF")

    # Skip processing if no pages found
    if not pages:
        print("No pages found in PDF, skipping...")
        continue

    # =========================================================================
    # HEADER REMOVAL PROCESS
    # =========================================================================

    # Identify and remove common headers
    clean_headers, common_header_lines = remove_header(pages)
    print("\nHeader analysis results:")
    print(f"Common header lines identified: {len(common_header_lines)}")

    if common_header_lines:
        print("Header lines found:")
        # Sorted so the listing order is stable from run to run.
        for i, line in enumerate(sorted(common_header_lines), 1):
            print(f"  {i}. '{line}'")

    # =========================================================================
    # WHITESPACE CLEANING
    # =========================================================================

    # Clean excessive whitespace and newlines
    final_clean = [clean_whitespace(page) for page in clean_headers]

    # =========================================================================
    # CLEANING STATISTICS
    # =========================================================================

    # Calculate improvement statistics
    original_total_chars = sum(len(page) for page in pages)
    cleaned_total_chars = sum(len(page) for page in final_clean)

    # Pages can exist yet all be empty (no extractable text), in which case the
    # original total is 0. Report 0% instead of raising ZeroDivisionError and
    # aborting the remaining files.
    if original_total_chars:
        removed_chars = original_total_chars - cleaned_total_chars
        reduction_percent = removed_chars / original_total_chars * 100
    else:
        reduction_percent = 0.0

    print("\nCleaning statistics:")
    print(f"Original total characters: {original_total_chars:,}")
    print(f"Cleaned total characters: {cleaned_total_chars:,}")
    print(f"Character reduction: {reduction_percent:.1f}%")

    # =========================================================================
    # BEFORE/AFTER COMPARISON FOR FIRST PREVIEW_PAGES PAGES
    # =========================================================================

    # Determine how many pages to show (max PREVIEW_PAGES or all available)
    pages_to_show = min(PREVIEW_PAGES, len(pages))
    print(
        f"\nBefore/After comparison (first and last {PREVIEW_CHARS} chars "
        f"of first {pages_to_show} pages):"
    )

    # Each entry pairs a label with the slice taken from a page. A negative
    # slice already clamps to the start of the string, so pages shorter than
    # PREVIEW_CHARS are shown whole without a separate length check.
    page_edges = (
        ("First", lambda text: text[:PREVIEW_CHARS]),
        ("Last", lambda text: text[-PREVIEW_CHARS:]),
    )
    page_versions = (("BEFORE", pages), ("AFTER", final_clean))

    # Output order: BEFORE/First, AFTER/First, BEFORE/Last, AFTER/Last.
    for edge_label, take_edge in page_edges:
        for version_label, page_texts in page_versions:
            print(f"\n{version_label} - {edge_label} {PREVIEW_CHARS} characters:")
            for page_idx in range(pages_to_show):
                snippet = take_edge(page_texts[page_idx])
                # repr() keeps newlines and trailing spaces visible.
                print(f"  Page {page_idx + 1}: {snippet!r}")

    print(f"\nProcessing completed for {pdf_name}")

print("\n" + "=" * 60)
print("All PDF files processed successfully")

PDF TEXT CLEANING DEMONSTRATION

Processing file 1/3: codice etico fittizio_Salute e sicurezza dei lavoratori.pdf
------------------------------------------------------------
Extracted 1 pages from PDF

Header analysis results:
Common header lines identified: 0

Cleaning statistics:
Original total characters: 1,946
Cleaned total characters: 1,905
Character reduction: 2.1%

Before/After comparison (first and last 100 chars of first 1 pages):

BEFORE - First 100 characters:
  Page 1: 'Salute e sicurezza sui luoghi di lavoro \n• Diritti del lavoratore \n \nL’Azienda si pone come obiettiv'

AFTER - First 100 characters:
  Page 1: 'Salute e sicurezza sui luoghi di lavoro\n• Diritti del lavoratore\nL’Azienda si pone come obiettivo pr'

BEFORE - Last 100 characters:
  Page 1: '\no quella dei colleghi. Evitando ad esempio di manomettere o rimuovere i dispositivi di \nsicurezza. '

AFTER - Last 100 characters:
  Page 1: 'za\no quella dei colleghi. Evitando ad esempio di manomettere o rimuovere 