In [None]:
# Install required libraries
! pip install pypdfium2 PyMuPDF


In [None]:
import json
import os
import shutil
import sys
import tempfile
import urllib.parse
from datetime import datetime
from typing import List, Optional, Tuple

import fitz  # PyMuPDF
import pypdfium2 as pdfium
from snowflake.snowpark.context import get_active_session

# FLEXIBLE PARAMETER PARSING - HANDLES SINGLE FILE OR ALL FILES
# Parameters: input_stage (arg 0), output_stage (arg 1), file_name (arg 2+, optional)
# A file name containing spaces (e.g. 'akasha july 2025.pdf') arrives split over
# several arguments and is re-joined with single spaces below.
# When fewer than two arguments are present the defaults below are used.

# Defaults (used for testing/development when no arguments are supplied)
input_stage = "@ADVANCED_ANALYTICS.REDACT_PDF_DEMO.ORIG_PDFS"
output_stage = "@ADVANCED_ANALYTICS.REDACT_PDF_DEMO.REDACTED_PDFS"
specific_file = None  # None means process all files
process_all_files = True  # Default to processing all files

# ========================================
# FLEXIBLE PARAMETER PARSING
# ========================================
print("🔍 FLEXIBLE PARAMETER DEBUGGING & EXTRACTION:")
print(f"   Total sys.argv length: {len(sys.argv)}")
print(f"   Raw sys.argv: {sys.argv}")
print("   Individual arguments:")
for i, arg in enumerate(sys.argv):
    print(f"     sys.argv[{i}] = '{arg}'")

# Parse parameters based on count.
# NOTE(review): this reads the stages from sys.argv[0] and sys.argv[1], i.e. it
# assumes the runtime passes notebook arguments starting at index 0 - confirm.
if len(sys.argv) >= 2:  # At least input_stage and output_stage
    try:
        print("\n📋 Parsing parameters from sys.argv...")

        # First argument: input_stage
        input_stage = sys.argv[0]
        print(f"   ✅ Set input_stage from sys.argv[0] = '{input_stage}'")

        # Second argument: output_stage
        output_stage = sys.argv[1]
        print(f"   ✅ Set output_stage from sys.argv[1] = '{output_stage}'")

        # Third argument onwards: file_name (optional, may contain spaces)
        if len(sys.argv) >= 3:
            # Everything from index 2 onwards is part of the file name
            file_parts = sys.argv[2:]
            specific_file = " ".join(file_parts)
            process_all_files = False
            print(f"   ✅ Set specific_file from sys.argv[2:] = '{specific_file}'")
            print(f"   📝 File parts were: {file_parts}")
        else:
            process_all_files = True
            specific_file = None
            print("   📋 No specific file provided - will process ALL files in stage")

    except Exception as e:
        print(f"⚠️ Error parsing parameters, using defaults: {e}")
else:
    print(f"\n📋 Not enough parameters provided ({len(sys.argv)}), using defaults.")
    print("📋 Will process ALL files in default stage")

# Parameter validation and correction
print("\n🔧 Validating and fixing parameters...")
if not input_stage.startswith('@'):
    print(f"⚠️ WARNING: input_stage missing '@' prefix. Got: '{input_stage}'")
    input_stage = f"@{input_stage}"
    print(f"🔧 Fixed: input_stage = '{input_stage}'")

if not output_stage.startswith('@'):
    print(f"⚠️ WARNING: output_stage missing '@' prefix. Got: '{output_stage}'")
    output_stage = f"@{output_stage}"
    print(f"🔧 Fixed: output_stage = '{output_stage}'")

# Compare the extension case-insensitively so 'REPORT.PDF' is not flagged;
# this matches how get_all_pdf_files() selects PDFs from the stage listing.
if specific_file and not specific_file.lower().endswith('.pdf'):
    print(f"⚠️ WARNING: specific_file should be a PDF file. Got: '{specific_file}'")

print("\n🎯 FINAL PARAMETERS TO BE USED:")
print(f"   Input Stage: '{input_stage}'")
print(f"   Output Stage: '{output_stage}'")
if process_all_files:
    print("   Processing Mode: ALL FILES in input stage")
else:
    print("   Processing Mode: SINGLE FILE")
    print(f"   Specific File: '{specific_file}'")
    print(f"   File has spaces: {'yes' if ' ' in specific_file else 'no'}")
print("   " + "="*50)

# Get the active Snowflake session
session = get_active_session()

def setup_tracking_table():
    """
    Create the PII redaction tracking table if it doesn't exist.

    Returns:
        True when the CREATE TABLE IF NOT EXISTS statement ran without error,
        False otherwise (the error is printed, not raised).

    Note:
        Because of IF NOT EXISTS, a table created by an earlier version of this
        script keeps its existing column definitions.
    """
    try:
        print(f"📋 Setting up tracking table...")

        # PROCESSING_TIME_SECONDS carries an explicit scale: a bare NUMBER has
        # scale 0 and would drop the fractional seconds written by the logger.
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS PII_REDACTION_LOG (
            LOG_ID NUMBER AUTOINCREMENT,
            TIMESTAMP TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),
            INPUT_STAGE STRING,
            INPUT_FILE STRING,
            OUTPUT_STAGE STRING,
            STATUS STRING, -- 'SUBMITTED', 'COMPLETED', 'FAILED'
            ERROR_MESSAGE STRING,
            TEXT_LENGTH NUMBER,
            PII_FOUND_COUNT NUMBER,
            PII_ITEMS STRING, -- JSON array of PII items
            REDACTIONS_MADE NUMBER,
            OUTPUT_FILE STRING,
            PROCESSING_TIME_SECONDS NUMBER(38, 2),
            WARNINGS STRING, -- JSON array of warnings
            CONSTRAINT PK_PII_LOG PRIMARY KEY (LOG_ID)
        )
        """

        session.sql(create_table_sql).collect()
        print(f"✅ Tracking table PII_REDACTION_LOG ready")
        return True

    except Exception as e:
        print(f"⚠️ Failed to create tracking table: {e}")
        return False

def log_processing_start(input_stage: str, input_file: str, output_stage: str) -> Optional[int]:
    """
    Log the start of processing and return the log_id for updates.

    Inserts a 'SUBMITTED' row into PII_REDACTION_LOG and looks up its LOG_ID.

    Args:
        input_stage: Stage the source PDF is read from.
        input_file: Name of the PDF being processed.
        output_stage: Stage the redacted PDF will be written to.

    Returns:
        The LOG_ID of the inserted row, or None when logging failed (the error
        is printed, not raised, so processing can continue without tracking).
    """
    try:
        # Values are passed as bind parameters instead of being spliced into
        # the SQL text, so quotes and backslashes need no manual escaping.
        row_key = [input_stage, input_file, output_stage]

        session.sql(
            """
            INSERT INTO PII_REDACTION_LOG
            (INPUT_STAGE, INPUT_FILE, OUTPUT_STAGE, STATUS)
            VALUES (?, ?, ?, 'SUBMITTED')
            """,
            params=row_key,
        ).collect()

        # Restrict the lookup to rows for this exact file and stage pair so a
        # 'SUBMITTED' row belonging to another file is never picked up.
        # NOTE(review): two concurrent runs on the same file can still race here.
        result = session.sql(
            """
            SELECT MAX(LOG_ID) AS LOG_ID
            FROM PII_REDACTION_LOG
            WHERE STATUS = 'SUBMITTED'
              AND INPUT_STAGE = ?
              AND INPUT_FILE = ?
              AND OUTPUT_STAGE = ?
            """,
            params=row_key,
        ).collect()
        log_id = result[0][0] if result else None

        print(f"📝 Started tracking with LOG_ID: {log_id}")
        return log_id

    except Exception as e:
        print(f"⚠️ Failed to log processing start: {e}")
        return None

def log_processing_result(log_id: Optional[int], results: dict, processing_time: float):
    """
    Update the log with final processing results.

    Args:
        log_id: LOG_ID returned by log_processing_start(); when None the update
            is skipped.
        results: Result dictionary. 'success' is required; 'error',
            'pii_found', 'warnings', 'output_file', 'extracted_text_length' and
            'total_redactions' are optional and may be None.
        processing_time: Elapsed processing time in seconds.

    Failures are printed, never raised, so logging cannot break processing.
    """
    try:
        if log_id is None:
            print(f"⚠️ No log_id provided, skipping result logging")
            return

        status = 'COMPLETED' if results['success'] else 'FAILED'

        # `.get(key, default)` returns None when the key exists with a None
        # value (the initial state of 'error' and 'output_file'), so fall back
        # with `or` instead of relying on the default argument.
        pii_items = results.get('pii_found') or []
        warnings = results.get('warnings') or []

        # Bind parameters keep JSON backslash escapes and quotes intact; a
        # spliced single-quoted SQL literal would re-interpret them.
        update_params = [
            status,
            results.get('error') or '',
            results.get('extracted_text_length') or 0,
            len(pii_items),
            json.dumps(pii_items),
            results.get('total_redactions') or 0,
            results.get('output_file') or '',
            round(processing_time, 2),
            json.dumps(warnings),
            log_id,
        ]

        update_sql = """
        UPDATE PII_REDACTION_LOG
        SET
            STATUS = ?,
            ERROR_MESSAGE = ?,
            TEXT_LENGTH = ?,
            PII_FOUND_COUNT = ?,
            PII_ITEMS = ?,
            REDACTIONS_MADE = ?,
            OUTPUT_FILE = ?,
            PROCESSING_TIME_SECONDS = ?,
            WARNINGS = ?
        WHERE LOG_ID = ?
        """

        session.sql(update_sql, params=update_params).collect()
        print(f"📝 Updated tracking log {log_id} with status: {status}")

    except Exception as e:
        print(f"⚠️ Failed to log processing result: {e}")

def get_all_pdf_files(stage_path: str) -> List[str]:
    """
    Return the base names of every PDF listed in the given stage.

    Runs `LIST <stage_path>`, keeps the part of each listed path after the last
    '/', and retains names whose extension is '.pdf' in any letter case.
    Errors are printed and re-raised.
    """
    try:
        print(f"📁 Getting all PDF files from stage: {stage_path}")

        list_query = f"LIST {stage_path}"
        print(f"📝 LIST query: {list_query}")

        listing = session.sql(list_query).collect()
        print(f"📊 Found {len(listing)} total files in stage")

        # Column 0 of each LIST row is read as the file path; rsplit leaves a
        # path without '/' unchanged.
        base_names = (entry[0].rsplit('/', 1)[-1] for entry in listing)
        pdf_files = [name for name in base_names if name.lower().endswith('.pdf')]

        for name in pdf_files:
            print(f"   📄 Found PDF: '{name}'")

        print(f"✅ Total PDF files found: {len(pdf_files)}")
        return pdf_files

    except Exception as e:
        print(f"❌ Error getting PDF files from stage: {e}")
        raise

def check_file_exists(stage_path: str, file_name: str) -> bool:
    """
    Report whether `file_name` appears in the listing of `stage_path`.

    An exact name match is tried first, then a case-insensitive one. Any error
    while listing the stage is printed and reported as False.
    """
    try:
        print(f"🔍 Checking if file exists: {stage_path}/{file_name}")

        listing = session.sql(f"LIST {stage_path}").collect()

        # Column 0 of each LIST row is read as the file path; keep only the
        # part after the last '/'.
        staged_names = [entry[0].rsplit('/', 1)[-1] for entry in listing]

        file_exists = file_name in staged_names

        if not file_exists:
            # Fall back to comparing names with letter case ignored.
            wanted = file_name.lower()
            match = next((name for name in staged_names if name.lower() == wanted), None)
            if match is not None:
                print(f"📝 Found case-insensitive match: '{match}' vs '{file_name}'")
                file_exists = True

        print(f"✅ File '{file_name}' exists: {file_exists}")
        return file_exists

    except Exception as e:
        print(f"❌ Error checking file existence: {e}")
        return False

def extract_text_from_pdf(stage_path: str, file_name: str) -> str:
    """
    Extract text from a PDF using Snowflake Cortex parse_document.

    Args:
        stage_path: Stage reference; must start with '@'.
        file_name: Name of the PDF inside the stage; the '.pdf' extension is
            accepted in any letter case, matching get_all_pdf_files().

    Returns:
        The "content" field of the parse result with newlines replaced by
        spaces, or the literal "No content found." when that field is absent.

    Raises:
        ValueError: If `stage_path` or `file_name` fails validation.
        Exception: If the query returns no rows or a null result; errors from
            the query itself are printed and re-raised unchanged.
    """
    try:
        print(f"🔍 extract_text_from_pdf called with:")
        print(f"   stage_path: '{stage_path}'")
        print(f"   file_name: '{file_name}'")

        # Validate inputs before building query
        if not stage_path.startswith('@'):
            raise ValueError(f"Stage path must start with '@'. Got: '{stage_path}'")
        if not file_name.lower().endswith('.pdf'):
            raise ValueError(f"File name must end with '.pdf'. Got: '{file_name}'")

        # Collapse any run of leading '@' to exactly one for the stage reference
        stage_ref = "@" + stage_path.lstrip('@')

        # Double single quotes so the file name is a valid SQL string literal
        escaped_file_name = file_name.replace("'", "''")

        parse_query = f"""
            SELECT snowflake.cortex.parse_document(
                {stage_ref},
                '{escaped_file_name}'
            )
        """

        parsed_result = session.sql(parse_query).collect()

        if not parsed_result:
            raise Exception("No content extracted from PDF - empty result set")

        json_string = parsed_result[0][0]
        if not json_string:
            raise Exception("Parse result is empty or null")

        data = json.loads(json_string)
        content = data.get("content", "No content found.").replace('\n', ' ')
        print(f"✅ Successfully extracted {len(content)} characters")
        return content

    except Exception as e:
        print(f"❌ Failed to extract text from document: {e}")
        raise

def detect_pii(text_content: str) -> List[str]:
    """
    Detect PII elements in text using Snowflake Cortex AI.

    Args:
        text_content: Text extracted from the document.

    Returns:
        The non-empty string items of the model's 'pii_list', excluding the
        'No PII detected' sentinel the prompt asks for. Empty or
        whitespace-only input returns [] without calling the model.

    Raises:
        Exception: Query or JSON decoding errors are printed and re-raised.
    """
    try:
        if not text_content or not text_content.strip():
            print(f"⚠️ No text content to analyze for PII")
            return []

        print(f"🔍 Analyzing {len(text_content)} characters for PII...")

        prompt = f"""
            Extract all PII from the text. PII includes names, phone numbers, emails, addresses, and unique URLs.
            List each item. If none, state 'No PII detected'. Text: {text_content}
        """

        # The prompt contains untrusted document text, so it is passed as a
        # bind parameter; splicing it into a $$...$$ literal breaks the query
        # as soon as the text itself contains '$$'.
        pii_query = """
            SELECT AI_COMPLETE(
                model => 'claude-3-5-sonnet',
                prompt => ?,
                response_format => {
                    'type': 'json',
                    'schema': {
                        'type': 'object',
                        'properties': {
                            'pii_list': {
                                'type': 'array',
                                'items': {'type': 'string'}
                            }
                        }
                    }
                }
            ) as pii_results
        """
        pii_results_df = session.sql(pii_query, params=[prompt]).to_pandas()
        pii_output = pii_results_df['PII_RESULTS'][0]
        pii_data = json.loads(pii_output)

        # Drop blanks, non-strings and the "nothing found" sentinel so callers
        # do not try to redact the sentinel phrase itself.
        pii_list = [
            item
            for item in pii_data.get('pii_list', [])
            if isinstance(item, str)
            and item.strip()
            and item.strip().rstrip('.').lower() != 'no pii detected'
        ]
        print(f"✅ PII analysis complete: found {len(pii_list)} items")
        return pii_list
    except Exception as e:
        print(f"❌ Failed to analyze PII: {e}")
        raise

def redact_pii_from_pdf(pdf_path: str, pii_list: List[str]) -> Tuple[bytes, int]:
    """
    Redact PII phrases from a PDF and return the redacted document.

    Each page's words are read with their coordinates; every run of consecutive
    words that equals a PII item (case-insensitive, whitespace-normalised) is
    covered with black redaction annotations, which are then applied.

    Args:
        pdf_path: Local path of the PDF to open.
        pii_list: PII phrases to look for; blank entries are ignored.

    Returns:
        (pdf_bytes, total_redactions), where total_redactions counts matched
        phrases, not individual rectangles.

    Raises:
        Exception: Any error while opening or processing the PDF is printed and
            re-raised.
    """
    try:
        print(f"🔍 Opening PDF for redaction: {pdf_path}")
        doc = fitz.open(pdf_path)
        try:
            # Lower-case each item and split it into words once; a set removes
            # duplicates and splitting collapses repeated inner whitespace.
            phrases = {
                tuple(item.lower().split())
                for item in pii_list
                if item and item.strip()
            }

            if not phrases:
                print("⚠️ PII list is empty, no redactions to perform.")
                return doc.tobytes(), 0

            print(f"🔍 Searching for {len(phrases)} unique PII elements across {len(doc)} pages...")

            total_redactions = 0
            for page_num, page in enumerate(doc):
                # Entries are read as (x0, y0, x1, y1, text, block, line, word_no)
                words = page.get_text("words")
                if not words:
                    continue

                lowered = [word[4].lower() for word in words]
                redactions_on_page = 0

                for phrase in phrases:
                    span = len(phrase)
                    for start in range(len(words) - span + 1):
                        if tuple(lowered[start:start + span]) != phrase:
                            continue

                        # One rectangle per text line: a single union of the
                        # first and last word would also cover unrelated text
                        # when the phrase wraps onto the next line.
                        line_rects = {}
                        for word in words[start:start + span]:
                            line_key = (word[5], word[6])
                            rect = fitz.Rect(word[:4])
                            if line_key in line_rects:
                                rect = line_rects[line_key] | rect
                            line_rects[line_key] = rect

                        for rect in line_rects.values():
                            page.add_redact_annot(rect, fill=(0, 0, 0))
                        total_redactions += 1
                        redactions_on_page += 1

                if redactions_on_page > 0:
                    print(f"✅ Marked {redactions_on_page} redactions on page {page_num + 1}.")
                    # Apply redactions, which also removes the underlying text
                    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_PIXELS)

            pdf_bytes = doc.tobytes()
            print(f"✅ PDF redaction complete: {total_redactions} total redactions")
            return pdf_bytes, total_redactions
        finally:
            # Close the document on every exit path, including the early
            # return above and any exception.
            doc.close()

    except Exception as e:
        print(f"❌ Error during PDF redaction: {e}")
        raise

def safe_file_download(stage_path: str, file_name: str, temp_dir: str) -> str:
    """
    Download a staged file into `temp_dir`, trying several methods in turn.

    Methods, in order: session.file.get with the plain name, session.file.get
    with a URL-encoded name, and a SQL GET statement.

    Args:
        stage_path: Stage reference the file lives in.
        file_name: Name of the file inside the stage (may contain spaces).
        temp_dir: Existing local directory to download into.

    Returns:
        The local path of the downloaded file (`temp_dir`/`file_name`).

    Raises:
        Exception: If every method fails; the error is printed and re-raised.
    """
    try:
        print(f"📥 Starting safe file download for: '{file_name}'")

        local_file_path = os.path.join(temp_dir, file_name)

        # Method 1: Direct download
        try:
            session.file.get(f"{stage_path}/{file_name}", temp_dir)
            if os.path.exists(local_file_path):
                file_size = os.path.getsize(local_file_path)
                print(f"✅ File downloaded successfully: {file_size} bytes")
                return local_file_path
            print(f"⚠️ Standard download produced no file at: {local_file_path}")
        except Exception as e:
            print(f"⚠️ Standard download failed: {e}")

        # Method 2: Try with URL encoding
        try:
            encoded_file_name = urllib.parse.quote(file_name)
            session.file.get(f"{stage_path}/{encoded_file_name}", temp_dir)

            encoded_local_path = os.path.join(temp_dir, encoded_file_name)
            if os.path.exists(encoded_local_path):
                # Store the file under the requested (un-encoded) name
                os.rename(encoded_local_path, local_file_path)
                file_size = os.path.getsize(local_file_path)
                print(f"✅ File downloaded successfully (encoded): {file_size} bytes")
                return local_file_path
        except Exception as e:
            print(f"⚠️ Encoded download failed: {e}")

        # Method 3: SQL GET command
        try:
            # A path containing spaces must be quoted as a whole URI, so the
            # stage and file name go inside one pair of single quotes.
            remote_uri = f"{stage_path}/{file_name}".replace("'", "''")
            local_dir = temp_dir.replace(chr(92), '/').replace("'", "''")
            get_sql = f"GET '{remote_uri}' 'file://{local_dir}/'"

            session.sql(get_sql).collect()

            if os.path.exists(local_file_path):
                file_size = os.path.getsize(local_file_path)
                print(f"✅ File downloaded successfully (SQL GET): {file_size} bytes")
                return local_file_path
        except Exception as e:
            print(f"⚠️ SQL GET download failed: {e}")

        raise Exception(f"All download methods failed for file '{file_name}'")

    except Exception as e:
        print(f"❌ Safe file download failed: {e}")
        raise

def safe_file_upload(local_file_path: str, stage_path: str, target_filename: str) -> bool:
    """
    Upload a local file to a stage under the name `target_filename`.

    The upload commands name the staged file after the local file, so when the
    local base name differs from `target_filename` the file is first copied to
    a temporary directory under the target name and that copy is uploaded.
    Methods tried, in order: session.file.put, then a SQL PUT statement.

    Args:
        local_file_path: Path of the existing local file.
        stage_path: Destination stage reference.
        target_filename: Name the file must have in the stage.

    Returns:
        True when one of the upload methods succeeded.

    Raises:
        Exception: If the local file is missing or every method fails; the
            error is printed and re-raised.
    """
    try:
        print(f"📤 Starting safe file upload for: '{target_filename}'")

        if not os.path.exists(local_file_path):
            raise Exception(f"Local file not found: {local_file_path}")

        file_size = os.path.getsize(local_file_path)
        print(f"📊 Local file size: {file_size} bytes")

        with tempfile.TemporaryDirectory() as staging_dir:
            upload_path = local_file_path
            if os.path.basename(local_file_path) != target_filename:
                # Give the file its target name before uploading
                upload_path = os.path.join(staging_dir, target_filename)
                shutil.copyfile(local_file_path, upload_path)

            # Method 1: Standard upload
            try:
                session.file.put(upload_path, stage_path, auto_compress=False, overwrite=True)
                print(f"✅ File uploaded successfully")
                return True
            except Exception as e:
                print(f"⚠️ Standard upload failed: {e}")

            # Method 2: SQL PUT command
            try:
                # Quote the whole file URI so paths with spaces stay intact
                local_uri = "file://" + upload_path.replace(chr(92), '/').replace("'", "''")
                put_sql = f"""
                PUT '{local_uri}'
                {stage_path}
                AUTO_COMPRESS=FALSE
                OVERWRITE=TRUE
                """

                session.sql(put_sql).collect()
                print(f"✅ File uploaded successfully (SQL PUT)")
                return True
            except Exception as e:
                print(f"⚠️ SQL PUT upload failed: {e}")

        raise Exception(f"All upload methods failed for file '{target_filename}'")

    except Exception as e:
        print(f"❌ Safe file upload failed: {e}")
        raise

def process_single_pdf(input_stage: str, input_file: str, output_stage: str) -> dict:
    """
    Run the full redaction pipeline for one PDF and return a result dict.

    Steps: confirm the file is in the input stage, extract its text, detect
    PII, then either redact and upload the PDF or, when no PII was found,
    upload an unmodified copy. The run is recorded via log_processing_start()
    and log_processing_result().

    Returns:
        Dict with keys 'input_file', 'success', 'extracted_text_length',
        'pii_found', 'total_redactions', 'output_file', 'error', 'warnings'.
        Failures are reported through 'error' with 'success' left False; this
        function does not raise for processing errors.
    """
    started_at = datetime.now()

    results = dict(
        input_file=input_file,
        success=False,
        extracted_text_length=0,
        pii_found=[],
        total_redactions=0,
        output_file=None,
        error=None,
        warnings=[],
    )

    # Record the run before doing any work
    log_id = log_processing_start(input_stage, input_file, output_stage)

    try:
        print(f"🚀 Processing single file: '{input_file}'")

        # Step 0: the input file must be present in the stage
        if not check_file_exists(input_stage, input_file):
            missing_msg = f"Input file '{input_file}' not found in stage '{input_stage}'"
            print(f"❌ {missing_msg}")
            results['error'] = missing_msg
            return results

        # Step 1: text extraction
        print(f"\n📝 Step 1: Extracting text from PDF...")
        text = extract_text_from_pdf(input_stage, input_file)
        char_count = len(text)
        results['extracted_text_length'] = char_count
        print(f"✅ Extracted {char_count} characters of text")

        if char_count < 10:
            note = f"Very little text extracted ({char_count} chars). File might be image-based or corrupted."
            print(f"⚠️ {note}")
            results['warnings'].append(note)

        # Step 2: PII detection
        print(f"\n🔍 Step 2: Detecting PII elements...")
        detected = detect_pii(text)
        results['pii_found'] = detected
        print(f"✅ Found {len(detected)} PII elements")

        if detected:
            # Step 3: download, redact, write locally, upload
            print(f"\n🖤 Step 3: Redacting PII from PDF...")
            with tempfile.TemporaryDirectory() as workdir:
                try:
                    source_path = safe_file_download(input_stage, input_file, workdir)

                    redacted_bytes, redaction_count = redact_pii_from_pdf(source_path, detected)
                    results['total_redactions'] = redaction_count

                    redacted_name = f"redacted_{input_file}"
                    redacted_path = os.path.join(workdir, redacted_name)
                    with open(redacted_path, 'wb') as handle:
                        handle.write(redacted_bytes)

                    safe_file_upload(redacted_path, output_stage, redacted_name)
                    results['output_file'] = redacted_name

                except Exception as exc:
                    failure_msg = f"Failed during redaction process: {str(exc)}"
                    print(f"❌ {failure_msg}")
                    results['error'] = failure_msg
                    return results

            print(f"\n✅ Successfully applied {redaction_count} redactions")
            results['success'] = True
        else:
            # Nothing to redact: pass the original file through to the output
            print(f"\nℹ️ No PII found - copying original file to output stage")
            with tempfile.TemporaryDirectory() as workdir:
                try:
                    source_path = safe_file_download(input_stage, input_file, workdir)
                    copy_name = f"redacted_{input_file}"
                    safe_file_upload(source_path, output_stage, copy_name)
                    results['output_file'] = copy_name
                    results['success'] = True
                except Exception as exc:
                    failure_msg = f"Failed to copy file (no PII case): {str(exc)}"
                    print(f"❌ {failure_msg}")
                    results['error'] = failure_msg

    except Exception as exc:
        unexpected_msg = f"Unexpected error processing '{input_file}': {str(exc)}"
        print(f"❌ {unexpected_msg}")
        results['error'] = unexpected_msg

    finally:
        # Runs on every exit path above, including the early returns
        elapsed_seconds = (datetime.now() - started_at).total_seconds()
        log_processing_result(log_id, results, elapsed_seconds)

    return results

def process_all_pdfs(input_stage: str, output_stage: str) -> dict:
    """
    Run process_single_pdf() over every PDF found in the input stage.

    Returns:
        Summary dict with 'total_files', 'successful', 'failed',
        'files_processed' (one entry per file that returned a result) and
        'errors' (human-readable failure messages). Errors are collected in
        the summary rather than raised.
    """
    batch_started = datetime.now()

    summary = dict(
        total_files=0,
        successful=0,
        failed=0,
        files_processed=[],
        errors=[],
    )

    try:
        print(f"🚀 Starting bulk processing of all PDFs in stage: {input_stage}")

        pdf_files = get_all_pdf_files(input_stage)
        file_count = len(pdf_files)
        summary['total_files'] = file_count

        if not pdf_files:
            print(f"⚠️ No PDF files found in stage: {input_stage}")
            return summary

        print(f"📋 Found {file_count} PDF files to process")

        for position, pdf_file in enumerate(pdf_files, start=1):
            print(f"\n" + "="*60)
            print(f"📄 Processing file {position}/{file_count}: '{pdf_file}'")
            print("="*60)

            try:
                outcome = process_single_pdf(input_stage, pdf_file, output_stage)

                if not outcome['success']:
                    summary['failed'] += 1
                    summary['errors'].append(
                        f"Failed to process '{pdf_file}': {outcome.get('error', 'Unknown error')}"
                    )
                    print(f"❌ Failed to process: {pdf_file}")
                else:
                    summary['successful'] += 1
                    print(f"✅ Successfully processed: {pdf_file}")

                summary['files_processed'].append({
                    'file': pdf_file,
                    'success': outcome['success'],
                    'pii_found': len(outcome['pii_found']),
                    'redactions': outcome['total_redactions'],
                    'output_file': outcome.get('output_file'),
                    'error': outcome.get('error')
                })

            except Exception as exc:
                # Keep going with the remaining files
                summary['failed'] += 1
                summary['errors'].append(f"Exception processing '{pdf_file}': {str(exc)}")
                print(f"❌ Exception processing {pdf_file}: {exc}")

        # Final report (total_files is non-zero here because of the early return)
        elapsed_seconds = (datetime.now() - batch_started).total_seconds()
        print(f"\n" + "="*60)
        print(f"📊 BULK PROCESSING SUMMARY")
        print("="*60)
        print(f"Total files found: {summary['total_files']}")
        print(f"Successfully processed: {summary['successful']}")
        print(f"Failed: {summary['failed']}")
        print(f"Success rate: {(summary['successful'] / summary['total_files'] * 100):.1f}%")
        print(f"Total processing time: {elapsed_seconds:.2f} seconds")

        if summary['errors']:
            print(f"\n❌ Errors encountered:")
            for message in summary['errors']:
                print(f"   - {message}")

    except Exception as exc:
        bulk_failure = f"Failed during bulk processing: {str(exc)}"
        print(f"❌ {bulk_failure}")
        summary['errors'].append(bulk_failure)

    return summary

# ========================================
# MAIN EXECUTION LOGIC
# ========================================

# Ensure the tracking table exists before any file is processed
setup_tracking_table()

print("\n" + "="*50)
print("🔒 FLEXIBLE PII REDACTION BATCH PROCESSOR")
print("="*50)

# Dispatch on the mode chosen during parameter parsing
if not process_all_files:
    print("🚀 MODE: Processing SINGLE file")
    print(f"📂 Input Stage: {input_stage}")
    print(f"📄 File: {specific_file}")
    print(f"📂 Output Stage: {output_stage}")

    result = process_single_pdf(input_stage, specific_file, output_stage)

    # Detailed report for the single file
    print("\n" + "="*50)
    print("📊 SINGLE FILE PROCESSING RESULTS")
    print("="*50)
    print(f"Input File: '{result['input_file']}'")
    print(f"Success: {result['success']}")
    print(f"Text Length: {result['extracted_text_length']} characters")
    print(f"PII Found: {len(result['pii_found'])} items")
    if result['pii_found']:
        for i, pii in enumerate(result['pii_found'], start=1):
            print(f"  {i}. {pii}")
    print(f"Total Redactions: {result['total_redactions']}")
    print(f"Output File: {result['output_file']}")

    if result['error']:
        print(f"\n🚨 ERROR: {result['error']}")

    if result.get('warnings'):
        print("\n⚠️ WARNINGS:")
        for warning in result['warnings']:
            print(f"   - {warning}")

    if not result['success']:
        print("\n❌ PROCESSING FAILED")
    else:
        print("\n🎉 PROCESSING COMPLETED SUCCESSFULLY!")

else:
    print("🚀 MODE: Processing ALL PDF files in stage")
    print(f"📂 Input Stage: {input_stage}")
    print(f"📂 Output Stage: {output_stage}")

    summary = process_all_pdfs(input_stage, output_stage)

    print("\n🎉 BULK PROCESSING COMPLETED!")
    print(f"📊 Summary: {summary['successful']}/{summary['total_files']} files processed successfully")

# Point the reader at the tracking table for per-file details
print("\n📊 Check PII_REDACTION_LOG table for detailed tracking:")
print("📝 Query: SELECT * FROM PII_REDACTION_LOG ORDER BY TIMESTAMP DESC LIMIT 10;")
print("="*50)
