In [1]:
# This code is for the Tax Return documents only
# Cell 1: Imports and Configuration
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pdf2image
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import pandas as pd
import os
import sys
import numpy as np

# === CONFIGURATION ===
if sys.platform == "win32":
    poppler_path = r"C:\Program Files\poppler\Library\bin"  # <-- UPDATE THIS PATH
    if os.path.exists(poppler_path):
        os.environ["PATH"] += os.pathsep + poppler_path

In [2]:
# Cell 2: Data Classes
@dataclass
class Addback:
    """Represents a single addback item"""
    line_item: str
    amount: float
    reason: str
    calculation: str = ""
    source: str = "Tax Return"
    confidence: float = 1.0  # 0-1 confidence score

In [3]:
# Cell 3: Complete ProductionAddbackAnalyzer Class
class ProductionAddbackAnalyzer:
    """Production-ready analyzer that works automatically without manual intervention"""
    
    def __init__(self):
        self.market_rate_salary = 195700
        
        # Enhanced patterns with multiple variations for better OCR matching
        self.extraction_patterns = {
            "depreciation": {
                "primary_patterns": [
                    # EXACT text from the form
                    r"MACRS deductions for assets placed in service.*?(\d{1,3}[,.]?\d{3})",
                    r"MACRS.*beginning before.*?(\d{1,3}[,.]?\d{3})",
                    # Also try just the number pattern near MACRS
                    r"MACRS.*?(\d{1,3}[,.]?\d{3})",
                    # Line 17 reference
                    r"(?:line\s*)?17[^\d]*?(\d{1,3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    # Look for 1,373 specifically
                    r"1[,.]?373",
                    # Depreciation but NOT total
                    r"(?<!total).*depreciation.*?(\d{1,3}[,.]?\d{3})"
                ],
                "exclusion_patterns": [
                    r"total.*depreciation",
                    r"line\s*21",
                    r"line\s*22"
                ],
                "form_line": "Form 4562 Line 17 (MACRS)"
            },
            "officer_compensation": {
                "primary_patterns": [
                    r"HARVEY.*SEYBOLD.*?[\$\s]*(\d{3}[,.]?\d{3})",
                    r"compensation.*officers.*?[\$\s]*(\d{3}[,.]?\d{3})",
                    r"Form\s+1125-E.*line\s+[24].*?[\$\s]*(\d{3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    r"200[,.]?115"  # Look for the specific amount
                ],
                "form_line": "Form 1125-E Line 2"
            },
            # In your __init__ method, update the automobile patterns:
            "automobile": {
                "primary_patterns": [
                    # More flexible patterns
                    r"automobile\s+and\s+truck\s+expense[s]?[:\s]*\$?\s*([0-9]{1,2}[,.]?[0-9]{3})",
                    r"automobile[^0-9]{0,30}([0-9]{1,2}[,.]?[0-9]{3})",
                    r"auto.*?truck.*?([0-9]{1,2}[,.]?[0-9]{3})",
                    # Look for the amount even if OCR mangles the text
                    r"(?:auto|truck|vehicle).*?expenses?.*?([0-9]{1,2}[,.]?[0-9]{3})",
                ],
                "context_patterns": [
                    # Just find 5-digit numbers near automobile text
                    r"([0-9]{1,2}[,.]?[0-9]{3})(?=[^0-9]|$)",
                ],
                "form_line": "Other Deductions - Schedule K"
            },
            "meals": {
                "primary_patterns": [
                    r"meals.*\(?50%\)?.*?[\$\s]*(\d{1,3})",
                    r"MEALS.*?[\$\s]*(\d{1,3})",
                    r"meals.*entertainment.*?[\$\s]*(\d{1,3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    r"meal[^\d]*(\d{1,3})"
                ],
                "form_line": "Other Deductions"
            },
            "ordinary_income": {
                "primary_patterns": [
                    # Look for the specific line 22 calculation description
                    r"subtract\s+line\s+21\s+from\s+line\s+6[:\s]*\$?\s*([-]?\d{1,3}(?:[,.]?\d{3})*)",
                    # Line 22 with amount on same line
                    r"(?:line\s*)?22[.\s]+ordinary\s+business\s+income.*?[\$\s]+([-]?\d{1,3}(?:[,.]?\d{3})*)",
                    # Sometimes just after "22" with various separators
                    r"^\s*22\s*[.\s]+.*?[\$\s]+([-]?\d{1,3}(?:[,.]?\d{3})*)",
                    # Look for ordinary business income/loss with amount nearby
                    r"ordinary\s+business\s+income\s*\(?loss\)?.*?[\$\s]+([-]?\d{1,3}(?:[,.]?\d{3})*)",
                ],
                "context_patterns": [
                    # Only look for numbers very close to line 22
                    r"(?:line\s*)?22\b[^0-9]{0,20}([-]?\d{1,3}(?:[,.]?\d{3})*)",
                    # Look for parentheses ONLY near ordinary income text
                    r"ordinary.*?\((\d{1,3}(?:[,.]?\d{3})*)\)",
                ],
                "exclusion_patterns": [
                    r"line\s*23",  # Don't pick up next line
                    r"deduction",  # Avoid deduction lines
                    r"credit",     # Avoid credit lines
                ],
                "form_line": "Form 1120-S Line 22"
            },
            "charitable": {
                "primary_patterns": [
                    r"charitable.*contrib.*?[\$\s]*(\d{1,3}[,.]?\d{3})",
                    r"CHARITABLE.*?[\$\s]*(\d{1,3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    r"contribution[^\d]*(\d{1,3}[,.]?\d{3})"
                ],
                "form_line": "M-2 Line 5"
            },
            "section179": {
                "primary_patterns": [
                    r"section\s+179\s+expense.*?[\$\s]*(\d{2,3}[,.]?\d{3})",
                    r"SEC.*179.*?[\$\s]*(\d{2,3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    r"179[^\d]*(\d{2,3}[,.]?\d{3})"
                ],
                "form_line": "M-2 Line 5"
            }
        }
        
        self.addback_rules = {
            "officer_compensation": {
                "calc_type": "excess_over_market",
                "reason": "Owner salary exceeds market rate. Excess amount is added back to normalize earnings."
            },
            "depreciation": {
                "calc_type": "full_amount",
                "reason": "Non-cash expense. Added back for EBITDA calculation as it doesn't affect cash flow."
            },
            "automobile": {
                "calc_type": "percentage",
                "percentage": 0.25,
                "reason": "25% assumed personal use. This portion is discretionary/non-business expense."
            },
            "meals": {
                "calc_type": "full_amount",
                "reason": "Non-deductible portion (50%) represents discretionary spending."
            },
            "charitable": {
                "calc_type": "full_amount",
                "reason": "Non-business expense. Charitable giving is discretionary."
            },
            "ordinary_income": {
                "calc_type": "negative_only",  # Only add back if it's a loss (negative)
                "reason": "Business loss added back to normalize earnings. Represents the core operating loss before adjustments."
            },
            "section179": {
                "calc_type": "full_amount",
                "reason": "Accelerated depreciation election. Added back as it's a non-cash tax benefit."
            }
        }
    
    def preprocess_image_advanced(self, image):
        """Advanced image preprocessing for better OCR - especially numbers"""
        # Convert to numpy array for advanced processing
        img_array = np.array(image)
        
        # Convert to grayscale if not already
        if len(img_array.shape) == 3:
            gray = np.dot(img_array[...,:3], [0.2989, 0.5870, 0.1140])
        else:
            gray = img_array
        
        # Import cv2 for better preprocessing
        try:
            import cv2
            
            # Convert to uint8
            gray_uint8 = gray.astype(np.uint8)
            
            # 1. Denoise while preserving edges (good for numbers)
            denoised = cv2.bilateralFilter(gray_uint8, 9, 75, 75)
            
            # 2. Apply adaptive thresholding (better for varying lighting)
            thresh = cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                         cv2.THRESH_BINARY, 11, 2)
            
            # 3. Morphological operations to connect broken digits
            kernel = np.ones((2,1), np.uint8)  # Vertical kernel to connect digit parts
            connected = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
            
            # 4. Remove small noise
            kernel_noise = np.ones((2,2), np.uint8)
            cleaned = cv2.morphologyEx(connected, cv2.MORPH_OPEN, kernel_noise)
            
            # Convert back to PIL
            pil_img = Image.fromarray(cleaned)
            
        except ImportError:
            # Fallback if cv2 not available
            print("Warning: cv2 not available. Using basic preprocessing.")
            pil_img = Image.fromarray(gray.astype(np.uint8))
            
            # Basic enhancement
            enhancer = ImageEnhance.Contrast(pil_img)
            pil_img = enhancer.enhance(2.5)
            pil_img = pil_img.filter(ImageFilter.SHARPEN)
            pil_img = pil_img.point(lambda x: 0 if x < 128 else 255, '1')
        
        return pil_img

    def extract_at_multiple_resolutions(self, image, pattern_dict):
        """Try OCR at different resolutions - helps with number recognition"""
        findings = []
        
        # Try different DPIs/scales
        for scale in [1.0, 1.5, 2.0, 2.5]:
            if scale != 1.0:
                new_size = (int(image.width * scale), int(image.height * scale))
                scaled_img = image.resize(new_size, Image.Resampling.LANCZOS)
            else:
                scaled_img = image
            
            # Preprocess the scaled image
            processed = self.preprocess_image_advanced(scaled_img)
            
            # Try OCR with different configs
            for config in ['--psm 6', '--psm 4 -c tessedit_char_whitelist=0123456789,$,. ']:
                try:
                    text = pytesseract.image_to_string(processed, config=config)
                    
                    # Extract numbers near "automobile"
                    if 'automobile' in text.lower():
                        # Look for numbers within 50 characters of 'automobile'
                        auto_match = re.search(r'automobile[^0-9]{0,50}(\d{1,2}[,.]?\d{3})', 
                                             text, re.IGNORECASE)
                        if auto_match:
                            value = auto_match.group(1).replace(',', '').replace('.', '')
                            try:
                                findings.append((float(value), 0.7))
                            except:
                                pass
                except:
                    continue
        
        return findings

    def extract_line_22_specifically(self, image):
        """Target extraction for Form 1120S Line 22"""
        try:
            # Use targeted OCR with different configs
            text = pytesseract.image_to_string(image, config='--psm 6')
            
            # Look for line 22 pattern more specifically
            # Pattern: "22" followed by description, then amount
            line_22_pattern = r'22\s*[.\s]*(?:ordinary\s+business\s+income)?.*?(\d{1,2}[,.]?\d{3})'
            
            matches = re.finditer(line_22_pattern, text, re.IGNORECASE | re.MULTILINE)
            
            findings = []
            for match in matches:
                value_str = match.group(1).replace(',', '').replace('.', '')
                try:
                    value = float(value_str)
                    # Sanity check - ordinary income/loss typically between -100k and 500k for small business
                    if -100000 <= value <= 500000:
                        findings.append((value, 0.8))
                except:
                    pass
                    
            return findings
        except:
            return []
    
    def extract_with_validation(self, pdf_path: str) -> Dict[str, List[Tuple[float, float]]]:
        """Extract values with confidence scores"""
        try:
            if sys.platform == "win32" and 'poppler_path' in globals():
                images = pdf2image.convert_from_path(pdf_path, dpi=400, poppler_path=poppler_path)
            else:
                images = pdf2image.convert_from_path(pdf_path, dpi=400)
            
            all_findings = {key: [] for key in self.extraction_patterns.keys()}
            
            for page_num, image in enumerate(images):
                print(f"Processing page {page_num + 1}...")
                # Try multiple preprocessing approaches
                preprocessing_methods = [
                    ("original", image),
                    ("enhanced", self.preprocess_image_advanced(image)),
                    ("high_contrast", ImageEnhance.Contrast(image.convert('L')).enhance(3.0)),
                    ("sharpened", image.filter(ImageFilter.SHARPEN).filter(ImageFilter.SHARPEN))
                ]
                # Add special handling for automobile if it's in our missing items
                if page_num >= 3:  # Other Deductions usually on later pages
                    # Try targeted extraction for automobile expense
                    auto_findings = self.extract_at_multiple_resolutions(image, 
                                                                       self.extraction_patterns.get("automobile", {}))
                    if auto_findings:
                        all_findings["automobile"].extend(auto_findings)
                if page_num <= 2:  # Form 1120S main pages (first few pages)
                    # Try targeted extraction for line 22
                    line_22_findings = self.extract_line_22_specifically(image)
                    if line_22_findings:
                        all_findings["ordinary_income"].extend(line_22_findings)
                
                for method_name, processed_img in preprocessing_methods:
                    # Use multiple OCR passes with different settings
                    for psm in [6, 4, 11]:  # Different page segmentation modes
                        try:
                            text = pytesseract.image_to_string(
                                processed_img, 
                                config=f'--psm {psm} --oem 3'
                            )
                            
                            # Extract values for each category
                            for category, patterns in self.extraction_patterns.items():
                                # Check if this text contains exclusion patterns
                                should_skip = False
                                for exclusion in patterns.get("exclusion_patterns", []):
                                    if re.search(exclusion, text, re.IGNORECASE):
                                        should_skip = True
                                        break
                                
                                if should_skip:  # Changed: removed "and category == 'depreciation'"
                                    continue  # Skip this text block for this category
                                
                                # Try primary patterns first (higher confidence)
                                for pattern in patterns["primary_patterns"]:
                                    matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
                                    for match in matches:
                                        value_str = match.group(1).replace(',', '').replace('.', '')
                                        try:
                                            # Special handling for ordinary income - check for parentheses indicating negative
                                            if category == "ordinary_income":
                                                # Check if the original match or surrounding text has parentheses
                                                full_match_text = match.group(0)
                                                if '(' in full_match_text and ')' in full_match_text:
                                                    # This is a negative number
                                                    value = -abs(float(value_str))
                                                else:
                                                    # Also check if the value_str itself starts with a minus
                                                    if value_str.startswith('-'):
                                                        value = float(value_str)
                                                    else:
                                                        value = float(value_str)
                                            else:
                                                value = float(value_str)
                                            # High confidence for primary patterns
                                            all_findings[category].append((value, 0.9))
                                        except:
                                            continue
                                
                                # Try context patterns (lower confidence)
                                for pattern in patterns.get("context_patterns", []):
                                    matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
                                    for match in matches:
                                        # Handle patterns that might just find the number itself
                                        if match.groups():
                                            value_str = match.group(1).replace(',', '').replace('.', '')
                                        else:
                                            value_str = match.group(0).replace(',', '').replace('.', '')
                                        try:
                                            # Special handling for ordinary income - check for parentheses indicating negative
                                            if category == "ordinary_income":
                                                # For context patterns, we might be matching just the parentheses pattern
                                                full_match_text = match.group(0)
                                                if '(' in full_match_text and ')' in full_match_text:
                                                    # This is a negative number
                                                    value = -abs(float(value_str))
                                                else:
                                                    # Also check if the value_str itself starts with a minus
                                                    if value_str.startswith('-'):
                                                        value = float(value_str)
                                                    else:
                                                        value = float(value_str)
                                            else:
                                                value = float(value_str)
                                            # Lower confidence for context patterns
                                            all_findings[category].append((value, 0.6))
                                        except:
                                            continue
                        except:
                            continue
            
            return all_findings
            
        except Exception as e:
            print(f"Error in extraction: {e}")
            return {}
    
    def select_best_value(self, findings: List[Tuple[float, float]], category: str = None) -> Optional[float]:
        """Select the most likely correct value from multiple findings"""
        if not findings:
            return None
        
        # Debug print for depreciation
        if category == "depreciation":
            print(f"\nDEBUG - All depreciation values found:")
            for value, confidence in sorted(findings):
                print(f"  ${value:,.0f} (confidence: {confidence})")
        
        # Special handling for depreciation - prefer smaller values (likely MACRS not total)
        if category == "depreciation":
            # Filter out values that are likely totals (too large)
            reasonable_findings = [(v, c) for v, c in findings if v < 3000]
            if reasonable_findings:
                print(f"  After filtering < $3,000: {len(reasonable_findings)} values remain")
                findings = reasonable_findings
            
            # Prefer values around 1,000-2,000 for MACRS
            macrs_range_findings = [(v, c) for v, c in findings if 1000 <= v <= 2000]
            if macrs_range_findings:
                print(f"  Values in MACRS range ($1,000-$2,000): {len(macrs_range_findings)}")
                findings = macrs_range_findings
        
        # For automobile, debug what we're seeing
        if category == "automobile":
            print(f"\nDEBUG - Automobile search found {len(findings)} candidates")
            if not findings:
                print("  No automobile amounts found - check if OCR is reading the text correctly")

        # Special handling for ordinary income
        if category == "ordinary_income":
            print(f"\nDEBUG - Ordinary income/loss values found:")
            for value, confidence in sorted(findings):
                print(f"  ${value:,.0f} (confidence: {confidence})")
            
            # Filter out very small values that are likely noise
            significant_findings = [(v, c) for v, c in findings if abs(v) >= 100]
            if significant_findings:
                print(f"  After filtering >= $100: {len(significant_findings)} values remain")
                findings = significant_findings
            
            # Prefer values in reasonable range for business income
            business_range_findings = [(v, c) for v, c in findings if 1000 <= abs(v) <= 1000000]
            if business_range_findings:
                print(f"  Values in business income range: {len(business_range_findings)}")
                findings = business_range_findings
        
        # Group similar values (within 10%)
        value_groups = []
        for value, confidence in findings:
            found_group = False
            for group in value_groups:
                if group:  # Check group is not empty
                    group_median = np.median([v for v, c in group])
                    if abs(value - group_median) / max(group_median, 1) < 0.1:  # Within 10%
                        group.append((value, confidence))
                        found_group = True
                        break
            if not found_group:
                value_groups.append([(value, confidence)])
        
        # Select group with highest total confidence
        if not value_groups:
            return None
            
        best_group = max(value_groups, key=lambda g: sum(c for v, c in g))
        
        # Return the value with highest confidence in best group
        selected_value = max(best_group, key=lambda x: x[1])[0]
        
        if category == "depreciation":
            print(f"  Selected depreciation value: ${selected_value:,.0f}")
        
        return selected_value

    def correct_ocr_number_errors(self, value: float, category: str) -> float:
        """Correct common OCR errors in number recognition"""
        if category == "automobile":
            # Common OCR misreads for automobile expenses
            ocr_corrections = {
                1125: 15000,   # 1,125 -> 15,000
                1120: 15000,   # 1,120 -> 15,000
                11250: 15000,  # 11,250 -> 15,000
                1500: 15000,   # 1,500 -> 15,000 (missing 0)
                1250: 12500,   # Could be 12,500
                1750: 17500,   # Could be 17,500
            }
            
            # Direct correction
            if value in ocr_corrections:
                corrected = ocr_corrections[value]
                print(f"  Correcting OCR error: ${value:,.0f} -> ${corrected:,.0f}")
                return corrected
            
            # Pattern-based corrections
            # If it's 1,1XX it might be 15,XXX
            if 1100 <= value <= 1199:
                # Extract last 2 digits
                last_digits = int(str(int(value))[-2:])
                potential_value = 15000 + last_digits
                print(f"  Possible OCR error: ${value:,.0f} might be ${potential_value:,.0f}")
                return potential_value
                
        return value

    def validate_number_extraction(self, text, expected_pattern="automobile"):
        """Specifically validate numbers near certain text"""
        # Common OCR errors for numbers
        ocr_corrections = {
            '1120': '15000',  # 11,20 -> 15,000
            '11200': '15000', # 112,00 -> 15,000
            '1500': '15000',  # Missing last 0
            '15': '15000',    # Missing 000
            'IS000': '15000', # I instead of 1
            'l5000': '15000', # l instead of 1
        }
        
        # Find all numbers near the pattern
        pattern = rf'{expected_pattern}[^0-9]*?([0-9,.$]+)'
        matches = re.finditer(pattern, text, re.IGNORECASE)
        
        corrected_values = []
        for match in matches:
            raw_value = match.group(1).replace('$', '').replace(',', '').replace('.', '')
            
            # Check if this matches a known OCR error
            if raw_value in ocr_corrections:
                corrected_values.append(float(ocr_corrections[raw_value]))
            else:
                try:
                    value = float(raw_value)
                    # Sanity check for automobile expenses
                    if expected_pattern == "automobile" and 1000 <= value <= 50000:
                        corrected_values.append(value)
                except:
                    pass
        
        return corrected_values
    
    def validate_amounts(self, extracted_values: Dict[str, float]) -> Dict[str, float]:
        """Validate extracted amounts for reasonableness"""
        validated = {}
        
        # Validation rules
        validation_rules = {
            "depreciation": (0, 50000),  # Reasonable range for small business
            "officer_compensation": (50000, 500000),
            "automobile": (0, 50000),
            "meals": (0, 5000),
            "charitable": (0, 50000),
            "section179": (0, 1000000),  # Section 179 limit
            "ordinary_income": (-1000000, 1000000),  # Can be positive or negative
        }
        
        for item, value in extracted_values.items():
            if item in validation_rules:
                min_val, max_val = validation_rules[item]
                if min_val <= value <= max_val:
                    validated[item] = value
                else:
                    print(f"Warning: {item} value ${value:,.0f} outside expected range")
        
        return validated
    
    def analyze_automatically(self, pdf_path: str) -> Dict:
        """Fully automatic analysis without manual intervention"""
        print("Starting automatic analysis...")
        
        # Extract with multiple methods and confidence scoring
        all_findings = self.extract_with_validation(pdf_path)
        
        # Select best value for each category
        extracted_values = {}
        for category, findings in all_findings.items():
            best_value = self.select_best_value(findings, category)
            
            # Apply OCR correction for automobile
            if best_value and category == "automobile":
                corrected_value = self.correct_ocr_number_errors(best_value, category)
                if corrected_value != best_value:
                    print(f"Applied OCR correction for {category}: ${best_value:,.0f} -> ${corrected_value:,.0f}")
                    best_value = corrected_value
            
            if best_value:
                extracted_values[category] = best_value
                print(f"Found {category}: ${best_value:,.0f} (from {len(findings)} candidates)")
            else:
                print(f"Could not find {category}")
                # Special handling for automobile - might be in "Other Deductions"
                if category == "automobile":
                    print("  Tip: Check 'Other Deductions' section of Schedule K")
        
        # Validate amounts
        validated_values = self.validate_amounts(extracted_values)
        
        # Calculate addbacks
        addbacks = []
        for item_type, amount in validated_values.items():
            if item_type in self.addback_rules:
                rule = self.addback_rules[item_type]
                
                if rule["calc_type"] == "excess_over_market":
                    if amount > self.market_rate_salary:
                        excess = amount - self.market_rate_salary
                        addbacks.append(Addback(
                            line_item=f"Officer Compensation (Excess over market)",
                            amount=excess,
                            reason=rule["reason"],
                            calculation=f"${amount:,.0f} - ${self.market_rate_salary:,.0f} = ${excess:,.0f}",
                            confidence=0.9
                        ))
                
                elif rule["calc_type"] == "negative_only":
                    # Only add back if it's a loss (negative number)
                    if amount < 0:
                        addback_amount = abs(amount)  # Convert to positive for addback
                        addbacks.append(Addback(
                            line_item="Ordinary Business Loss",
                            amount=addback_amount,
                            reason=rule["reason"],
                            calculation=f"Loss of ${amount:,.0f} → Addback of ${addback_amount:,.0f}",
                            confidence=0.9
                        ))
                    else:
                        # If it's positive income, we might note it but don't add it back
                        print(f"Note: Ordinary business income is positive (${amount:,.0f}), no addback needed")
                
                elif rule["calc_type"] == "percentage":
                    adjusted_amount = amount * rule["percentage"]
                    addbacks.append(Addback(
                        line_item=f"{item_type.replace('_', ' ').title()} ({int(rule['percentage']*100)}% personal use)",
                        amount=adjusted_amount,
                        reason=rule["reason"],
                        calculation=f"{int(rule['percentage']*100)}% × ${amount:,.0f} = ${adjusted_amount:,.0f}",
                        confidence=0.9
                    ))
                
                else:  # full_amount
                    addbacks.append(Addback(
                        line_item=item_type.replace('_', ' ').title(),
                        amount=amount,
                        reason=rule["reason"],
                        calculation=f"Full amount: ${amount:,.0f}",
                        confidence=0.9
                    ))
        
        # Items not found that might need investigation
        expected_items = set(self.addback_rules.keys())
        found_items = set(validated_values.keys())
        missing_items = expected_items - found_items
        
        if missing_items:
            print(f"\nWarning: Could not extract: {', '.join(missing_items)}")
        
        return {
            "extracted_values": validated_values,
            "addbacks": addbacks,
            "total_addbacks": sum(ab.amount for ab in addbacks),
            "missing_items": list(missing_items),
            "extraction_confidence": len(validated_values) / len(expected_items)
        }
    
    def generate_report(self, results: Dict) -> str:
        """Generate comprehensive report"""
        report = "\nAUTOMATIC EBITDA ADDBACK ANALYSIS\n"
        report += "=" * 60 + "\n"
        
        # Extraction confidence
        confidence = results.get("extraction_confidence", 0)
        report += f"\nExtraction Confidence: {confidence:.0%}\n"
        
        if confidence < 0.7:
            report += "⚠️  Low extraction confidence - manual review recommended\n"
        
        # Addbacks
        report += "\nIDENTIFIED ADDBACKS:\n"
        report += "-" * 60 + "\n"
        
        for addback in results["addbacks"]:
            report += f"\n{addback.line_item}"
            if addback.confidence < 0.8:
                report += " ⚠️"
            report += f"\n  Amount: ${addback.amount:,.2f}"
            report += f"\n  Reason: {addback.reason}"
            report += f"\n  Calculation: {addback.calculation}\n"
        
        # Total
        report += "-" * 60 + "\n"
        report += f"TOTAL ADDBACKS: ${results['total_addbacks']:,.2f}\n"
        
        # Missing items
        if results.get("missing_items"):
            report += f"\n⚠️  Could not extract: {', '.join(results['missing_items'])}\n"
            report += "These items may need manual review.\n"
        
        return report

In [10]:
# Cell 4: Main Execution
analyzer = ProductionAddbackAnalyzer()

# Simply provide the PDF path - no manual values needed!
pdf_path = "P&L and Taxes/Roselle_Dental_Center_2024_Modified.pdf"  # <-- CHANGE THIS

# Fully automatic analysis
results = analyzer.analyze_automatically(pdf_path)

# Generate report
report = analyzer.generate_report(results)
print(report)

# Export to Excel
if results["addbacks"]:
    df = pd.DataFrame([{
        "Item": ab.line_item,
        "Amount": ab.amount,
        "Reason": ab.reason,
        "Calculation": ab.calculation,
        "Confidence": f"{ab.confidence:.0%}"
    } for ab in results["addbacks"]])
    
    df.to_excel("automatic_addback_analysis.xlsx", index=False)
    print("\nResults exported to automatic_addback_analysis.xlsx")

Starting automatic analysis...
Processing page 1...
Processing page 2...
Processing page 3...
Could not find depreciation
Found officer_compensation: $143,231 (from 13 candidates)

DEBUG - Automobile search found 849 candidates
  Correcting OCR error: $1,125 -> $15,000
Applied OCR correction for automobile: $1,125 -> $15,000
Found automobile: $15,000 (from 849 candidates)
Could not find meals

DEBUG - Ordinary income/loss values found:
  $0 (confidence: 0.8)
  $0 (confidence: 0.8)
Could not find ordinary_income
Could not find charitable
Found section179: $50,000 (from 8 candidates)


AUTOMATIC EBITDA ADDBACK ANALYSIS

Extraction Confidence: 43%
⚠️  Low extraction confidence - manual review recommended

IDENTIFIED ADDBACKS:
------------------------------------------------------------

Automobile (25% personal use)
  Amount: $3,750.00
  Reason: 25% assumed personal use. This portion is discretionary/non-business expense.
  Calculation: 25% × $15,000 = $3,750

Section179
  Amount: $50,000

In [1]:
# This code is for the P&L documents only
# Cell 1: Imports
import pdfplumber
import re
import pandas as pd
from decimal import Decimal

In [28]:
class PLAnalyzer2025:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.debug_mode = False
        self.results = {}
    
    def decode_basic_cids(self, text):
        """Basic CID decoding for key characters"""
        replacements = {
            '(cid:54)': '6', '(cid:55)': '7', '(cid:84)': 'T', '(cid:108)': 'l',
        }
        
        decoded = text
        for cid, char in replacements.items():
            decoded = decoded.replace(cid, char)
        return decoded
    
    def extract_amounts_from_line(self, line):
        """Extract dollar amounts from a line"""
        pattern = r'\b(\d{1,3}(?:,\d{3})*\.\d{2})\b'
        return re.findall(pattern, line)
    
    def clean_amount(self, amount_str):
        """Convert amount string to Decimal"""
        if not amount_str:
            return Decimal('0')
        cleaned = str(amount_str).replace(',', '').replace('$', '').strip()
        try:
            return Decimal(cleaned)
        except:
            return Decimal('0')
    
    def find_exact_lines(self, lines):
        """Find expense lines using exact patterns"""
        exact_searches = {
            'automobile': 'Total 6050',
            'contributions': 'Total 6180',
            'legal_fees': 'Total 6660',
            'miscellaneous': 'Total 6530',
            'travel_entertainment': 'Total 6900'
        }
        
        results = {}
        
        for category, search_text in exact_searches.items():
            if self.debug_mode:
                print(f"Searching for {category}...")
            
            found = False
            for i, line in enumerate(lines):
                decoded_line = self.decode_basic_cids(line)
                
                if search_text in decoded_line:
                    if self.debug_mode:
                        print(f"Found {category} at line {i}: {line}")
                    
                    amounts = self.extract_amounts_from_line(line)
                    if len(amounts) >= 2:
                        debit = self.clean_amount(amounts[0])
                        credit = self.clean_amount(amounts[1])
                        results[category] = {
                            'debit': debit,
                            'credit': credit,
                            'net': debit - credit,
                            'line': line,
                            'decoded_line': decoded_line
                        }
                        found = True
                        break
            
            if not found:
                results[category] = {
                    'debit': Decimal('0'), 'credit': Decimal('0'), 'net': Decimal('0'),
                    'line': 'Not found', 'decoded_line': 'Not found'
                }
        
        return results
    
    def find_net_income_direct(self, lines):
        """Find Net Income using direct line matching"""
        if self.debug_mode:
            print("Searching for Net Income...")
        
        for i, line in enumerate(lines):
            if line.startswith('Net Income '):
                if self.debug_mode:
                    print(f"Found Net Income at line {i}: {line}")
                
                decoded_line = self.decode_basic_cids(line)
                amounts = self.extract_amounts_from_line(decoded_line)
                
                if len(amounts) >= 2:
                    debit = self.clean_amount(amounts[0])
                    credit = self.clean_amount(amounts[1])
                    return {
                        'debit': debit,
                        'credit': credit,
                        'net': credit - debit,
                        'line': line,
                        'decoded_line': decoded_line
                    }
        
        return {'debit': Decimal('0'), 'credit': Decimal('0'), 'net': Decimal('0'),
                'line': 'Not found', 'decoded_line': 'Not found'}
    
    def find_owner_salary_2025(self, lines):
        """Find owner salary using 2025-specific corruption patterns"""
        if self.debug_mode:
            print("Searching for Owner's Salary using 2025 patterns...")
        
        owners_found = {}
        market_rate = Decimal('195700')
        
        paycheck_count = 0
        owner_paycheck_count = 0
        
        for i, line in enumerate(lines):
            # 2025 paycheck pattern: (cid:80)a(cid:121)chec(cid:107)
            if re.search(r'\(cid:80\)a\(cid:121\)chec\(cid:107\)', line):
                paycheck_count += 1
                
                if self.debug_mode and paycheck_count <= 30:
                    print(f"\nPaycheck {paycheck_count} at line {i}:")
                    print(f"  Line: {line}")
                
                is_owner = False
                owner_name = None
                
                # Look for Chang OWNER pattern: Chang, (cid:80)eter S (cid:79)(cid:87)(cid:78)(cid:69)(cid:82)(cid:58)
                if re.search(r'Chang.*?\(cid:80\)eter.*?S.*?\(cid:79\)\(cid:87\)\(cid:78\)\(cid:69\)\(cid:82\)', line):
                    is_owner = True
                    owner_name = "Chang, Peter S"
                    if self.debug_mode and paycheck_count <= 30:
                        print(f"    MATCHED Chang OWNER pattern")
                
                # Look for SEYBOLD OWNER pattern
                elif re.search(r'SEYBOLD.*?HARVE.*?\(cid:79\)\(cid:87\)\(cid:78\)\(cid:69\)\(cid:82\)', line):
                    is_owner = True
                    owner_name = "SEYBOLD, HARVE"
                    if self.debug_mode and paycheck_count <= 30:
                        print(f"    MATCHED SEYBOLD OWNER pattern")
                
                # Fallback: Look for OWNER CID pattern and try to identify owner
                elif re.search(r'\(cid:79\)\(cid:87\)\(cid:78\)\(cid:69\)\(cid:82\)', line):
                    is_owner = True
                    if re.search(r'Chang', line) or re.search(r'\(cid:80\)eter', line):
                        owner_name = "Chang, Peter S"
                    elif re.search(r'SEYBOLD', line) or re.search(r'HARVE', line):
                        owner_name = "SEYBOLD, HARVE"
                    else:
                        owner_name = "Unknown Owner"
                    
                    if self.debug_mode and paycheck_count <= 30:
                        print(f"    MATCHED OWNER CID fallback: {owner_name}")
                
                if is_owner:
                    owner_paycheck_count += 1
                    
                    if self.debug_mode:
                        print(f"  *** OWNER PAYCHECK FOUND: {owner_name} ***")
                    
                    # Extract amounts
                    amounts = self.extract_amounts_from_line(line)
                    if amounts:
                        amount = self.clean_amount(amounts[-1])
                        
                        if owner_name not in owners_found:
                            owners_found[owner_name] = Decimal('0')
                        owners_found[owner_name] += amount
                        
                        if self.debug_mode:
                            print(f"    Added ${amount:,.2f} to {owner_name}")
                            print(f"    Running total: ${owners_found[owner_name]:,.2f}")
                    else:
                        if self.debug_mode:
                            print(f"    No amounts found in line")
        
        if self.debug_mode:
            print(f"\nPayroll Search Summary:")
            print(f"  Total paycheck lines found: {paycheck_count}")
            print(f"  Owner paycheck lines found: {owner_paycheck_count}")
            print(f"  Owners identified: {list(owners_found.keys()) if owners_found else 'None'}")
            for owner, total in owners_found.items():
                print(f"    {owner}: ${total:,.2f}")
        
        total_salary = sum(owners_found.values())
        excess = total_salary - market_rate if total_salary > market_rate else Decimal('0')
        
        return {
            'total': total_salary,
            'market_rate': market_rate,
            'excess': excess,
            'owner_name': "Multiple owners" if len(owners_found) > 1 else (
                list(owners_found.keys())[0] if owners_found else "Not found"
            ),
            'owner_details': owners_found
        }
    
    def analyze(self):
        """Run the complete analysis"""
        if self.debug_mode:
            print("Starting 2025 analysis...")
        
        # Extract all lines
        with pdfplumber.open(self.pdf_path) as pdf:
            all_lines = []
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    all_lines.extend(text.split('\n'))
        
        if self.debug_mode:
            print(f"Extracted {len(all_lines)} lines")
        
        # Find all data
        self.results = self.find_exact_lines(all_lines)
        self.results['net_income'] = self.find_net_income_direct(all_lines)
        self.results['owner_salary'] = self.find_owner_salary_2025(all_lines)
        
        # Calculate SDE
        self.calculate_sde()
        
        return self.results
    
    def calculate_sde(self):
        """Calculate SDE"""
        net_income = self.results.get('net_income', {}).get('net', Decimal('0'))
        sde = net_income
        addbacks = Decimal('0')
        
        if self.debug_mode:
            print(f"\nCalculating SDE starting with Net Income: ${net_income:,.2f}")
        
        # Add discretionary expenses
        for category in ['contributions', 'legal_fees', 'miscellaneous', 'travel_entertainment']:
            if category in self.results and self.results[category]['net'] > 0:
                amount = self.results[category]['net']
                sde += amount
                addbacks += amount
                if self.debug_mode:
                    print(f"+ {category}: ${amount:,.2f}")
        
        # Add 25% of automobile
        if 'automobile' in self.results and self.results['automobile']['net'] > 0:
            auto_total = self.results['automobile']['net']
            auto_addback = auto_total * Decimal('0.25')
            sde += auto_addback
            addbacks += auto_addback
            self.results['automobile_25_percent'] = auto_addback
            if self.debug_mode:
                print(f"+ Automobile (25% of ${auto_total:,.2f}): ${auto_addback:,.2f}")
        
        # Add owner excess
        if 'owner_salary' in self.results:
            excess = self.results['owner_salary']['excess']
            if excess > 0:
                sde += excess
                addbacks += excess
                if self.debug_mode:
                    print(f"+ Owner excess: ${excess:,.2f}")
        
        self.results['total_sde'] = sde
        self.results['total_addbacks'] = addbacks
        
        if self.debug_mode:
            print(f"Total SDE: ${sde:,.2f}")
    
    def display_summary(self):
        """Display a clean summary"""
        print("\n" + "="*60)
        print("2025 P&L ANALYSIS SUMMARY")
        print("="*60)
        
        categories = {
            'automobile': 'Automobile Expense',
            'contributions': 'Contributions',
            'legal_fees': 'Legal Fees', 
            'miscellaneous': 'Miscellaneous',
            'travel_entertainment': 'Travel & Entertainment'
        }
        
        print("\nEXPENSE CATEGORIES FOUND:")
        for category, name in categories.items():
            if category in self.results and self.results[category]['net'] > 0:
                net = self.results[category]['net']
                print(f"  {name:<25} ${net:>10,.2f}")
        
        print(f"\nNET INCOME:")
        net_income = self.results.get('net_income', {}).get('net', Decimal('0'))
        print(f"  Net Income                   ${net_income:>10,.2f}")
        
        print(f"\nOWNER SALARY ANALYSIS:")
        if 'owner_salary' in self.results:
            owner_data = self.results['owner_salary']
            print(f"  Owner: {owner_data['owner_name']}")
            print(f"  Total Salary                 ${owner_data['total']:>10,.2f}")
            print(f"  Market Rate                  ${owner_data['market_rate']:>10,.2f}")
            print(f"  Excess over Market           ${owner_data['excess']:>10,.2f}")
            
            # Show breakdown by owner if multiple
            if len(owner_data.get('owner_details', {})) > 1:
                print(f"\n  Owner Breakdown:")
                for owner_name, amount in owner_data['owner_details'].items():
                    print(f"    {owner_name:<20} ${amount:>10,.2f}")
        else:
            print(f"  No owner salary data found")

        print(f"\nSDE CALCULATION:")
        print(f"  Net Income                   ${net_income:>10,.2f}")
        addbacks = self.results.get('total_addbacks', Decimal('0'))
        print(f"  Total Addbacks               ${addbacks:>10,.2f}")
        
        # Show addback breakdown
        if addbacks > 0:
            print(f"\n  Addback Breakdown:")
            for category in categories:
                if category in self.results and self.results[category]['net'] > 0:
                    if category == 'automobile':
                        amount = self.results.get('automobile_25_percent', 0)
                        if amount > 0:
                            print(f"    Automobile (25%)           ${amount:>10,.2f}")
                    else:
                        amount = self.results[category]['net']
                        print(f"    {categories[category]:<25} ${amount:>10,.2f}")
            
            if 'owner_salary' in self.results and self.results['owner_salary']['excess'] > 0:
                excess = self.results['owner_salary']['excess']
                print(f"    Owner Excess Compensation  ${excess:>10,.2f}")
        
        sde = self.results.get('total_sde', Decimal('0'))
        print(f"\n  SELLER'S DISCRETIONARY EARNINGS  ${sde:>10,.2f}")
        print("="*60)

In [29]:
# Run the 2025 analysis
def run_2025_analysis(pdf_path, debug=True):
    analyzer = PLAnalyzer2025(pdf_path)
    analyzer.debug_mode = debug
    
    print(f"Running 2025 analysis on: {pdf_path}")
    results = analyzer.analyze()
    analyzer.display_summary()
    
    return results

# Execute - change the path to your 2025 file
pdf_path_2025 = 'P&L and Taxes/P&L 2025.pdf'  # UPDATE THIS PATH
results = run_2025_analysis(pdf_path_2025, debug=True)

Running 2025 analysis on: P&L and Taxes/P&L 2025.pdf
Starting 2025 analysis...
Extracted 1037 lines
Searching for automobile...
Found automobile at line 34: Total 6050 · Automobile (cid:69)(cid:120)pense 2,185.46 0.00
Searching for contributions...
Searching for legal_fees...
Found legal_fees at line 890: Total 6660 · (cid:76)egal (cid:70)ees 1,500.00 0.00
Searching for miscellaneous...
Found miscellaneous at line 316: Total 6530 · (cid:77)iscellaneous 72.02 0.00
Searching for travel_entertainment...
Found travel_entertainment at line 997: Total 6900 · Tra(cid:118)el (cid:38) (cid:69)ntertrainment (cid:45) (cid:79)ther 0.00 0.00
Searching for Net Income...
Found Net Income at line 1035: Net Income 409,9(cid:55)1.34 380,(cid:54)8(cid:54).91
Searching for Owner's Salary using 2025 patterns...

Paycheck 1 at line 341:
  Line: (cid:80)a(cid:121)chec(cid:107) 01/02/2025 30807 Cabatino, (cid:69)smeralda Schaumburg B... 6.31

Paycheck 2 at line 342:
  Line: (cid:80)a(cid:121)chec(cid:107) 01/

In [23]:
def find_payroll_section(pdf_path):
    """Find the payroll section by looking for section headers"""
    print("="*80)
    print("PAYROLL SECTION FINDER")
    print("="*80)
    
    with pdfplumber.open(pdf_path) as pdf:
        all_lines = []
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                all_lines.extend(text.split('\n'))
    
    print(f"Total lines extracted: {len(all_lines)}")
    
    # Look for payroll-related section headers
    payroll_headers = [
        r'[Pp]ayro(cid:108)(cid:108)',
        r'Payro(cid:108)(cid:108)',
        r'[Gg]ross.*?[Ww]ages',
        r'6560.*?[Pp]ayro(cid:108)(cid:108)',
        r'6575.*?[Gg]ross',
        r'[Pp]ayro(cid:108)(cid:108).*?E(cid:120)pe(cid:110)ses'
    ]
    
    print("\n1. SEARCHING FOR PAYROLL SECTION HEADERS:")
    print("-" * 60)
    
    payroll_start = None
    payroll_end = None
    
    for i, line in enumerate(all_lines):
        # Look for payroll section start
        if (re.search(r'[Pp]ayro.*?E.*?pe.*?ses', line, re.IGNORECASE) or
            re.search(r'6560.*?ayro.*?E.*?pe.*?ses', line, re.IGNORECASE) or
            '6560 · Payroll Expenses' in line or
            'Payroll Expenses' in line):
            print(f"PAYROLL SECTION START at line {i}: {line}")
            payroll_start = i
            break
    
    if payroll_start is None:
        print("Could not find payroll section start")
        return None, None
    
    # Look for gross wages subsection
    gross_wages_start = None
    for i in range(payroll_start, min(payroll_start + 50, len(all_lines))):
        line = all_lines[i]
        if (re.search(r'[Gg]ross.*?[Ww]ages', line, re.IGNORECASE) or
            re.search(r'6575.*?ross.*?ages', line, re.IGNORECASE) or
            '6575 · Gross Wages' in line):
            print(f"GROSS WAGES START at line {i}: {line}")
            gross_wages_start = i
            break
    
    if gross_wages_start is None:
        print("Could not find gross wages section")
        return payroll_start, None
    
    # Look for end of gross wages (next major section or total)
    gross_wages_end = None
    for i in range(gross_wages_start + 1, min(gross_wages_start + 2000, len(all_lines))):
        line = all_lines[i]
        if (re.search(r'Total.*?6575.*?Gross.*?Wages', line, re.IGNORECASE) or
            re.search(r'Total.*?ross.*?ages', line, re.IGNORECASE) or
            'Total 6575 · Gross Wages' in line):
            print(f"GROSS WAGES END at line {i}: {line}")
            gross_wages_end = i
            break
    
    return gross_wages_start, gross_wages_end

def analyze_gross_wages_section(pdf_path):
    """Analyze the gross wages section line by line"""
    
    with pdfplumber.open(pdf_path) as pdf:
        all_lines = []
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                all_lines.extend(text.split('\n'))
    
    start, end = find_payroll_section(pdf_path)
    
    if start is None:
        print("Cannot analyze - payroll section not found")
        return
    
    if end is None:
        end = min(start + 500, len(all_lines))  # Look at next 500 lines
        print(f"Using estimated end at line {end}")
    
    print(f"\n2. ANALYZING GROSS WAGES SECTION (lines {start} to {end}):")
    print("-" * 60)
    
    potential_paychecks = []
    potential_owners = []
    
    for i in range(start, end):
        line = all_lines[i]
        
        # Look for patterns that might be paycheck entries
        # In corrupted form, might be like: "(cid:80)aycheck" or similar
        if (re.search(r'\(cid:\d+\).*?aycheck', line, re.IGNORECASE) or
            re.search(r'[Cc]heck.*?\d{2}/\d{2}/2024', line) or  # Check + date pattern
            re.search(r'^\s*\(cid:\d+\)', line)):  # Lines starting with CID (might be Paycheck)
            
            potential_paychecks.append((i, line))
            
            # Check if this line might contain owner info
            if (re.search(r'[Oo].*?[Ww].*?[Nn].*?[Ee].*?[Rr]', line, re.IGNORECASE) or
                re.search(r'[Cc].*?[Hh].*?[Aa].*?[Nn].*?[Gg]', line, re.IGNORECASE) or
                re.search(r'[Ss].*?[Ee].*?[Yy].*?[Bb].*?[Oo].*?[Ll].*?[Dd]', line, re.IGNORECASE)):
                potential_owners.append((i, line))
    
    print(f"Found {len(potential_paychecks)} potential paycheck lines:")
    for i, (line_num, line) in enumerate(potential_paychecks[:20]):  # Show first 20
        print(f"{i+1:2d}. Line {line_num}: {line}")
        
        # Try to identify if this is an owner
        if (re.search(r'[Oo].*?[Ww].*?[Nn].*?[Ee].*?[Rr]', line, re.IGNORECASE) or
            re.search(r'[Cc].*?[Hh].*?[Aa].*?[Nn].*?[Gg]', line, re.IGNORECASE) or
            re.search(r'[Ss].*?[Ee].*?[Yy].*?[Bb].*?[Oo].*?[Ll].*?[Dd]', line, re.IGNORECASE)):
            print(f"    *** POTENTIAL OWNER LINE ***")
            
            # Extract amounts
            amounts = re.findall(r'\b(\d{1,3}(?:,\d{3})*\.\d{2})\b', line)
            if amounts:
                print(f"    Amounts: {amounts}")
    
    if len(potential_paychecks) > 20:
        print(f"... and {len(potential_paychecks) - 20} more potential paycheck lines")
    
    print(f"\n3. POTENTIAL OWNER LINES: {len(potential_owners)}")
    print("-" * 60)
    
    for i, (line_num, line) in enumerate(potential_owners[:10]):
        print(f"{i+1:2d}. Line {line_num}: {line}")
        amounts = re.findall(r'\b(\d{1,3}(?:,\d{3})*\.\d{2})\b', line)
        if amounts:
            print(f"    Amounts: {amounts}")
    
    return potential_paychecks, potential_owners

In [24]:
# Cell 2: Run the analysis on 2025 document
pdf_path_2025 = 'P&L and Taxes/P&L 2025.pdf'  # Change this to your 2025 file path
print("ANALYZING 2025 P&L DOCUMENT:")
print("="*60)
paycheck_lines_2025, owner_lines_2025 = analyze_gross_wages_section(pdf_path_2025)

# Cell 3: Alternative approach - search by date patterns and amounts
def find_entries_by_date_and_amount():
    """Find payroll entries by looking for date patterns with amounts"""
    print(f"\n" + "="*60)
    print("ALTERNATIVE SEARCH: DATE + AMOUNT PATTERNS")
    print("="*60)
    
    with pdfplumber.open('P&L and Taxes/P&L 2024.pdf') as pdf:
        all_lines = []
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                all_lines.extend(text.split('\n'))
    
    # Look for lines with 2024 dates and dollar amounts (potential payroll entries)
    payroll_candidates = []
    
    for i, line in enumerate(all_lines):
        # Look for 2024 date pattern with amounts
        if (re.search(r'\d{2}/\d{2}/2024', line) and 
            re.search(r'\d{1,3}(?:,\d{3})*\.\d{2}', line)):
            
            payroll_candidates.append((i, line))
    
    print(f"Found {len(payroll_candidates)} lines with 2024 dates and amounts")
    
    # Filter for potential owner entries
    owner_candidates = []
    for line_num, line in payroll_candidates:
        if (re.search(r'[Oo].*?[Ww].*?[Nn].*?[Ee].*?[Rr]', line, re.IGNORECASE) or
            re.search(r'[Cc].*?[Hh].*?[Aa].*?[Nn].*?[Gg]', line, re.IGNORECASE) or
            re.search(r'[Ss].*?[Ee].*?[Yy].*?[Bb].*?[Oo].*?[Ll].*?[Dd]', line, re.IGNORECASE) or
            re.search(r'[Hh].*?[Aa].*?[Rr].*?[Vv].*?[Ee]', line, re.IGNORECASE)):
            owner_candidates.append((line_num, line))
    
    print(f"Of these, {len(owner_candidates)} might be owner entries:")
    
    total_owner_amount = Decimal('0')
    for i, (line_num, line) in enumerate(owner_candidates[:20]):
        print(f"{i+1:2d}. Line {line_num}: {line}")
        amounts = re.findall(r'\b(\d{1,3}(?:,\d{3})*\.\d{2})\b', line)
        if amounts:
            amount = Decimal(amounts[-1].replace(',', ''))
            total_owner_amount += amount
            print(f"    Amount: ${amount:,.2f}")
    
    print(f"\nTotal potential owner compensation: ${total_owner_amount:,.2f}")
    
    return owner_candidates

# Run alternative search
owner_candidates = find_entries_by_date_and_amount()

ANALYZING 2025 P&L DOCUMENT:
PAYROLL SECTION FINDER
Total lines extracted: 1037

1. SEARCHING FOR PAYROLL SECTION HEADERS:
------------------------------------------------------------
PAYROLL SECTION START at line 338: (cid:54)5(cid:54)0 · Payroll Expenses
Could not find gross wages section
Using estimated end at line 838

2. ANALYZING GROSS WAGES SECTION (lines 338 to 838):
------------------------------------------------------------
Found 399 potential paycheck lines:
 1. Line 338: (cid:54)5(cid:54)0 · Payroll Expenses
 2. Line 339: (cid:54)5(cid:55)5 · (cid:71)ross (cid:87)ages
 3. Line 340: (cid:54)5(cid:54)1 · (cid:42)Payroll Expenses
 4. Line 341: (cid:80)a(cid:121)chec(cid:107) 01/02/2025 30807 Cabatino, (cid:69)smeralda Schaumburg B... 6.31
    *** POTENTIAL OWNER LINE ***
    Amounts: ['6.31']
 5. Line 342: (cid:80)a(cid:121)chec(cid:107) 01/02/2025 30808 Chiafulio(cid:45)(cid:90)asada, (cid:77)i... (cid:69)(cid:77)(cid:80)(cid:76)(cid:79)(cid:89)... Schaumburg B... 3.19
 6. L