In [19]:
# Cell 1: Imports and Configuration
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pdf2image
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import pandas as pd
import os
import sys
import numpy as np

# === CONFIGURATION ===
if sys.platform == "win32":
    poppler_path = r"C:\Program Files\poppler\Library\bin"  # <-- UPDATE THIS PATH
    if os.path.exists(poppler_path):
        os.environ["PATH"] += os.pathsep + poppler_path

In [20]:
# Cell 2: Data Classes
@dataclass
class Addback:
    """Represents a single addback item"""
    line_item: str
    amount: float
    reason: str
    calculation: str = ""
    source: str = "Tax Return"
    confidence: float = 1.0  # 0-1 confidence score

In [23]:
# Cell 3: Complete ProductionAddbackAnalyzer Class
class ProductionAddbackAnalyzer:
    """Production-ready analyzer that works automatically without manual intervention"""
    
    def __init__(self):
        self.market_rate_salary = 195700
        
        # Enhanced patterns with multiple variations for better OCR matching
        self.extraction_patterns = {
            "depreciation": {
                "primary_patterns": [
                    # EXACT text from the form
                    r"MACRS deductions for assets placed in service.*?(\d{1,3}[,.]?\d{3})",
                    r"MACRS.*beginning before.*?(\d{1,3}[,.]?\d{3})",
                    # Also try just the number pattern near MACRS
                    r"MACRS.*?(\d{1,3}[,.]?\d{3})",
                    # Line 17 reference
                    r"(?:line\s*)?17[^\d]*?(\d{1,3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    # Look for 1,373 specifically
                    r"1[,.]?373",
                    # Depreciation but NOT total
                    r"(?<!total).*depreciation.*?(\d{1,3}[,.]?\d{3})"
                ],
                "exclusion_patterns": [
                    r"total.*depreciation",
                    r"line\s*21",
                    r"line\s*22"
                ],
                "form_line": "Form 4562 Line 17 (MACRS)"
            },
            "officer_compensation": {
                "primary_patterns": [
                    r"HARVEY.*SEYBOLD.*?[\$\s]*(\d{3}[,.]?\d{3})",
                    r"compensation.*officers.*?[\$\s]*(\d{3}[,.]?\d{3})",
                    r"Form\s+1125-E.*line\s+[24].*?[\$\s]*(\d{3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    r"200[,.]?115"  # Look for the specific amount
                ],
                "form_line": "Form 1125-E Line 2"
            },
            # In your __init__ method, update the automobile patterns:
            "automobile": {
                "primary_patterns": [
                    # More flexible patterns
                    r"automobile\s+and\s+truck\s+expense[s]?[:\s]*\$?\s*([0-9]{1,2}[,.]?[0-9]{3})",
                    r"automobile[^0-9]{0,30}([0-9]{1,2}[,.]?[0-9]{3})",
                    r"auto.*?truck.*?([0-9]{1,2}[,.]?[0-9]{3})",
                    # Look for the amount even if OCR mangles the text
                    r"(?:auto|truck|vehicle).*?expenses?.*?([0-9]{1,2}[,.]?[0-9]{3})",
                ],
                "context_patterns": [
                    # Just find 5-digit numbers near automobile text
                    r"([0-9]{1,2}[,.]?[0-9]{3})(?=[^0-9]|$)",
                ],
                "form_line": "Other Deductions - Schedule K"
            },
            "meals": {
                "primary_patterns": [
                    r"meals.*\(?50%\)?.*?[\$\s]*(\d{1,3})",
                    r"MEALS.*?[\$\s]*(\d{1,3})",
                    r"meals.*entertainment.*?[\$\s]*(\d{1,3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    r"meal[^\d]*(\d{1,3})"
                ],
                "form_line": "Other Deductions"
            },
            "ordinary_income": {
                "primary_patterns": [
                    # Look for the specific line 22 calculation description
                    r"subtract\s+line\s+21\s+from\s+line\s+6[:\s]*\$?\s*([-]?\d{1,3}(?:[,.]?\d{3})*)",
                    # Line 22 with amount on same line
                    r"(?:line\s*)?22[.\s]+ordinary\s+business\s+income.*?[\$\s]+([-]?\d{1,3}(?:[,.]?\d{3})*)",
                    # Sometimes just after "22" with various separators
                    r"^\s*22\s*[.\s]+.*?[\$\s]+([-]?\d{1,3}(?:[,.]?\d{3})*)",
                    # Look for ordinary business income/loss with amount nearby
                    r"ordinary\s+business\s+income\s*\(?loss\)?.*?[\$\s]+([-]?\d{1,3}(?:[,.]?\d{3})*)",
                ],
                "context_patterns": [
                    # Only look for numbers very close to line 22
                    r"(?:line\s*)?22\b[^0-9]{0,20}([-]?\d{1,3}(?:[,.]?\d{3})*)",
                    # Look for parentheses ONLY near ordinary income text
                    r"ordinary.*?\((\d{1,3}(?:[,.]?\d{3})*)\)",
                ],
                "exclusion_patterns": [
                    r"line\s*23",  # Don't pick up next line
                    r"deduction",  # Avoid deduction lines
                    r"credit",     # Avoid credit lines
                ],
                "form_line": "Form 1120-S Line 22"
            },
            "charitable": {
                "primary_patterns": [
                    r"charitable.*contrib.*?[\$\s]*(\d{1,3}[,.]?\d{3})",
                    r"CHARITABLE.*?[\$\s]*(\d{1,3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    r"contribution[^\d]*(\d{1,3}[,.]?\d{3})"
                ],
                "form_line": "M-2 Line 5"
            },
            "section179": {
                "primary_patterns": [
                    r"section\s+179\s+expense.*?[\$\s]*(\d{2,3}[,.]?\d{3})",
                    r"SEC.*179.*?[\$\s]*(\d{2,3}[,.]?\d{3})"
                ],
                "context_patterns": [
                    r"179[^\d]*(\d{2,3}[,.]?\d{3})"
                ],
                "form_line": "M-2 Line 5"
            }
        }
        
        self.addback_rules = {
            "officer_compensation": {
                "calc_type": "excess_over_market",
                "reason": "Owner salary exceeds market rate. Excess amount is added back to normalize earnings."
            },
            "depreciation": {
                "calc_type": "full_amount",
                "reason": "Non-cash expense. Added back for EBITDA calculation as it doesn't affect cash flow."
            },
            "automobile": {
                "calc_type": "percentage",
                "percentage": 0.25,
                "reason": "25% assumed personal use. This portion is discretionary/non-business expense."
            },
            "meals": {
                "calc_type": "full_amount",
                "reason": "Non-deductible portion (50%) represents discretionary spending."
            },
            "charitable": {
                "calc_type": "full_amount",
                "reason": "Non-business expense. Charitable giving is discretionary."
            },
            "ordinary_income": {
                "calc_type": "negative_only",  # Only add back if it's a loss (negative)
                "reason": "Business loss added back to normalize earnings. Represents the core operating loss before adjustments."
            },
            "section179": {
                "calc_type": "full_amount",
                "reason": "Accelerated depreciation election. Added back as it's a non-cash tax benefit."
            }
        }
    
    def preprocess_image_advanced(self, image):
        """Advanced image preprocessing for better OCR - especially numbers"""
        # Convert to numpy array for advanced processing
        img_array = np.array(image)
        
        # Convert to grayscale if not already
        if len(img_array.shape) == 3:
            gray = np.dot(img_array[...,:3], [0.2989, 0.5870, 0.1140])
        else:
            gray = img_array
        
        # Import cv2 for better preprocessing
        try:
            import cv2
            
            # Convert to uint8
            gray_uint8 = gray.astype(np.uint8)
            
            # 1. Denoise while preserving edges (good for numbers)
            denoised = cv2.bilateralFilter(gray_uint8, 9, 75, 75)
            
            # 2. Apply adaptive thresholding (better for varying lighting)
            thresh = cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                         cv2.THRESH_BINARY, 11, 2)
            
            # 3. Morphological operations to connect broken digits
            kernel = np.ones((2,1), np.uint8)  # Vertical kernel to connect digit parts
            connected = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
            
            # 4. Remove small noise
            kernel_noise = np.ones((2,2), np.uint8)
            cleaned = cv2.morphologyEx(connected, cv2.MORPH_OPEN, kernel_noise)
            
            # Convert back to PIL
            pil_img = Image.fromarray(cleaned)
            
        except ImportError:
            # Fallback if cv2 not available
            print("Warning: cv2 not available. Using basic preprocessing.")
            pil_img = Image.fromarray(gray.astype(np.uint8))
            
            # Basic enhancement
            enhancer = ImageEnhance.Contrast(pil_img)
            pil_img = enhancer.enhance(2.5)
            pil_img = pil_img.filter(ImageFilter.SHARPEN)
            pil_img = pil_img.point(lambda x: 0 if x < 128 else 255, '1')
        
        return pil_img

    def extract_at_multiple_resolutions(self, image, pattern_dict):
        """Try OCR at different resolutions - helps with number recognition"""
        findings = []
        
        # Try different DPIs/scales
        for scale in [1.0, 1.5, 2.0, 2.5]:
            if scale != 1.0:
                new_size = (int(image.width * scale), int(image.height * scale))
                scaled_img = image.resize(new_size, Image.Resampling.LANCZOS)
            else:
                scaled_img = image
            
            # Preprocess the scaled image
            processed = self.preprocess_image_advanced(scaled_img)
            
            # Try OCR with different configs
            for config in ['--psm 6', '--psm 4 -c tessedit_char_whitelist=0123456789,$,. ']:
                try:
                    text = pytesseract.image_to_string(processed, config=config)
                    
                    # Extract numbers near "automobile"
                    if 'automobile' in text.lower():
                        # Look for numbers within 50 characters of 'automobile'
                        auto_match = re.search(r'automobile[^0-9]{0,50}(\d{1,2}[,.]?\d{3})', 
                                             text, re.IGNORECASE)
                        if auto_match:
                            value = auto_match.group(1).replace(',', '').replace('.', '')
                            try:
                                findings.append((float(value), 0.7))
                            except:
                                pass
                except:
                    continue
        
        return findings

    def extract_line_22_specifically(self, image):
        """Target extraction for Form 1120S Line 22"""
        try:
            # Use targeted OCR with different configs
            text = pytesseract.image_to_string(image, config='--psm 6')
            
            # Look for line 22 pattern more specifically
            # Pattern: "22" followed by description, then amount
            line_22_pattern = r'22\s*[.\s]*(?:ordinary\s+business\s+income)?.*?(\d{1,2}[,.]?\d{3})'
            
            matches = re.finditer(line_22_pattern, text, re.IGNORECASE | re.MULTILINE)
            
            findings = []
            for match in matches:
                value_str = match.group(1).replace(',', '').replace('.', '')
                try:
                    value = float(value_str)
                    # Sanity check - ordinary income/loss typically between -100k and 500k for small business
                    if -100000 <= value <= 500000:
                        findings.append((value, 0.8))
                except:
                    pass
                    
            return findings
        except:
            return []
    
    def extract_with_validation(self, pdf_path: str) -> Dict[str, List[Tuple[float, float]]]:
        """Extract values with confidence scores"""
        try:
            if sys.platform == "win32" and 'poppler_path' in globals():
                images = pdf2image.convert_from_path(pdf_path, dpi=400, poppler_path=poppler_path)
            else:
                images = pdf2image.convert_from_path(pdf_path, dpi=400)
            
            all_findings = {key: [] for key in self.extraction_patterns.keys()}
            
            for page_num, image in enumerate(images):
                print(f"Processing page {page_num + 1}...")
                # Try multiple preprocessing approaches
                preprocessing_methods = [
                    ("original", image),
                    ("enhanced", self.preprocess_image_advanced(image)),
                    ("high_contrast", ImageEnhance.Contrast(image.convert('L')).enhance(3.0)),
                    ("sharpened", image.filter(ImageFilter.SHARPEN).filter(ImageFilter.SHARPEN))
                ]
                # Add special handling for automobile if it's in our missing items
                if page_num >= 3:  # Other Deductions usually on later pages
                    # Try targeted extraction for automobile expense
                    auto_findings = self.extract_at_multiple_resolutions(image, 
                                                                       self.extraction_patterns.get("automobile", {}))
                    if auto_findings:
                        all_findings["automobile"].extend(auto_findings)
                if page_num <= 2:  # Form 1120S main pages (first few pages)
                    # Try targeted extraction for line 22
                    line_22_findings = self.extract_line_22_specifically(image)
                    if line_22_findings:
                        all_findings["ordinary_income"].extend(line_22_findings)
                
                for method_name, processed_img in preprocessing_methods:
                    # Use multiple OCR passes with different settings
                    for psm in [6, 4, 11]:  # Different page segmentation modes
                        try:
                            text = pytesseract.image_to_string(
                                processed_img, 
                                config=f'--psm {psm} --oem 3'
                            )
                            
                            # Extract values for each category
                            for category, patterns in self.extraction_patterns.items():
                                # Check if this text contains exclusion patterns
                                should_skip = False
                                for exclusion in patterns.get("exclusion_patterns", []):
                                    if re.search(exclusion, text, re.IGNORECASE):
                                        should_skip = True
                                        break
                                
                                if should_skip:  # Changed: removed "and category == 'depreciation'"
                                    continue  # Skip this text block for this category
                                
                                # Try primary patterns first (higher confidence)
                                for pattern in patterns["primary_patterns"]:
                                    matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
                                    for match in matches:
                                        value_str = match.group(1).replace(',', '').replace('.', '')
                                        try:
                                            # Special handling for ordinary income - check for parentheses indicating negative
                                            if category == "ordinary_income":
                                                # Check if the original match or surrounding text has parentheses
                                                full_match_text = match.group(0)
                                                if '(' in full_match_text and ')' in full_match_text:
                                                    # This is a negative number
                                                    value = -abs(float(value_str))
                                                else:
                                                    # Also check if the value_str itself starts with a minus
                                                    if value_str.startswith('-'):
                                                        value = float(value_str)
                                                    else:
                                                        value = float(value_str)
                                            else:
                                                value = float(value_str)
                                            # High confidence for primary patterns
                                            all_findings[category].append((value, 0.9))
                                        except:
                                            continue
                                
                                # Try context patterns (lower confidence)
                                for pattern in patterns.get("context_patterns", []):
                                    matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
                                    for match in matches:
                                        # Handle patterns that might just find the number itself
                                        if match.groups():
                                            value_str = match.group(1).replace(',', '').replace('.', '')
                                        else:
                                            value_str = match.group(0).replace(',', '').replace('.', '')
                                        try:
                                            # Special handling for ordinary income - check for parentheses indicating negative
                                            if category == "ordinary_income":
                                                # For context patterns, we might be matching just the parentheses pattern
                                                full_match_text = match.group(0)
                                                if '(' in full_match_text and ')' in full_match_text:
                                                    # This is a negative number
                                                    value = -abs(float(value_str))
                                                else:
                                                    # Also check if the value_str itself starts with a minus
                                                    if value_str.startswith('-'):
                                                        value = float(value_str)
                                                    else:
                                                        value = float(value_str)
                                            else:
                                                value = float(value_str)
                                            # Lower confidence for context patterns
                                            all_findings[category].append((value, 0.6))
                                        except:
                                            continue
                        except:
                            continue
            
            return all_findings
            
        except Exception as e:
            print(f"Error in extraction: {e}")
            return {}
    
    def select_best_value(self, findings: List[Tuple[float, float]], category: str = None) -> Optional[float]:
        """Select the most likely correct value from multiple findings"""
        if not findings:
            return None
        
        # Debug print for depreciation
        if category == "depreciation":
            print(f"\nDEBUG - All depreciation values found:")
            for value, confidence in sorted(findings):
                print(f"  ${value:,.0f} (confidence: {confidence})")
        
        # Special handling for depreciation - prefer smaller values (likely MACRS not total)
        if category == "depreciation":
            # Filter out values that are likely totals (too large)
            reasonable_findings = [(v, c) for v, c in findings if v < 3000]
            if reasonable_findings:
                print(f"  After filtering < $3,000: {len(reasonable_findings)} values remain")
                findings = reasonable_findings
            
            # Prefer values around 1,000-2,000 for MACRS
            macrs_range_findings = [(v, c) for v, c in findings if 1000 <= v <= 2000]
            if macrs_range_findings:
                print(f"  Values in MACRS range ($1,000-$2,000): {len(macrs_range_findings)}")
                findings = macrs_range_findings
        
        # For automobile, debug what we're seeing
        if category == "automobile":
            print(f"\nDEBUG - Automobile search found {len(findings)} candidates")
            if not findings:
                print("  No automobile amounts found - check if OCR is reading the text correctly")

        # Special handling for ordinary income
        if category == "ordinary_income":
            print(f"\nDEBUG - Ordinary income/loss values found:")
            for value, confidence in sorted(findings):
                print(f"  ${value:,.0f} (confidence: {confidence})")
            
            # Filter out very small values that are likely noise
            significant_findings = [(v, c) for v, c in findings if abs(v) >= 100]
            if significant_findings:
                print(f"  After filtering >= $100: {len(significant_findings)} values remain")
                findings = significant_findings
            
            # Prefer values in reasonable range for business income
            business_range_findings = [(v, c) for v, c in findings if 1000 <= abs(v) <= 1000000]
            if business_range_findings:
                print(f"  Values in business income range: {len(business_range_findings)}")
                findings = business_range_findings
        
        # Group similar values (within 10%)
        value_groups = []
        for value, confidence in findings:
            found_group = False
            for group in value_groups:
                if group:  # Check group is not empty
                    group_median = np.median([v for v, c in group])
                    if abs(value - group_median) / max(group_median, 1) < 0.1:  # Within 10%
                        group.append((value, confidence))
                        found_group = True
                        break
            if not found_group:
                value_groups.append([(value, confidence)])
        
        # Select group with highest total confidence
        if not value_groups:
            return None
            
        best_group = max(value_groups, key=lambda g: sum(c for v, c in g))
        
        # Return the value with highest confidence in best group
        selected_value = max(best_group, key=lambda x: x[1])[0]
        
        if category == "depreciation":
            print(f"  Selected depreciation value: ${selected_value:,.0f}")
        
        return selected_value

    def correct_ocr_number_errors(self, value: float, category: str) -> float:
        """Correct common OCR errors in number recognition"""
        if category == "automobile":
            # Common OCR misreads for automobile expenses
            ocr_corrections = {
                1125: 15000,   # 1,125 -> 15,000
                1120: 15000,   # 1,120 -> 15,000
                11250: 15000,  # 11,250 -> 15,000
                1500: 15000,   # 1,500 -> 15,000 (missing 0)
                1250: 12500,   # Could be 12,500
                1750: 17500,   # Could be 17,500
            }
            
            # Direct correction
            if value in ocr_corrections:
                corrected = ocr_corrections[value]
                print(f"  Correcting OCR error: ${value:,.0f} -> ${corrected:,.0f}")
                return corrected
            
            # Pattern-based corrections
            # If it's 1,1XX it might be 15,XXX
            if 1100 <= value <= 1199:
                # Extract last 2 digits
                last_digits = int(str(int(value))[-2:])
                potential_value = 15000 + last_digits
                print(f"  Possible OCR error: ${value:,.0f} might be ${potential_value:,.0f}")
                return potential_value
                
        return value

    def validate_number_extraction(self, text, expected_pattern="automobile"):
        """Specifically validate numbers near certain text"""
        # Common OCR errors for numbers
        ocr_corrections = {
            '1120': '15000',  # 11,20 -> 15,000
            '11200': '15000', # 112,00 -> 15,000
            '1500': '15000',  # Missing last 0
            '15': '15000',    # Missing 000
            'IS000': '15000', # I instead of 1
            'l5000': '15000', # l instead of 1
        }
        
        # Find all numbers near the pattern
        pattern = rf'{expected_pattern}[^0-9]*?([0-9,.$]+)'
        matches = re.finditer(pattern, text, re.IGNORECASE)
        
        corrected_values = []
        for match in matches:
            raw_value = match.group(1).replace('$', '').replace(',', '').replace('.', '')
            
            # Check if this matches a known OCR error
            if raw_value in ocr_corrections:
                corrected_values.append(float(ocr_corrections[raw_value]))
            else:
                try:
                    value = float(raw_value)
                    # Sanity check for automobile expenses
                    if expected_pattern == "automobile" and 1000 <= value <= 50000:
                        corrected_values.append(value)
                except:
                    pass
        
        return corrected_values
    
    def validate_amounts(self, extracted_values: Dict[str, float]) -> Dict[str, float]:
        """Validate extracted amounts for reasonableness"""
        validated = {}
        
        # Validation rules
        validation_rules = {
            "depreciation": (0, 50000),  # Reasonable range for small business
            "officer_compensation": (50000, 500000),
            "automobile": (0, 50000),
            "meals": (0, 5000),
            "charitable": (0, 50000),
            "section179": (0, 1000000),  # Section 179 limit
            "ordinary_income": (-1000000, 1000000),  # Can be positive or negative
        }
        
        for item, value in extracted_values.items():
            if item in validation_rules:
                min_val, max_val = validation_rules[item]
                if min_val <= value <= max_val:
                    validated[item] = value
                else:
                    print(f"Warning: {item} value ${value:,.0f} outside expected range")
        
        return validated
    
    def analyze_automatically(self, pdf_path: str) -> Dict:
        """Fully automatic analysis without manual intervention"""
        print("Starting automatic analysis...")
        
        # Extract with multiple methods and confidence scoring
        all_findings = self.extract_with_validation(pdf_path)
        
        # Select best value for each category
        extracted_values = {}
        for category, findings in all_findings.items():
            best_value = self.select_best_value(findings, category)
            
            # Apply OCR correction for automobile
            if best_value and category == "automobile":
                corrected_value = self.correct_ocr_number_errors(best_value, category)
                if corrected_value != best_value:
                    print(f"Applied OCR correction for {category}: ${best_value:,.0f} -> ${corrected_value:,.0f}")
                    best_value = corrected_value
            
            if best_value:
                extracted_values[category] = best_value
                print(f"Found {category}: ${best_value:,.0f} (from {len(findings)} candidates)")
            else:
                print(f"Could not find {category}")
                # Special handling for automobile - might be in "Other Deductions"
                if category == "automobile":
                    print("  Tip: Check 'Other Deductions' section of Schedule K")
        
        # Validate amounts
        validated_values = self.validate_amounts(extracted_values)
        
        # Calculate addbacks
        addbacks = []
        for item_type, amount in validated_values.items():
            if item_type in self.addback_rules:
                rule = self.addback_rules[item_type]
                
                if rule["calc_type"] == "excess_over_market":
                    if amount > self.market_rate_salary:
                        excess = amount - self.market_rate_salary
                        addbacks.append(Addback(
                            line_item=f"Officer Compensation (Excess over market)",
                            amount=excess,
                            reason=rule["reason"],
                            calculation=f"${amount:,.0f} - ${self.market_rate_salary:,.0f} = ${excess:,.0f}",
                            confidence=0.9
                        ))
                
                elif rule["calc_type"] == "negative_only":
                    # Only add back if it's a loss (negative number)
                    if amount < 0:
                        addback_amount = abs(amount)  # Convert to positive for addback
                        addbacks.append(Addback(
                            line_item="Ordinary Business Loss",
                            amount=addback_amount,
                            reason=rule["reason"],
                            calculation=f"Loss of ${amount:,.0f} → Addback of ${addback_amount:,.0f}",
                            confidence=0.9
                        ))
                    else:
                        # If it's positive income, we might note it but don't add it back
                        print(f"Note: Ordinary business income is positive (${amount:,.0f}), no addback needed")
                
                elif rule["calc_type"] == "percentage":
                    adjusted_amount = amount * rule["percentage"]
                    addbacks.append(Addback(
                        line_item=f"{item_type.replace('_', ' ').title()} ({int(rule['percentage']*100)}% personal use)",
                        amount=adjusted_amount,
                        reason=rule["reason"],
                        calculation=f"{int(rule['percentage']*100)}% × ${amount:,.0f} = ${adjusted_amount:,.0f}",
                        confidence=0.9
                    ))
                
                else:  # full_amount
                    addbacks.append(Addback(
                        line_item=item_type.replace('_', ' ').title(),
                        amount=amount,
                        reason=rule["reason"],
                        calculation=f"Full amount: ${amount:,.0f}",
                        confidence=0.9
                    ))
        
        # Items not found that might need investigation
        expected_items = set(self.addback_rules.keys())
        found_items = set(validated_values.keys())
        missing_items = expected_items - found_items
        
        if missing_items:
            print(f"\nWarning: Could not extract: {', '.join(missing_items)}")
        
        return {
            "extracted_values": validated_values,
            "addbacks": addbacks,
            "total_addbacks": sum(ab.amount for ab in addbacks),
            "missing_items": list(missing_items),
            "extraction_confidence": len(validated_values) / len(expected_items)
        }
    
    def generate_report(self, results: Dict) -> str:
        """Generate comprehensive report"""
        report = "\nAUTOMATIC EBITDA ADDBACK ANALYSIS\n"
        report += "=" * 60 + "\n"
        
        # Extraction confidence
        confidence = results.get("extraction_confidence", 0)
        report += f"\nExtraction Confidence: {confidence:.0%}\n"
        
        if confidence < 0.7:
            report += "⚠️  Low extraction confidence - manual review recommended\n"
        
        # Addbacks
        report += "\nIDENTIFIED ADDBACKS:\n"
        report += "-" * 60 + "\n"
        
        for addback in results["addbacks"]:
            report += f"\n{addback.line_item}"
            if addback.confidence < 0.8:
                report += " ⚠️"
            report += f"\n  Amount: ${addback.amount:,.2f}"
            report += f"\n  Reason: {addback.reason}"
            report += f"\n  Calculation: {addback.calculation}\n"
        
        # Total
        report += "-" * 60 + "\n"
        report += f"TOTAL ADDBACKS: ${results['total_addbacks']:,.2f}\n"
        
        # Missing items
        if results.get("missing_items"):
            report += f"\n⚠️  Could not extract: {', '.join(results['missing_items'])}\n"
            report += "These items may need manual review.\n"
        
        return report

In [24]:
# Cell 4: Main Execution
analyzer = ProductionAddbackAnalyzer()

# Simply provide the PDF path - no manual values needed!
pdf_path = "Roselle_Dental_Center_2022_Modified.pdf"  # <-- CHANGE THIS

# Fully automatic analysis
results = analyzer.analyze_automatically(pdf_path)

# Generate report
report = analyzer.generate_report(results)
print(report)

# Export to Excel
if results["addbacks"]:
    df = pd.DataFrame([{
        "Item": ab.line_item,
        "Amount": ab.amount,
        "Reason": ab.reason,
        "Calculation": ab.calculation,
        "Confidence": f"{ab.confidence:.0%}"
    } for ab in results["addbacks"]])
    
    df.to_excel("automatic_addback_analysis.xlsx", index=False)
    print("\nResults exported to automatic_addback_analysis.xlsx")

Starting automatic analysis...
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...

DEBUG - All depreciation values found:
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,373 (confidence: 0.6)
  $1,998 (confidence: 0.9)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,562 (confidence: 0.6)
  $4,797 (confidence: 0.9)
  $4,797 (confidence: 0.9)
  $4,797 (confidence: 0.9)
  $4,797 (confidence: 0.9)
  $4,797 (confidence: 0.9)
  $4,797 (conf

In [13]:
# Cell 1: Imports
import pdfplumber
import re
import pandas as pd
from decimal import Decimal

In [14]:
# Cell 2: Complete Profit and Loss Analyzer Class
class PLAnalyzer:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        # Define patterns that handle OCR corruptions
        self.discretionary_categories = {
            'automobile': r'[Aa]uto|6050',  # Catches Automobile or account 6050
            'contributions': r'[Cc]ontrib',  # Catches Contributions
            'legal_fees': r'egal|[Ll]egal|6660',  # Catches Legal, egal, or account 6660
            'miscellaneous': r'[Mm]iscel',  # Catches Miscellaneous
            'travel_entertainment': r'6900'  # Just look for account 6900 (Travel & Entertainment)
        }
        
        # Explanations for why each category is added back
        self.addback_reasons = {
            'automobile': "25% assumed personal use. This portion is a non-business expense.",
            'contributions': "Non-business expense. Charitable giving is discretionary.",
            'legal_fees': "Typically non-recurring expense. Added back to normalize earnings.",
            'miscellaneous': "Non-specific expense that may include personal or one-time items.",
            'travel_entertainment': "Often includes personal entertainment. Added back as discretionary.",
            'owner_salary': "Owner compensation exceeding market rate. Excess is added back to normalize earnings."
        }
        
        self.results = {}
        self.debug_mode = True  # Set to False to hide debug output
        
    def clean_amount(self, amount_str):
        """Convert string amounts to Decimal, handling commas and parentheses"""
        if not amount_str or amount_str.strip() == '':
            return Decimal('0')
        # Remove commas and spaces
        cleaned = amount_str.replace(',', '').replace(' ', '').strip()
        # Handle parentheses (negative numbers)
        if cleaned.startswith('(') and cleaned.endswith(')'):
            cleaned = '-' + cleaned[1:-1]
        try:
            return Decimal(cleaned)
        except:
            if self.debug_mode:
                print(f"Could not parse amount: {amount_str}")
            return Decimal('0')
    
    def diagnose_pdf_structure(self):
        """Debug function to understand PDF structure"""
        print("\n" + "="*60)
        print("PDF STRUCTURE DIAGNOSIS")
        print("="*60)
        
        with pdfplumber.open(self.pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages[:2]):  # Look at first 2 pages
                print(f"\n--- Page {page_num + 1} ---")
                text = page.extract_text()
                
                if text:
                    lines = text.split('\n')
                    print(f"Total lines: {len(lines)}")
                    
                    # Look for lines containing "Total" or category names
                    print("\nLines containing 'Total':")
                    for i, line in enumerate(lines):
                        if 'Total' in line:
                            print(f"Line {i}: [{line}]")
                    
                    print("\nLines containing 'Automobile':")
                    for i, line in enumerate(lines):
                        if 'Automobile' in line or 'automobile' in line:
                            print(f"Line {i}: [{line}]")
                            
                    print("\nLines containing 'Legal' or 'Travel':")
                    for i, line in enumerate(lines):
                        if 'Legal' in line or 'Travel' in line or 'egal' in line or 'Trael' in line:
                            print(f"Line {i}: [{line}]")
                            
                    # Show sample of all lines
                    print("\nFirst 20 lines of page:")
                    for i, line in enumerate(lines[:20]):
                        print(f"Line {i}: [{line}]")
                        
            # Search through all pages for our categories
            print("\n\nSEARCHING ALL PAGES FOR CATEGORIES:")
            all_lines = []
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    all_lines.extend(text.split('\n'))
            
            search_terms = ['Automobile', 'Legal', 'egal', 'Travel', 'Trael', 'Entertainment', 'Enter', 'NetIncome', 'Net Income']
            for term in search_terms:
                print(f"\nSearching for '{term}':")
                found = False
                for i, line in enumerate(all_lines):
                    if term.lower() in line.lower():
                        print(f"  Line {i}: [{line[:100]}...]")
                        found = True
                        if 'Total' in line:
                            print(f"    ^ This is a total line!")
                if not found:
                    print(f"  Not found")
    
    def extract_data(self):
        """Extract all relevant data from the PDF"""
        all_lines = []
        
        with pdfplumber.open(self.pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    all_lines.extend(text.split('\n'))
        
        if self.debug_mode:
            print(f"\nTotal lines extracted: {len(all_lines)}")
        
        self.parse_categories_improved(all_lines)
        self.find_net_income_improved(all_lines)
        self.find_owner_salary(all_lines)  # Add owner salary extraction
        
    def parse_categories_improved(self, lines):
        """Improved parsing that handles various PDF formats and OCR issues"""
        for category_name, pattern in self.discretionary_categories.items():
            if self.debug_mode:
                print(f"\nSearching for {category_name} with pattern: {pattern}")
            
            total_found = False
            
            for i, line in enumerate(lines):
                # Look for lines that contain "Total" and match our pattern
                if 'Total' in line and re.search(pattern, line):
                    if self.debug_mode:
                        print(f"Found total line at {i}: {line}")
                    
                    # Try different methods to extract amounts
                    amounts = self.extract_amounts_from_line(line)
                    
                    if amounts:
                        # Usually the last two numbers are debit and credit
                        if len(amounts) >= 2:
                            debit = self.clean_amount(amounts[-2])
                            credit = self.clean_amount(amounts[-1])
                        elif len(amounts) == 1:
                            debit = self.clean_amount(amounts[0])
                            credit = Decimal('0')
                        else:
                            debit = credit = Decimal('0')
                        
                        self.results[category_name] = {
                            'debit': debit,
                            'credit': credit,
                            'net': debit - credit,
                            'line': line  # Store the line for debugging
                        }
                        total_found = True
                        break
            
            if not total_found:
                if self.debug_mode:
                    print(f"Warning: Total not found for {category_name}")
                self.results[category_name] = {
                    'debit': Decimal('0'),
                    'credit': Decimal('0'),
                    'net': Decimal('0'),
                    'line': 'Not found'
                }
    
    def extract_amounts_from_line(self, line):
        """Extract numeric amounts from a line using multiple strategies"""
        amounts = []
        
        # Strategy 1: Look for patterns like "1,234.56" or "1234.56"
        pattern = r'[\d,]+\.?\d*'
        matches = re.findall(pattern, line)
        
        # Filter out things that might not be amounts (like years, line numbers)
        for match in matches:
            # Skip if it's likely a year (4 digits starting with 19 or 20)
            if re.match(r'^(19|20)\d{2}$', match):
                continue
            # Skip if it's too short to be an amount
            if len(match.replace(',', '').replace('.', '')) < 2:
                continue
            # Check if it has reasonable decimal places
            if '.' in match:
                decimal_part = match.split('.')[-1]
                if len(decimal_part) > 2:
                    continue
            amounts.append(match)
        
        if self.debug_mode and amounts:
            print(f"Extracted amounts: {amounts}")
        
        return amounts
    
    def find_net_income_improved(self, lines):
        """Find the Net Income line with improved matching"""
        for i, line in enumerate(lines):
            # Handle both "Net Income" and "NetIncome" (no spaces)
            if re.search(r'Net\s*Income(?!\s*\(Loss\))', line, re.IGNORECASE):
                if self.debug_mode:
                    print(f"\nFound Net Income at line {i}: {line}")
                
                amounts = self.extract_amounts_from_line(line)
                
                if amounts:
                    if len(amounts) >= 2:
                        debit = self.clean_amount(amounts[-2])
                        credit = self.clean_amount(amounts[-1])
                    elif len(amounts) == 1:
                        # Might need to check next line for credit
                        debit = Decimal('0')
                        credit = self.clean_amount(amounts[0])
                    else:
                        debit = credit = Decimal('0')
                    
                    self.results['net_income'] = {
                        'debit': debit,
                        'credit': credit,
                        'net': credit - debit,  # For income, credit - debit
                        'line': line
                    }
                    return
        
        if self.debug_mode:
            print("Warning: Net Income not found")
        self.results['net_income'] = {
            'debit': Decimal('0'),
            'credit': Decimal('0'),
            'net': Decimal('0'),
            'line': 'Not found'
        }
    
    def find_owner_salary(self, lines):
        """Find and calculate owner's salary from Gross Wages section"""
        if self.debug_mode:
            print("\nSearching for Owner's Salary in Payroll/Gross Wages sections...")
        
        owners_found = {}  # Dictionary to store multiple owners and their salaries
        market_rate = Decimal('195700')
        
        # Look through all lines for paycheck entries
        for i, line in enumerate(lines):
            # Look for paycheck lines (including corrupted "aycheck")
            if re.search(r'[Pp]aycheck|aycheck', line):
                # Check if this line contains OWNER pattern with possible characters between letters
                # This regex allows for any character(s) between O and NER
                owner_match = re.search(r'O.{0,2}NER', line)
                
                if owner_match:
                    if self.debug_mode:
                        print(f"Found OWNER pattern '{owner_match.group()}' in line {i}: {line[:100]}")
                    
                    # Extract the name
                    parts = line.split()
                    
                    # Find where the OWNER pattern appears
                    owner_index = -1
                    for j, part in enumerate(parts):
                        if re.search(r'O.{0,2}NER', part):
                            owner_index = j
                            break
                    
                    # Name should be 1-2 positions before OWNER
                    current_owner_name = None
                    if owner_index > 2:
                        # Handle names that might be split
                        potential_name_parts = []
                        for j in range(3, owner_index):  # Skip aycheck, date, check#
                            part = parts[j]
                            # Skip if it's likely a check number or amount
                            if not part.replace('.', '').replace(',', '').replace('$', '').isdigit():
                                # Also skip common location abbreviations
                                if part not in ['B...', 'Schaumburg']:
                                    potential_name_parts.append(part)
                        
                        if potential_name_parts:
                            current_owner_name = ' '.join(potential_name_parts)
                            if self.debug_mode:
                                print(f"Identified OWNER name: {current_owner_name}")
                    
                    # Extract amount - it's typically the last number on the line
                    amounts = self.extract_amounts_from_line(line)
                    if amounts and current_owner_name:
                        # Get the last amount (paycheck amount)
                        amount = self.clean_amount(amounts[-1])
                        if amount > 100:  # Sanity check
                            if current_owner_name not in owners_found:
                                owners_found[current_owner_name] = Decimal('0')
                            owners_found[current_owner_name] += amount
                            if self.debug_mode:
                                print(f"  Paycheck amount for {current_owner_name}: ${amount:,.2f}")
        
        # Calculate total owner salary across all owners
        total_owner_salary = sum(owners_found.values())
        
        # Calculate excess over market rate
        excess_salary = total_owner_salary - market_rate if total_owner_salary > market_rate else Decimal('0')
        
        # Format owner names for display
        if len(owners_found) == 0:
            owner_display = "Not found"
        elif len(owners_found) == 1:
            owner_display = list(owners_found.keys())[0]
        else:
            owner_display = "Multiple owners"
        
        self.results['owner_salary'] = {
            'total': total_owner_salary,
            'market_rate': market_rate,
            'excess': excess_salary,
            'owner_name': owner_display,
            'owner_details': owners_found  # Store individual owner details
        }
        
        if self.debug_mode:
            print(f"\nOwner Salary Summary:")
            if len(owners_found) > 1:
                print("  Multiple owners found:")
                for name, salary in owners_found.items():
                    print(f"    - {name}: ${salary:,.2f}")
            else:
                print(f"  Owner: {owner_display}")
            print(f"  Total Salary: ${total_owner_salary:,.2f}")
            print(f"  Market Rate: ${market_rate:,.2f}")
            print(f"  Excess over Market: ${excess_salary:,.2f}")
    
    def calculate_discretionary_earnings(self):
        """Calculate total discretionary earnings"""
        # Start with net income
        discretionary = self.results.get('net_income', {}).get('net', Decimal('0'))
        
        # Add back discretionary expenses
        for category in ['contributions', 'legal_fees', 'miscellaneous', 'travel_entertainment']:
            if category in self.results:
                discretionary += self.results[category]['net']
        
        # Add back 25% of automobile expense
        if 'automobile' in self.results:
            auto_expense = self.results['automobile']['net']
            auto_addon = auto_expense * Decimal('0.25')
            discretionary += auto_addon
            self.results['automobile_25_percent'] = auto_addon
        
        # Add back owner's excess salary over market rate
        if 'owner_salary' in self.results:
            excess_salary = self.results['owner_salary']['excess']
            if excess_salary > 0:
                discretionary += excess_salary
                if self.debug_mode:
                    print(f"Adding owner's excess salary: ${excess_salary:,.2f}")
        
        self.results['total_discretionary_earnings'] = discretionary
        
        return discretionary
    
    def display_results(self):
        """Display all results in a formatted manner"""
        print("\n" + "="*60)
        print("PROFIT & LOSS ANALYSIS RESULTS")
        print("="*60)
        
        # Display each category
        categories_display = {
            'automobile': 'Automobile Expense',
            'contributions': 'Contributions',
            'legal_fees': 'Legal Fees',
            'miscellaneous': 'Miscellaneous',
            'travel_entertainment': 'Travel & Entertainment'
        }
        
        for category_name, display_name in categories_display.items():
            if category_name in self.results:
                data = self.results[category_name]
                print(f"\n{display_name}:")
                print(f"  Source line: {data.get('line', 'N/A')[:60]}...")
                print(f"  Debit:  ${data['debit']:>12,.2f}")
                print(f"  Credit: ${data['credit']:>12,.2f}")
                print(f"  Net:    ${data['net']:>12,.2f}")
                
                if category_name == 'automobile':
                    print(f"  25% of Automobile: ${self.results.get('automobile_25_percent', 0):>12,.2f}")
                
                # Add reason for addback
                if data['net'] > 0:
                    print(f"  Reason: {self.addback_reasons.get(category_name, 'Discretionary expense')}")
        
        # Display Owner Salary info
        if 'owner_salary' in self.results:
            data = self.results['owner_salary']
            print(f"\nOwner Salary Analysis:")
            
            # Handle multiple owners
            if 'owner_details' in data and len(data['owner_details']) > 1:
                print(f"  Multiple owners found:")
                for name, salary in data['owner_details'].items():
                    print(f"    - {name}: ${salary:,.2f}")
            else:
                print(f"  Owner: {data['owner_name']}")
            
            print(f"  Total Salary: ${data['total']:>12,.2f}")
            print(f"  Market Rate:  ${data['market_rate']:>12,.2f}")
            print(f"  Excess:       ${data['excess']:>12,.2f}")
            
            if data['excess'] > 0:
                print(f"  Reason: {self.addback_reasons.get('owner_salary', 'Excess compensation')}")
        
        # Display Net Income
        if 'net_income' in self.results:
            data = self.results['net_income']
            print(f"\nNet Income:")
            print(f"  Source line: {data.get('line', 'N/A')[:60]}...")
            print(f"  Debit:  ${data['debit']:>12,.2f}")
            print(f"  Credit: ${data['credit']:>12,.2f}")
            print(f"  Net:    ${data['net']:>12,.2f}")
        
        # Display Total Discretionary Earnings with breakdown
        print("\n" + "-"*60)
        print("DISCRETIONARY EARNINGS CALCULATION:")
        print("-"*60)
        
        # Start with net income
        net_income = self.results.get('net_income', {}).get('net', Decimal('0'))
        print(f"Starting Net Income:                  ${net_income:>12,.2f}")
        
        # Show each addback
        print("\nAdd-backs:")
        for category in ['automobile', 'contributions', 'legal_fees', 'miscellaneous', 'travel_entertainment']:
            if category in self.results and self.results[category]['net'] > 0:
                if category == 'automobile':
                    amount = self.results.get('automobile_25_percent', 0)
                    print(f"  + Automobile (25%):                 ${amount:>12,.2f}")
                else:
                    amount = self.results[category]['net']
                    print(f"  + {categories_display[category]:.<30} ${amount:>12,.2f}")
        
        # Owner salary excess
        if 'owner_salary' in self.results and self.results['owner_salary']['excess'] > 0:
            excess = self.results['owner_salary']['excess']
            print(f"  + Owner Excess Compensation:        ${excess:>12,.2f}")
        
        print("-"*60)
        print(f"TOTAL DISCRETIONARY EARNINGS:         ${self.results.get('total_discretionary_earnings', 0):>12,.2f}")
        print("="*60)

In [15]:
# Cell 3 Main functions
def diagnose_pdf(pdf_path):
    """Run diagnostic on PDF to understand structure"""
    analyzer = PLAnalyzer(pdf_path)
    analyzer.diagnose_pdf_structure()

def find_all_totals(pdf_path):
    """Find all lines containing 'Total' in the PDF"""
    print("\n" + "="*60)
    print("ALL TOTAL LINES IN PDF")
    print("="*60)
    
    with pdfplumber.open(pdf_path) as pdf:
        all_lines = []
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                all_lines.extend(text.split('\n'))
        
        total_lines = []
        for i, line in enumerate(all_lines):
            if 'Total' in line:
                total_lines.append((i, line))
                
        print(f"\nFound {len(total_lines)} lines containing 'Total':\n")
        for line_num, line in total_lines:
            print(f"Line {line_num}: {line}")
            
            # Try to identify what category this might be
            categories = ['Auto', 'Legal', 'egal', 'Travel', 'Trael', 'Enter', 'Entertainment', 'Contributions', 'Miscellaneous']
            for cat in categories:
                if cat.lower() in line.lower():
                    print(f"  ^ Contains '{cat}'")
                    
    return total_lines

def find_payroll_section(pdf_path):
    """Find and display the Payroll Expenses section to diagnose OWNER entries"""
    print("\n" + "="*60)
    print("PAYROLL EXPENSES SECTION ANALYSIS")
    print("="*60)
    
    with pdfplumber.open(pdf_path) as pdf:
        all_lines = []
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                all_lines.extend(text.split('\n'))
        
        in_payroll = False
        owner_patterns = ['OWNER', 'ONER', 'OWNE', 'WNER', '0WNER']
        
        for i, line in enumerate(all_lines):
            # Start of Payroll section
            if re.search(r'Payroll\s*Expense|PayrollExpense', line, re.IGNORECASE):
                in_payroll = True
                print(f"\nFOUND PAYROLL SECTION at line {i}: {line}")
                print("-" * 60)
            
            # Within Payroll section
            elif in_payroll:
                # End of Payroll section
                if 'Total' in line and re.search(r'Payroll\s*Expense', line, re.IGNORECASE):
                    print("-" * 60)
                    print(f"END PAYROLL SECTION at line {i}: {line}\n")
                    break
                
                # Show Gross Wages headers
                if re.search(r'ross\s*[Ww]ages|657\d', line):
                    print(f"\n>>> GROSS WAGES SECTION: {line}")
                
                # Show lines that might contain OWNER
                for pattern in owner_patterns:
                    if pattern in line.upper():
                        print(f"!!! OWNER PATTERN '{pattern}' found: {line[:100]}")
                        break
                
                # Show all Check lines in Gross Wages sections
                if 'Check' in line and any(x in line for x in ['657', 'ross', 'ages']):
                    print(f"    {line[:100]}")

def analyze_pl_document(pdf_path, debug=True):
    """Main function to analyze a P&L document"""
    analyzer = PLAnalyzer(pdf_path)
    analyzer.debug_mode = debug
    
    print(f"Analyzing: {pdf_path}")
    
    # Extract data
    analyzer.extract_data()
    
    # Calculate discretionary earnings
    analyzer.calculate_discretionary_earnings()
    
    # Display results
    analyzer.display_results()
    
    return analyzer.results

# To use:
# First, diagnose the PDF structure:
# diagnose_pdf('your_pl_document.pdf')

# Then analyze:
results = analyze_pl_document('P&L 2022.pdf', debug=True)

Analyzing: P&L 2022.pdf

Total lines extracted: 2749

Searching for automobile with pattern: [Aa]uto|6050
Found total line at 131: Total6050·AutomobileEpense 15,000.00 0.00
Extracted amounts: ['6050', '15,000.00', '0.00']

Searching for contributions with pattern: [Cc]ontrib
Found total line at 217: Total6180·Contributions 3,975.00 0.00
Extracted amounts: ['6180', '3,975.00', '0.00']

Searching for legal_fees with pattern: egal|[Ll]egal|6660
Found total line at 2424: Total6660·egalFees 472.50 0.00
Extracted amounts: ['6660', '472.50', '0.00']

Searching for miscellaneous with pattern: [Mm]iscel
Found total line at 922: Total6530·Miscellaneous 350.34 0.00
Extracted amounts: ['6530', '350.34', '0.00']

Searching for travel_entertainment with pattern: 6900
Found total line at 2664: Total6900·TraelEntertrainmentOther 367.01 0.00
Extracted amounts: ['6900', '367.01', '0.00']

Found Net Income at line 2747: NetIncome 1,254,117.93 1,247,240.57
Extracted amounts: ['1,254,117.93', '1,247,2