In [18]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pdf2image
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import pandas as pd
import os
import sys

# === CONFIGURATION ===
# Windows only: point this at the ``bin`` folder of your extracted Poppler
# download. When the folder exists it is appended to PATH for this process.
if sys.platform == "win32":
    poppler_path = r"C:\Program Files\poppler\Library\bin"  # <-- UPDATE THIS PATH
    if os.path.exists(poppler_path):
        # Append (not prepend) so binaries already on PATH keep priority.
        os.environ["PATH"] = os.pathsep.join((os.environ["PATH"], poppler_path))

@dataclass
class Addback:
    """One addback line of the analysis.

    Attributes:
        line_item: Label shown for the item in the report.
        amount: Dollar amount of the addback.
        reason: Explanation of why the item is added back.
        calculation: Text describing how ``amount`` was derived
            (empty when not supplied).
        source: Where the underlying figure came from; defaults to
            the tax return.
    """

    # Required fields (positional order is part of the interface).
    line_item: str
    amount: float
    reason: str

    # Optional fields.
    calculation: str = ""
    source: str = "Tax Return"

class RobustAddbackAnalyzer:
    """Find EBITDA addback candidates in a scanned (OCR'd) tax return.

    Workflow: render the PDF pages to images, OCR each page, pull known line
    items out of the text with regular expressions, then turn those amounts
    into ``Addback`` entries according to fixed normalisation rules.
    """

    # Label of the zero-amount entry that carries the investigation notes.
    # Entries with this label are excluded from the addback total.
    REVIEW_LINE_ITEM = "ITEMS REQUIRING REVIEW"

    # Tesseract page-segmentation modes tried on every page.
    OCR_CONFIGS = (
        '--psm 6',   # Uniform block of text
        '--psm 4',   # Single column of text
        '--psm 11',  # Sparse text
    )

    # item key -> (minimum number of digits in the amount, label regexes).
    # Each label regex is followed by the amount pattern built by
    # ``_amount_pattern``; labels are tried in order and the first hit wins.
    # The digit minimum is a heuristic that skips small numbers (such as line
    # numbers) that sit between the label and the amount.
    ITEM_PATTERNS = {
        "officer_compensation": (6, (
            r"compensation.*officers.*?",
            r"HARVEY.*SEYBOLD.*?",
            r"officer.*salary.*?",
            r"Total compensation.*?",
        )),
        "depreciation": (4, (
            r"depreciation.*?",
            r"MACRS.*?",
            r"depr.*amort.*?",
        )),
        "automobile": (5, (
            r"automobile.*expense.*?",
            r"auto.*truck.*?",
            r"vehicle.*?",
        )),
        "charitable": (4, (
            r"charitable.*contrib.*?",
            r"contributions.*?",
        )),
        "meals": (1, (
            r"meals.*\(?50%\)?.*?",
            r"meals.*entertainment.*?",
            r"MEALS.*?",
        )),
        "section179": (5, (
            r"section.*179.*?",
            r"sec.*179.*expense.*?",
        )),
        "interest": (4, (
            r"interest\s+",
            r"interest.*expense.*?",
        )),
        "miscellaneous": (4, (
            r"miscellaneous.*?",
            r"misc\s.*?",
            r"other.*misc.*?",
        )),
        # Legal and professional fees are combined on the tax return.
        "legal_professional": (4, (
            r"legal.*professional.*?",
            r"professional.*legal.*?",
            r"legal.*?",
        )),
    }

    def __init__(self, market_rate_salary: float = 195700,
                 auto_personal_use_rate: float = 0.25):
        """Set up the addback rules.

        Args:
            market_rate_salary: Market-rate owner compensation; only officer
                compensation above this figure is added back.
            auto_personal_use_rate: Fraction of automobile expenses treated as
                personal use and added back.
        """
        self.market_rate_salary = market_rate_salary
        self.auto_personal_use_rate = auto_personal_use_rate

        # Items that are only flagged for review (never added back
        # automatically) once their amount is at or above the threshold.
        self.investigate_items = {
            "miscellaneous": {
                "threshold": 1000,
                "reason": "Requires review - could be one-time or recurring"
            },
            "legal": {
                "threshold": 5000,
                "reason": "Requires review - could be one-time legal matter or ongoing"
            }
        }

        # Description and reasoning text for every automatic addback rule.
        self.addback_definitions = {
            "owner_excess": {
                "description": "Owner Compensation Excess",
                "reason": "Owner salary exceeds market rate. Excess amount is added back to normalize earnings."
            },
            "depreciation": {
                "description": "Depreciation & Amortization",
                "reason": "Non-cash expense. Added back for EBITDA calculation as it doesn't affect cash flow."
            },
            "automobile": {
                "description": "Automobile Expenses",
                "reason": (
                    f"{auto_personal_use_rate:.0%} assumed personal use. "
                    "This portion is discretionary/non-business expense."
                )
            },
            "charitable": {
                "description": "Charitable Contributions",
                "reason": "Non-business expense. Charitable giving is discretionary and not required for operations."
            },
            "meals": {
                "description": "Meals & Entertainment",
                "reason": "50% non-deductible portion represents discretionary spending not essential to operations."
            },
            "section179": {
                "description": "Section 179 Expense",
                "reason": "Accelerated depreciation election. Added back as it's a non-cash tax benefit."
            },
            "interest": {
                "description": "Interest Expense",
                "reason": "Added back for EBITDA calculation. Represents financing decisions, not operations."
            }
        }

    @staticmethod
    def _amount_pattern(min_digits: int) -> str:
        """Return a capturing regex for a whole-dollar amount.

        The amount has at least ``min_digits`` digits with optional comma
        separators. Lookarounds make sure the match is a complete number
        (never the middle or a prefix of a longer one) and is not a
        percentage such as the "50" in "(50%)".
        """
        return (
            r"(?<!\d)(?<!\d,)"
            r"((?:\d,?){" + str(max(min_digits, 1) - 1) + r",}\d)"
            r"(?!,?\d)(?![ \t]*%)"
        )

    def enhance_image_for_ocr(self, image):
        """Return a copy of ``image`` pre-processed for OCR.

        Steps: grayscale, 2x contrast, sharpen, then upscale to twice the
        original width and height with Lanczos resampling.
        """
        grayscale = image.convert('L')
        contrasted = ImageEnhance.Contrast(grayscale).enhance(2.0)
        sharpened = contrasted.filter(ImageFilter.SHARPEN)

        width, height = sharpened.size
        # Pillow < 9.1 has no Image.Resampling enum; the filter then lives
        # directly on the Image module.
        lanczos = getattr(Image, "Resampling", Image).LANCZOS
        return sharpened.resize((width * 2, height * 2), lanczos)

    def extract_with_enhanced_ocr(self, pdf_path: str) -> str:
        """OCR every page of ``pdf_path`` and return the combined text.

        Each page is OCR'd once per entry in ``OCR_CONFIGS``; the result with
        the most 3+ digit numbers is kept (a crude quality metric). Pages are
        joined with newlines.

        Returns:
            The extracted text, or ``""`` when conversion or OCR raises
            (the error is printed, not propagated).
        """
        try:
            print("Converting PDF to images...")
            convert_kwargs = {"dpi": 300}
            # Only hand pdf2image an explicit Poppler folder that really
            # exists; otherwise let it resolve Poppler through PATH.
            poppler_dir = globals().get("poppler_path")
            if sys.platform == "win32" and poppler_dir and os.path.isdir(poppler_dir):
                convert_kwargs["poppler_path"] = poppler_dir
            images = pdf2image.convert_from_path(pdf_path, **convert_kwargs)

            all_text = []
            for page_number, image in enumerate(images, 1):
                print(f"Processing page {page_number} with enhanced OCR...")
                enhanced_image = self.enhance_image_for_ocr(image)

                best_text = ""
                # Start below zero so a page without any numbers still keeps
                # the text of the first configuration.
                best_score = -1
                for config in self.OCR_CONFIGS:
                    text = pytesseract.image_to_string(enhanced_image, config=config)
                    score = len(re.findall(r'\d{3,}', text))
                    if score > best_score:
                        best_score = score
                        best_text = text

                all_text.append(best_text)

            return "\n".join(all_text)

        except Exception as e:
            # Deliberate best-effort boundary: callers fall back to manually
            # supplied values when OCR yields nothing.
            print(f"Error in OCR: {e}")
            return ""

    def parse_known_items(self, text: str) -> Dict[str, float]:
        """Extract known tax-return line items from OCR text.

        For every key in ``ITEM_PATTERNS`` the label regexes are tried in
        order (case-insensitively, label and amount on the same line) and the
        first match supplies the amount.

        Args:
            text: OCR text to search.

        Returns:
            Mapping of item key to amount for every item that was found.
        """
        items = {}
        flags = re.IGNORECASE | re.MULTILINE

        for item_type, (min_digits, label_patterns) in self.ITEM_PATTERNS.items():
            amount_pattern = self._amount_pattern(min_digits)
            for label_pattern in label_patterns:
                match = re.search(label_pattern + amount_pattern, text, flags)
                if not match:
                    continue
                amount_str = match.group(1).replace(',', '')
                try:
                    items[item_type] = float(amount_str)
                except ValueError:
                    # Not a usable number; try the next label pattern.
                    continue
                print(f"Found {item_type}: ${amount_str}")
                break

        return items

    def _full_amount_addback(self, parsed_items, key: str, line_item: str,
                             definition_key: str,
                             calculation_label: str = "Full amount") -> "Optional[Addback]":
        """Build an addback for the full amount of ``key``; None if absent."""
        if key not in parsed_items:
            return None
        amount = parsed_items[key]
        return Addback(
            line_item=line_item,
            amount=amount,
            reason=self.addback_definitions[definition_key]["reason"],
            calculation=f"{calculation_label}: ${amount:,.0f}"
        )

    def calculate_addbacks(self, parsed_items: Dict[str, float]) -> "List[Addback]":
        """Turn parsed line items into addback entries.

        Rules, in output order: owner compensation above the market rate,
        full depreciation, the personal-use share of automobile expenses,
        full charitable contributions, meals, Section 179 and interest.
        Miscellaneous and legal/professional amounts at or above their
        thresholds are not added back; they are collected into one final
        zero-amount ``REVIEW_LINE_ITEM`` entry.

        Args:
            parsed_items: Mapping of item key to amount.

        Returns:
            The list of addbacks (possibly empty).
        """
        addbacks = []

        # 1. Owner compensation: only the excess over the market rate.
        if "officer_compensation" in parsed_items:
            actual_salary = parsed_items["officer_compensation"]
            excess = actual_salary - self.market_rate_salary
            if excess > 0:
                addbacks.append(Addback(
                    line_item="Owner Compensation (Excess over market)",
                    amount=excess,
                    reason=self.addback_definitions["owner_excess"]["reason"],
                    calculation=f"${actual_salary:,.0f} actual - ${self.market_rate_salary:,.0f} market = ${excess:,.0f}"
                ))

        # 2. Depreciation (full amount).
        depreciation = self._full_amount_addback(
            parsed_items, "depreciation", "Depreciation & Amortization", "depreciation")
        if depreciation is not None:
            addbacks.append(depreciation)
            print(f"DEBUG: Depreciation amount being added: ${depreciation.amount:,.0f}")

        # 3. Automobile: only the assumed personal-use share.
        if "automobile" in parsed_items:
            total_auto = parsed_items["automobile"]
            personal_portion = total_auto * self.auto_personal_use_rate
            addbacks.append(Addback(
                line_item="Automobile Expenses (Personal Use)",
                amount=personal_portion,
                reason=self.addback_definitions["automobile"]["reason"],
                calculation=f"{self.auto_personal_use_rate:.0%} of ${total_auto:,.0f} = ${personal_portion:,.0f}"
            ))

        # 4-7. Items added back at the parsed amount.
        simple_rules = (
            ("charitable", "Charitable Contributions", "charitable", "Full amount"),
            ("meals", "Meals & Entertainment", "meals", "Non-deductible portion"),
            ("section179", "Section 179 Expense", "section179", "Full amount"),
            ("interest", "Interest Expense", "interest", "Full amount"),
        )
        for key, line_item, definition_key, label in simple_rules:
            addback = self._full_amount_addback(
                parsed_items, key, line_item, definition_key, label)
            if addback is not None:
                addbacks.append(addback)

        # 8. Items that are only flagged for investigation.
        investigation_notes = []

        if "miscellaneous" in parsed_items:
            amount = parsed_items["miscellaneous"]
            if amount >= self.investigate_items["miscellaneous"]["threshold"]:
                investigation_notes.append(
                    f"INVESTIGATE: Miscellaneous expenses (${amount:,.0f}) - "
                    "Could be one-time or recurring. Review details before adding back."
                )

        if "legal_professional" in parsed_items:
            amount = parsed_items["legal_professional"]
            if amount >= self.investigate_items["legal"]["threshold"]:
                investigation_notes.append(
                    f"INVESTIGATE: Legal/Professional fees (${amount:,.0f}) - "
                    "Check P&L for breakdown. Only add back if one-time legal matter."
                )

        if investigation_notes:
            addbacks.append(Addback(
                line_item=self.REVIEW_LINE_ITEM,
                amount=0,  # Flag only; contributes nothing to the total
                reason="The following items need investigation before determining if they're addbacks:",
                calculation="\n".join(investigation_notes)
            ))

        return addbacks

    def analyze_with_fallback(self, pdf_path: str, known_values: Optional[Dict] = None,
                             pl_breakdown: Optional[Dict] = None,
                             trust_known_values: bool = True) -> Dict:
        """Analyze a tax return, falling back to manually supplied values.

        Args:
            pdf_path: Path to the PDF file.
            known_values: Known correct amounts keyed like the parsed items.
                The dictionary is never modified.
            pl_breakdown: P&L details; a ``"legal_one_time"`` entry is added
                back as a one-time legal fee.
            trust_known_values: If True, ``known_values`` override OCR
                results. If False, OCR results win and ``known_values`` only
                supply items OCR did not find.

        Returns:
            ``{"parsed_items", "addbacks", "total_addbacks"}``. When neither
            OCR nor ``known_values`` yields any item, the result additionally
            has ``"error"`` and the other entries are empty/zero.
        """
        extracted_text = self.extract_with_enhanced_ocr(pdf_path)
        ocr_items = self.parse_known_items(extracted_text)

        print("\nDEBUG - Items found by OCR:")
        for item, value in ocr_items.items():
            print(f"  {item}: ${value:,.2f}")

        # Always build a new dict so the caller's known_values stays intact
        # when P&L items are added below.
        if known_values and trust_known_values:
            print("\nUsing provided known values (overriding OCR)...")
            parsed_items = {**ocr_items, **known_values}
        elif known_values and not ocr_items:
            print("\nOCR extraction limited. Using provided values...")
            parsed_items = dict(known_values)
        elif known_values:
            print("\nUsing provided known values for items OCR did not find...")
            parsed_items = {**known_values, **ocr_items}
        elif ocr_items:
            parsed_items = ocr_items
        else:
            print("\nOCR extraction failed. Please provide known values.")
            return {
                "error": "OCR failed",
                "parsed_items": {},
                "addbacks": [],
                "total_addbacks": 0.0,
            }

        print("\nDEBUG - Final parsed items being used:")
        for item, value in parsed_items.items():
            print(f"  {item}: ${value:,.2f}")

        # One-time legal fees identified in the P&L become their own item.
        if pl_breakdown and "legal_one_time" in pl_breakdown:
            parsed_items["legal_one_time"] = pl_breakdown["legal_one_time"]

        addbacks = self.calculate_addbacks(parsed_items)

        if "legal_one_time" in parsed_items:
            one_time_legal = parsed_items["legal_one_time"]
            addbacks.append(Addback(
                line_item="Legal Fees (One-time)",
                amount=one_time_legal,
                reason="One-time legal expense not expected to recur",
                calculation=f"Per P&L breakdown: ${one_time_legal:,.0f}"
            ))

        total = sum(
            ab.amount for ab in addbacks
            if ab.line_item != self.REVIEW_LINE_ITEM
        )
        return {
            "parsed_items": parsed_items,
            "addbacks": addbacks,
            "total_addbacks": total
        }

    def generate_report(self, results: Dict) -> str:
        """Render the result of ``analyze_with_fallback`` as a text report.

        Args:
            results: Result dictionary; ``"addbacks"`` and (when there are
                addbacks) ``"total_addbacks"`` are read unless ``"error"``
                is present.

        Returns:
            The report text, or ``"Error: <message>"`` for an error result.
        """
        if "error" in results:
            return f"Error: {results['error']}"

        heavy_rule = "=" * 60 + "\n"
        light_rule = "-" * 60 + "\n"
        parts = ["\nEBITDA ADDBACK ANALYSIS REPORT\n", heavy_rule, "\n"]

        addbacks = results["addbacks"]
        if not addbacks:
            parts.append("No addbacks identified.\n")
            return "".join(parts)

        parts += ["IDENTIFIED ADDBACKS:\n", light_rule, "\n"]
        for number, addback in enumerate(addbacks, 1):
            parts.append(
                f"{number}. {addback.line_item}\n"
                f"   Amount: ${addback.amount:,.2f}\n"
                f"   Reason: {addback.reason}\n"
                f"   Calculation: {addback.calculation}\n"
                "\n"
            )

        parts += [
            light_rule,
            f"TOTAL ADDBACKS: ${results['total_addbacks']:,.2f}\n",
            heavy_rule,
        ]
        return "".join(parts)

# Example usage
if __name__ == "__main__":
    analyzer = RobustAddbackAnalyzer()

    # Tax return to analyze
    pdf_path = "Roselle_Dental_Center_2022_Modified.pdf"  # <-- CHANGE THIS

    # Figures already known for the 2022 tax return. They are passed to the
    # analyzer alongside the PDF so they can be used instead of OCR values.
    known_2022_values = dict(
        officer_compensation=200115,  # Harvey's salary
        depreciation=1373,            # MACRS depreciation
        automobile=15000,             # Automobile expenses
        charitable=3975,              # Charitable contributions
        meals=184,                    # Meals (50%)
        section179=12721,             # Section 179 expense
        miscellaneous=350,            # Miscellaneous expenses
        legal_professional=121585,    # Combined legal and professional
    )

    # Optional P&L detail identifying one-time items
    # (add other P&L specific items here).
    pl_breakdown = dict(
        legal_one_time=473,  # One-time legal fees from P&L
    )

    # trust_known_values=True makes the known values override OCR results.
    results = analyzer.analyze_with_fallback(
        pdf_path=pdf_path,
        known_values=known_2022_values,
        pl_breakdown=pl_breakdown,
        trust_known_values=True,
    )

    # Text report
    report = analyzer.generate_report(results)
    print(report)

    # Excel export: one row per addback followed by a total row
    if results.get("addbacks"):
        df = pd.DataFrame({
            "Addback Item": [ab.line_item for ab in results["addbacks"]],
            "Amount": [ab.amount for ab in results["addbacks"]],
            "Reason": [ab.reason for ab in results["addbacks"]],
            "How Calculated": [ab.calculation for ab in results["addbacks"]],
        })

        summary_row = pd.DataFrame([dict([
            ("Addback Item", "TOTAL ADDBACKS"),
            ("Amount", results["total_addbacks"]),
            ("Reason", ""),
            ("How Calculated", ""),
        ])])
        df = pd.concat([df, summary_row], ignore_index=True)

        df.to_excel("2022_addback_analysis.xlsx", index=False)
        print("\nResults exported to 2022_addback_analysis.xlsx")

Converting PDF to images...
Processing page 1 with enhanced OCR...
Processing page 2 with enhanced OCR...
Processing page 3 with enhanced OCR...
Processing page 4 with enhanced OCR...
Processing page 5 with enhanced OCR...
Found officer_compensation: $200115
Found depreciation: $1645
Found charitable: $3975
Found meals: $183
Found section179: $12721

DEBUG - Items found by OCR:
  officer_compensation: $200,115.00
  depreciation: $1,645.00
  charitable: $3,975.00
  meals: $183.00
  section179: $12,721.00

Using provided known values (overriding OCR)...

DEBUG - Final parsed items being used:
  officer_compensation: $200,115.00
  depreciation: $1,373.00
  charitable: $3,975.00
  meals: $184.00
  section179: $12,721.00
  automobile: $15,000.00
  miscellaneous: $350.00
  legal_professional: $121,585.00
DEBUG: Depreciation amount being added: $1,373

EBITDA ADDBACK ANALYSIS REPORT

IDENTIFIED ADDBACKS:
------------------------------------------------------------

1. Owner Compensation (Exce