In [None]:
import re
from typing import List

class SQLValidator:
    """Minimal SQL injection protection for controlled environments.

    Checks identifier strings against an ASCII whitelist, a pattern of
    dangerous character sequences, and a short list of reserved keywords.
    Both entry points are static and signal failure by raising ``ValueError``.
    """

    # Core reserved keywords that cause actual problems
    CRITICAL_KEYWORDS = {
        'select', 'from', 'where', 'drop', 'delete', 'insert',
        'update', 'union', 'exec', 'execute'
    }

    # Single regex for all dangerous patterns
    DANGEROUS_PATTERN = re.compile(
        r"--|/\*|\*/|;|'|\"|\\|0x[0-9a-f]+|\$\{",
        re.IGNORECASE
    )

    # Character whitelists, compiled once instead of on every call
    _PLAIN_CHARS = re.compile(r'[a-zA-Z0-9_]+')
    _DOTTED_CHARS = re.compile(r'[a-zA-Z0-9_.]+')

    @staticmethod
    def validate_identifier(identifier: str, allow_dots: bool = False) -> None:
        """Validate one identifier; return ``None`` if it is acceptable.

        Backticks at either end of ``identifier`` are stripped before the
        checks run (they are not required to be balanced).

        Args:
            identifier: the identifier string to check.
            allow_dots: when true, ``.`` is accepted as a separator and every
                dot-separated part must itself start with a letter or
                underscore and must not be a reserved keyword.

        Raises:
            ValueError: if ``identifier`` is empty or not a string, matches
                ``DANGEROUS_PATTERN``, contains characters outside the
                whitelist, has a part that is empty or does not start with a
                letter/underscore, or has a part listed in
                ``CRITICAL_KEYWORDS``.
        """
        if not identifier or not isinstance(identifier, str):
            raise ValueError("Invalid identifier: empty or not string")

        # Strip backticks if present
        clean = identifier.strip('`')

        # Check dangerous patterns (fastest check first)
        if SQLValidator.DANGEROUS_PATTERN.search(clean):
            raise ValueError(f"Identifier contains dangerous characters: {identifier}")

        # Basic character check. fullmatch is required here: a '^...$' pattern
        # with re.match also accepts a trailing newline, because '$' matches
        # just before a final '\n'.
        allowed = SQLValidator._DOTTED_CHARS if allow_dots else SQLValidator._PLAIN_CHARS
        if not allowed.fullmatch(clean):
            raise ValueError(f"Identifier contains invalid characters: {identifier}")

        # Without allow_dots the whitelist has already excluded '.', so the
        # split yields exactly one part and the checks apply to the whole name.
        for part in clean.split('.'):
            # Must start with letter or underscore (an empty part, as in
            # "a..b" or "a.", fails this check too)
            if not part or not (part[0].isalpha() or part[0] == '_'):
                raise ValueError(f"Identifier must start with letter/underscore: {identifier}")

            # Check critical reserved keywords only
            if part.lower() in SQLValidator.CRITICAL_KEYWORDS:
                raise ValueError(f"Identifier is a reserved keyword: {identifier}")

    @staticmethod
    def validate_batch(identifiers: List[str], allow_dots: bool = False) -> None:
        """Validate several identifiers and report every failure at once.

        Args:
            identifiers: identifier strings to check, in order.
            allow_dots: forwarded to ``validate_identifier`` for each item.

        Raises:
            ValueError: if at least one identifier is rejected. The message
                starts with ``"Validation failed:"`` followed by one line per
                rejected item, formatted ``"[index] identifier: reason"``.
        """
        errors = []
        for idx, identifier in enumerate(identifiers):
            try:
                SQLValidator.validate_identifier(identifier, allow_dots)
            except ValueError as exc:
                errors.append(f"[{idx}] {identifier}: {exc}")

        if errors:
            raise ValueError("Validation failed:\n" + "\n".join(errors))

class Validator:
    def _add_issue(self, severity: WarningSeverity, stage: str, entity: str, 
                   message: str, details: Optional[Dict] = None):
        """Record one validation issue.

        Builds a ``ValidationIssue`` from the arguments and appends it to
        ``self._validation_issues``. When ``severity`` equals
        ``WarningSeverity.ERROR`` the ``self._has_errors`` flag is also set.
        """
        issue = ValidationIssue(severity, stage, entity, message, details)
        self._validation_issues.append(issue)

        # A single ERROR-severity issue is enough to flag the whole run.
        is_error = severity == WarningSeverity.ERROR
        if is_error:
            self._has_errors = True
    
    def _validate_identifier(self, identifier: str, allow_dots: bool = False) -> None:
        """Validate one identifier through ``self.validator``.

        On rejection, an ERROR issue carrying the validator's message is
        recorded via ``_add_issue`` and the original ``ValueError`` is
        re-raised to the caller.
        """
        try:
            self.validator.validate_identifier(identifier, allow_dots)
        except ValueError as exc:
            reason = str(exc)
            self._add_issue(WarningSeverity.ERROR, "validation", "identifier", reason)
            raise
    
    def _validate_batch(self, identifiers: List[str], allow_dots: bool = False) -> None:
        """Validate several identifiers in one call and record any failure.

        Args:
            identifiers: identifier strings to check.
            allow_dots: forwarded to ``self.validator.validate_batch``.
                Defaults to ``False``, the value this method used to
                hard-code, so existing callers are unaffected.

        Raises:
            ValueError: re-raised from the underlying validator after an
                ERROR issue with its message has been recorded via
                ``_add_issue``.
        """
        try:
            self.validator.validate_batch(identifiers, allow_dots=allow_dots)
        except ValueError as e:
            self._add_issue(WarningSeverity.ERROR, "validation", "identifiers", str(e))
            raise

    def _validate_satellite_parent(self, sat: Satellite) -> None:
        """Record an ERROR issue if the satellite's parent is unknown.

        The satellite's ``parent_hub_or_link`` is looked up in the collection
        returned by ``self._get_all_parents()``. When it is absent, the issue
        lists the available parents in sorted order.
        """
        known_parents = self._get_all_parents()
        parent = sat.parent_hub_or_link

        if parent in known_parents:
            return

        self._add_issue(
            WarningSeverity.ERROR,
            "satellite",
            sat.name,
            f"Parent '{parent}' not found",
            {"available": sorted(known_parents)}
        )

    def _check_vault_entity_duplicate(self, name: str, collection: list, entity_type: str) -> bool:
        """Report whether ``name`` is already used by an item in ``collection``.

        Args:
            name: the entity name about to be registered.
            collection: items exposing a ``name`` attribute to compare against.
            entity_type: label passed to ``_add_issue`` as the stage and
                embedded in the message text.

        Returns:
            ``True`` if an item with the same name exists (an ERROR issue is
            recorded), ``False`` otherwise.
        """
        if not any(item.name == name for item in collection):
            return False

        self._add_issue(
            WarningSeverity.ERROR,
            entity_type,
            name,
            # The separator is an em dash. It is written as an escape because
            # the literal character had been corrupted by a wrong decode, which
            # leaked garbage characters into this user-facing message.
            f"Duplicate {entity_type} name '{name}' \u2014 already registered"
        )
        return True

    def _detect_duplicate_satellites(self) -> None:
        """Warn about satellites on one source table that share a column set.

        Satellites in ``self._sats`` are bucketed by ``source_table``. Within
        a bucket, satellites collide when their ``resolved_columns`` hold the
        same values regardless of order. Every colliding group produces one
        WARNING issue. A satellite whose ``resolved_columns`` raises
        ``RuntimeError`` is left out of the comparison.
        """
        # Phase 1: bucket satellites per source table (first-seen order).
        sats_per_source = {}
        for satellite in self._sats:
            sats_per_source.setdefault(satellite.source_table, []).append(satellite)

        # Phase 2: inside each bucket, group by order-insensitive column
        # signature and keep only the groups with two or more members.
        collisions = []
        for table, members in sats_per_source.items():
            groups = {}
            for satellite in members:
                try:
                    cols = satellite.resolved_columns
                except RuntimeError:
                    continue
                groups.setdefault(tuple(sorted(cols)), []).append(satellite)

            collisions.extend(
                (table, sig, group)
                for sig, group in groups.items()
                if len(group) > 1
            )

        # Phase 3: report. Runs only after every bucket has been analysed, so
        # no issue is recorded if the analysis above raises part-way through.
        for table, sig, group in collisions:
            names = [member.name for member in group]
            count = len(group)
            self._add_issue(
                WarningSeverity.WARNING,
                "satellite",
                ", ".join(names),
                f"{count} satellites share identical descriptive columns on '{table}'",
                {
                    "source_table": table,
                    "columns": list(sig),
                    "satellites": names,
                    "impact": f"All {count} satellites will generate identical hash diffs in staging table",
                    "suggestion": "Consider using different columns or combining into one satellite"
                }
            )

    def _validate_all_columns(self) -> None:
        """Check every hub's and satellite's columns against its source table.

        Hubs are checked on ``business_key_columns``. Satellites are checked
        on ``resolved_columns`` when ``_resolved_columns`` is truthy, and on
        ``descriptive_columns`` otherwise. The actual lookup and issue
        reporting is delegated to ``_validate_columns_exist``.
        """
        for hub in self._hubs:
            self._validate_columns_exist(
                entity_name=hub.name,
                entity_type="hub",
                columns=hub.business_key_columns,
                source_table=hub.source_table,
            )

        for sat in self._sats:
            if sat._resolved_columns:
                sat_columns = sat.resolved_columns
            else:
                sat_columns = sat.descriptive_columns

            self._validate_columns_exist(
                entity_name=sat.name,
                entity_type="satellite",
                columns=sat_columns,
                source_table=sat.source_table,
            )
            # TODO: a second check against the satellite's stage table
            # (sat._stage schema/table/columns), guarded by registered
            # hubs/links, was sketched here but left disabled because it
            # fails when the stage is new.

    def _validate_columns_exist(
        self, 
        entity_name: str, 
        entity_type: str, 
        columns: List[str], 
        source_table: str
    ) -> None:
        """Record an ERROR issue for columns missing from ``source_table``.

        Column names are compared case-insensitively against the result of
        ``self._get_source_columns(source_table)``. If that result is empty
        the check is skipped. The issue details include at most the first 15
        available column names in sorted order.
        """
        available = self._get_source_columns(source_table)
        if not available:
            # Nothing to compare against; the original code notes that the
            # error for this case has already been logged.
            return

        known_lower = {name.lower() for name in available}
        absent = [name for name in columns if name.lower() not in known_lower]
        if not absent:
            return

        details = {
            "source_table": source_table,
            "missing_columns": absent,
            "available_columns": sorted(available)[:15]
        }
        self._add_issue(
            WarningSeverity.ERROR,
            entity_type,
            entity_name,
            f"Column(s) not found in source: {absent}",
            details
        )
    
