# In [None]:
import pandas as pd
import json
from pathlib import Path
from typing import Dict, List, Any, Optional
from shapely.geometry import shape
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)


def load_geojson(file_path: str) -> Dict[str, Any]:
    """
    Read a GeoJSON document from disk and return it as a dictionary.

    Args:
        file_path: Location of the GeoJSON file to read

    Returns:
        The decoded JSON content of the file

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``
        json.JSONDecodeError: If the file content is not valid JSON
    """
    source = Path(file_path)
    if source.exists():
        # Read as UTF-8 explicitly so decoding does not depend on the platform default.
        with source.open('r', encoding='utf-8') as handle:
            return json.load(handle)

    raise FileNotFoundError(f"GeoJSON file not found: {file_path}")


def save_geojson(data: Dict[str, Any], file_path: str) -> None:
    """
    Write a GeoJSON dictionary to disk as indented UTF-8 JSON.

    The target file is opened in write mode, so existing content is replaced.

    Args:
        data: GeoJSON content to serialize
        file_path: Destination path of the output file
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        # Two-space indentation keeps the output readable in a text editor.
        json.dump(data, out_file, indent=2)

    logger.info(f"Saved GeoJSON to {file_path}")


def _coerce_feature_id(raw_id: Any) -> Optional[int]:
    """Return ``raw_id`` as an int, or None when it is missing or not integer-like."""
    try:
        return int(raw_id)
    except (TypeError, ValueError, OverflowError):
        return None


def join_csv_to_geojson(
    csv_path: str,
    geojson_path: str,
    output_path: str,
    id_column: str = 'study_area_id',
    date_column: str = 'date',
    value_column: str = 'predicted_hotspot_count',
    feature_id_property: str = 'id',
) -> None:
    """
    Join CSV predictions to GeoJSON features based on ID matching.

    Each CSV row is grouped by its ID; every GeoJSON feature whose
    ``feature_id_property`` equals that ID receives a ``predictions`` property
    holding a list of ``{date_column: ..., value_column: ...}`` records in CSV
    row order. Features without a match are written out unchanged.

    Rows with an unparseable date (expected as d/m/yyyy) or a missing /
    non-numeric ID are dropped with a warning. Features whose ``properties``
    is not an object, or whose ID is missing or not integer-like, are left
    untouched instead of aborting the run.

    Args:
        csv_path: Path to input CSV file
        geojson_path: Path to input GeoJSON file
        output_path: Path to output GeoJSON file
        id_column: Column name for feature ID in CSV
        date_column: Column name for date in CSV
        value_column: Column name for prediction values in CSV
        feature_id_property: Key in each feature's ``properties`` that holds
            the ID to match against ``id_column``

    Raises:
        FileNotFoundError: If the CSV or GeoJSON file does not exist
        ValueError: If a required column is missing from the CSV
    """
    try:
        # Load CSV data
        logger.info(f"Loading CSV from {csv_path}")
        csv_data = pd.read_csv(csv_path)

        # Strip whitespace from column names (common issue in CSV files)
        csv_data.columns = csv_data.columns.str.strip()
        logger.info(f"CSV columns: {list(csv_data.columns)}")

        # Validate required columns
        required_cols = [id_column, date_column, value_column]
        missing_cols = [col for col in required_cols if col not in csv_data.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns in CSV: {missing_cols}")

        # Convert date format from d/m/yyyy to yyyy-mm-dd
        logger.info("Converting date format from d/m/yyyy to yyyy-mm-dd")
        csv_data[date_column] = pd.to_datetime(
            csv_data[date_column],
            format='%d/%m/%Y',
            errors='coerce'  # Convert invalid dates to NaT
        ).dt.strftime('%Y-%m-%d')

        # Remove rows with invalid dates
        invalid_dates = csv_data[date_column].isna().sum()
        if invalid_dates > 0:
            logger.warning(f"Found {invalid_dates} invalid dates, removing those rows")
            csv_data = csv_data.dropna(subset=[date_column])

        logger.info(f"Sample dates after conversion: {csv_data[date_column].head(3).tolist()}")

        # Load GeoJSON data
        logger.info(f"Loading GeoJSON from {geojson_path}")
        geojson_data = load_geojson(geojson_path)

        # Coerce IDs to integers. Blank or non-numeric IDs become NaN and are
        # dropped with a warning (mirroring the invalid-date handling) instead
        # of making astype(int) raise.
        numeric_ids = pd.to_numeric(csv_data[id_column], errors='coerce')
        has_valid_id = numeric_ids.notna()
        invalid_ids = int((~has_valid_id).sum())
        if invalid_ids > 0:
            logger.warning(
                f"Found {invalid_ids} rows with missing or non-numeric "
                f"{id_column}, removing those rows"
            )
        csv_data = csv_data.loc[has_valid_id].copy()
        csv_data[id_column] = numeric_ids.loc[has_valid_id].astype(int)

        # Build {id: [records]} by iterating the groups directly. This avoids
        # groupby.apply(..., include_groups=False), which only exists on
        # pandas >= 2.2, and yields the same per-ID record lists.
        predictions_dict = {
            int(area_id): group[[date_column, value_column]].to_dict('records')
            for area_id, group in csv_data.groupby(id_column)
        }

        features = geojson_data['features']
        logger.info(f"Processing {len(features)} features")

        # Add predictions to matching GeoJSON features
        matched_count = 0
        for feature in features:
            properties = feature.get('properties')
            if not isinstance(properties, dict):
                # GeoJSON permits "properties": null; nothing to match on.
                continue

            # A missing/invalid ID maps to None (never to a sentinel integer),
            # so it cannot accidentally match a real CSV ID.
            feature_id = _coerce_feature_id(properties.get(feature_id_property))
            predictions = predictions_dict.get(feature_id)
            if predictions is None:
                continue

            properties['predictions'] = predictions
            matched_count += 1

        logger.info(f"Matched {matched_count} features with predictions")

        # Save updated GeoJSON
        save_geojson(geojson_data, output_path)

    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        raise
    except Exception as e:
        logger.error(f"Error joining data: {e}")
        raise


def remove_duplicate_polygons(
    geojson_path: str,
    output_path: str,
    keep_properties: bool = True
) -> int:
    """
    Remove duplicate polygon geometries from GeoJSON.

    Two features are treated as duplicates when their geometries have the
    same WKT (Well-Known Text) representation. The first feature seen with a
    given geometry is kept, together with its properties; later ones are
    dropped. Features whose geometry cannot be converted are also left out of
    the output, but are counted and reported separately from duplicates.

    Args:
        geojson_path: Path to input GeoJSON file
        output_path: Path to output GeoJSON file
        keep_properties: Accepted for backward compatibility. It is not
            consulted by the implementation: the properties of the first
            occurrence are always kept.

    Returns:
        Number of duplicates removed (features skipped for invalid geometry
        are not included in this count)
    """
    try:
        # Load GeoJSON data
        logger.info(f"Loading GeoJSON from {geojson_path}")
        geojson_data = load_geojson(geojson_path)

        features = geojson_data['features']
        original_count = len(features)
        unique_features = []
        seen_geometries = set()
        duplicates_removed = 0
        invalid_count = 0

        logger.info(f"Processing {original_count} features for duplicates")

        for feature in features:
            # Only the geometry conversion is guarded, so an unrelated error
            # is not misreported as an invalid geometry.
            try:
                geom_wkt = shape(feature['geometry']).wkt
            except Exception as e:
                invalid_count += 1
                logger.warning(f"Skipping invalid geometry: {e}")
                continue

            if geom_wkt in seen_geometries:
                duplicates_removed += 1
                continue

            seen_geometries.add(geom_wkt)
            unique_features.append(feature)

        # Update GeoJSON with unique features
        geojson_data['features'] = unique_features

        # Save result
        save_geojson(geojson_data, output_path)

        logger.info(f"Removed {duplicates_removed} duplicate polygons")
        if invalid_count > 0:
            logger.warning(f"Dropped {invalid_count} features with invalid geometry")
        logger.info(f"Remaining features: {len(unique_features)}")

        return duplicates_removed

    except Exception as e:
        logger.error(f"Error removing duplicates: {e}")
        raise


# Main execution
if __name__ == "__main__":
    # The output of the join step is the input of the de-duplication step.
    joined_geojson = "hex_forest_pro_4326_with_predictions.geojson"

    try:
        # Step 1: attach the CSV predictions to the hexagon features
        join_csv_to_geojson(
            csv_path="new_area_predictions.csv",
            geojson_path="hex_forest_pro_4326.geojson",
            output_path=joined_geojson,
        )

        # Step 2: drop features that repeat an already-seen geometry
        remove_duplicate_polygons(
            geojson_path=joined_geojson,
            output_path="hex_forest_pro_4326_predict.geojson",
        )

        logger.info("Processing completed successfully!")

    except Exception as e:
        # Log at the top level, then re-raise so the failure stays visible to the caller.
        logger.error(f"Processing failed: {e}")
        raise