# Geo dupes in DuckDB
**Author**:  Greg Slater <br>
**Date**:  24th September 2024 <br>
**Dataset Scope**: `dataset` <br>
**Report Type**: Ad-hoc analysis <br>
**Purpose**: Minimal reproduction of the method used in the [conservation area geo duplicates report](https://github.com/digital-land/jupyter-analysis/blob/main/reports/find_conservation_area_duplicates/map_conservation_area_duplicates.ipynb), using DuckDB instead of GeoPandas, as this may be closer to the approach that is required to identify geo-duplicates for any dataset in expectations or as part of the add-data command.


In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import duckdb as ddb
from datetime import datetime

# Notebook display: allow up to 100 rows when a DataFrame is rendered.
pd.set_option("display.max_rows", 100)

# Folder containing the SQLite files read by this notebook.
data_dir = "../../data/endpoint_checker/entity_resolution/"
# Today's date formatted as a YYYY-MM-DD string.
td = f"{datetime.today():%Y-%m-%d}"


In [None]:
# Location of the conservation-area SQLite file inside data_dir.
ca_sqlite_path = os.path.join(data_dir, "conservation-area.sqlite3")

# Open a DuckDB connection (no database file is given).
con = ddb.connect()

# Install and load the two extensions used below: sqlite (to read the
# source database) and spatial (geometry type, ST_* functions, RTREE index).
for extension in ("sqlite", "spatial"):
    con.execute(f"INSTALL {extension};")
    con.execute(f"LOAD {extension};")

# Make the SQLite file queryable under the alias sqlite_db.
con.execute(f"ATTACH DATABASE '{ca_sqlite_path}' AS sqlite_db;")

# Rebuild the entity_spatial table from the SQLite entity table, converting the
# WKT geometry text to a GEOMETRY value, then create a spatial index on it.
# Rows with an empty geometry string are skipped.
# Note - remove the LIMIT statement to run on the full entity table; it is
# restricted for now for easier testing.
con.execute("""
    DROP TABLE IF EXISTS entity_spatial;
            
    CREATE TABLE entity_spatial (
    entity INTEGER,
    reference TEXT,
    geom GEOMETRY);
            
    DELETE FROM entity_spatial;

    INSERT INTO entity_spatial (entity, reference, geom)
    SELECT entity, reference, ST_GeomFromText(geometry)
    FROM sqlite_db.entity
    WHERE geometry != ''
    LIMIT 1000;
            
    CREATE INDEX idx ON entity_spatial USING RTREE (geom);
""")


<duckdb.duckdb.DuckDBPyConnection at 0x126cffe70>

In [3]:
# Overlap fraction above which one geometry counts as matching the other.
MATCH_THRESHOLD = 0.95
# Looser overlap fraction deciding which intersecting pairs are written out at
# all; pairs between the two thresholds are labelled 'undefined'.
CANDIDATE_THRESHOLD = 0.9

# Find pairs of entities whose geometries largely overlap and write one row per
# pair to a CSV file in the current working directory.
con.sql(f"""
    WITH calc as (
        -- Every ordered pair of distinct, intersecting entities, with the
        -- intersection area as a share of the union and of each geometry.
        SELECT
            a.entity as entity_a,
            b.entity as entity_b,
            CONCAT(LEAST(a.entity, b.entity), '-', GREATEST(a.entity, b.entity)) AS entity_join_key,
            ST_Area(ST_Intersection(a.geom, b.geom)) / ST_Area(ST_Union(a.geom, b.geom)) as pct_comb_overlap,
            ST_Area(ST_Intersection(a.geom, b.geom)) / ST_Area(a.geom) as pct_overlap_a,
            ST_Area(ST_Intersection(a.geom, b.geom)) / ST_Area(b.geom) as pct_overlap_b
        FROM entity_spatial a
        JOIN entity_spatial b
            ON ST_Intersects(a.geom, b.geom)
            AND a.entity <> b.entity
          ),

    categorised as (

        SELECT
            *,
            CASE
                WHEN pct_overlap_a > {MATCH_THRESHOLD} AND pct_overlap_b > {MATCH_THRESHOLD} THEN 'Complete match (two-way)'
                WHEN pct_overlap_a > {MATCH_THRESHOLD} OR pct_overlap_b > {MATCH_THRESHOLD} THEN 'Single match (one-way)'
            ELSE 'undefined' END as intersection_type,
            -- The self-join yields each pair twice, as (a, b) and (b, a), and
            -- pct_comb_overlap is a symmetric measure, so entity_a is used as
            -- a tie-breaker to make the row kept for each pair reproducible.
            row_number() OVER (PARTITION BY entity_join_key ORDER BY pct_comb_overlap, entity_a) as key_count
        FROM calc
        -- Keep a pair when either geometry is mostly covered by the other.
        WHERE pct_overlap_a > {CANDIDATE_THRESHOLD} OR pct_overlap_b > {CANDIDATE_THRESHOLD}
          )

    -- key_count = 1 keeps a single row per entity pair.
    SELECT *
    FROM categorised
    WHERE key_count = 1
    ORDER BY entity_join_key
""").to_csv("conservation-area_geo_dupes_duckdb.csv")

con.close()