## Testing query performance (WORK IN PROGRESS)

During the [duplicate analysis](geo_analysis/conservation_area_duplication_analysis.ipynb) it became clear that spatialite queries perform poorly, so this notebook is capturing some tests carried out to compare performance between spatialite, geopandas and duckdb.

In [112]:
from download_data import download_dataset
from data import get_entity_dataset, nrow
from plot import plot_map
import spatialite
import pandas as pd
import geopandas as gpd
import os
import itertools
import shapely.wkt

import numpy as np

pd.set_option("display.max_rows", None)


In [105]:
def nrow(df):
    return print(f"No. of records in entity_df: {len(df):,}")

download the sqlite3 file for the data. this will be used to run queries against

In [4]:
# download dataset
# dataset = 'article-4-direction-area'
# collection = 'article-4-direction-collection'
# data_dir = os.path.join('../data/entity_resolution',dataset)
# dataset_path = os.path.join(data_dir,f'{dataset}.sqlite3')

dataset = 'conservation-area'
collection = 'conservation-area-collection'
data_dir = os.path.join('../data/entity_resolution',dataset)
dataset_path = os.path.join(data_dir,f'{dataset}.sqlite3')

In [6]:
# download_dataset(dataset,collection,data_dir)

## Sqlite test

Useful resources for handling geometry and creating spatial index in spatialite:   
https://docs.datasette.io/en/stable/spatialite.html   
http://www.gaia-gis.it/gaia-sins/spatialite-cookbook-5/cookbook_topics.01.html#topic_create_database_italy_2011

In [7]:
# create a new table in the sqlite database with a geometry column and create a spatial index for it

# con = spatialite.connect(dataset_path)

# con.execute("""
#     CREATE TABLE entity_geom AS
#     SELECT  entity,
#             name,
#             organisation_entity,
#             reference,
#             geometry as geometry_wkt
#     FROM entity
#     WHERE st_isvalid(GeomFromText(geometry));

# """
# )

# con.execute("select InitSpatialMetadata(1)")

# con.execute(
#     "SELECT AddGeometryColumn('entity_geom', 'geometry', 4326, 'MULTIPOLYGON', 'XY');"
# )

# con.execute(
#     """
#     UPDATE entity_geom SET
#     geometry = GeomFromText(geometry_wkt,4326);
# """
# )

# con.execute("select CreateSpatialIndex('entity_geom', 'geometry');")


# con.commit()         
# con.close()


In [54]:
# query which identifies LPA conservation area entities which have an overlap with Historic England entities.

start_time = time.time()

sql = """
    SELECT  a.entity as p_entity,
            a.name as p_name,
            a.organisation_entity as p_org_entity,
            a.reference as p_reference,
            b.entity as s_entity,
            b.name as s_name,
            b.organisation_entity as s_org_entity,
            b.reference as s_reference,

            ST_Area(st_difference(a.geometry, b.geometry)) / st_area(a.geometry) as p_pct_non_intersect,
            st_area(st_intersection(a.geometry, b.geometry)) / (st_area(st_difference(a.geometry, b.geometry)) + st_area(st_intersection(a.geometry, b.geometry)) + st_area(st_difference(b.geometry, a.geometry))) as pct_intersection,
            st_area(st_difference(b.geometry, a.geometry)) / st_area(b.geometry) as s_pct_non_intersect

FROM (
    SELECT  entity,
            name,
            organisation_entity,
            reference,
            geometry
    FROM entity_geom
    WHERE organisation_entity <> 16
    AND st_isvalid(geometry)) a

JOIN (
    SELECT  entity,
            name,
            organisation_entity,
            reference,
            geometry
    FROM entity_geom
    WHERE organisation_entity = 16
    AND st_isvalid(geometry)) b

ON a.entity <> b.entity 
AND a.organisation_entity <> b.organisation_entity 
AND ST_Intersects(a.geometry, b.geometry)

WHERE 100 * (ST_Area(ST_Intersection(a.geometry, b.geometry))/ MIN(ST_Area(a.geometry), ST_Area(b.geometry))) > 0;

    
""" 

with spatialite.connect(dataset_path) as con:
    cursor = con.execute(sql)
    cols = [column[0] for column in cursor.description]
    results = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
    
    
end_time = time.time()

elapsed_time = (end_time - start_time) / 60
print(f"Elapsed time: {elapsed_time:.2f} mins")

print(len(results))
results.head()

Elapsed time: 6.69 mins
758


Unnamed: 0,p_entity,p_name,p_org_entity,p_reference,s_entity,s_name,s_org_entity,s_reference,p_pct_non_intersect,pct_intersection,s_pct_non_intersect
0,44000540,The Hinton St Mary Conservation Area,222,45,44002322,,16,1,,,0.999554
1,44000549,The Farnham Conservation Area,222,13,44002322,,16,1,,,0.999592
2,44000551,Stourton Caundle Conservation Area,222,15,44002322,,16,1,,,0.999823
3,44000552,The Charlton Marshall Conservation Area,222,16,44002322,,16,1,,,0.999766
4,44000553,The Stour Provost Conservation Area,222,17,44002322,,16,1,,,0.999773


## GeoPandas

In [76]:
sql = """
    SELECT 
        entity,
        name,
        organisation_entity,
        reference,
        geometry
        -- CASE WHEN organisation_entity = 16 THEN 'Historic England' ELSE 'LPA' END AS organisation_type
        FROM entity;

""" 
with spatialite.connect(dataset_path) as con:
    cursor = con.execute(sql)
    cols = [column[0] for column in cursor.description]
    entity_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
            
print(f"No. of records in entity_df: {len(entity_df):,}")
entity_df.head()

No. of records in entity_df: 8,761


Unnamed: 0,entity,name,organisation_entity,reference,geometry
0,44000001,Napsbury,16,5080,"MULTIPOLYGON (((-0.307721 51.724964,-0.307831 ..."
1,44000002,Shafford Mill,16,5071,"MULTIPOLYGON (((-0.372417 51.774343,-0.372532 ..."
2,44000003,Potters Crouch,16,5074,"MULTIPOLYGON (((-0.385179 51.733498,-0.385157 ..."
3,44000004,Old Brickett Wood,16,5075,"MULTIPOLYGON (((-0.373629 51.690198,-0.373565 ..."
4,44000005,Sleapshyde,16,5078,"MULTIPOLYGON (((-0.259590 51.746264,-0.257212 ..."


In [82]:
entity_df['geometry'] = geom_df['geometry'].apply(shapely.wkt.loads)
entity_gdf = gpd.GeoDataFrame(entity_df, geometry='geometry')

# Define the CRS
entity_gdf.set_crs(epsg=4326, inplace=True)
entity_gdf.to_crs(epsg=27700, inplace=True)

entity_gdf["area"] = entity_gdf["geometry"].area

entity_gdf.head()

Unnamed: 0,entity,name,organisation_entity,reference,geometry,area
0,44000001,Napsbury,16,5080,"MULTIPOLYGON (((516981.159 204270.242, 516973....",495087.300218
1,44000002,Shafford Mill,16,5071,"MULTIPOLYGON (((512390.333 209659.962, 512382....",136187.979619
2,44000003,Potters Crouch,16,5074,"MULTIPOLYGON (((511610.510 205098.079, 511611....",34603.675292
3,44000004,Old Brickett Wood,16,5075,"MULTIPOLYGON (((512515.275 200300.431, 512520....",55128.469061
4,44000005,Sleapshyde,16,5078,"MULTIPOLYGON (((520248.830 206717.191, 520410....",44167.433073


In [107]:
LPA_HE_join = gpd.sjoin(
    entity_gdf[entity_gdf["organisation_entity"] == "169"],
    entity_gdf[entity_gdf["organisation_entity"] == "16"],
    how = "inner", predicate = "intersects"
)

LPA_HE_join["area_intersection"] = LPA_HE_join["geometry"].area

LPA_HE_join["p_pct_intersect"] = LPA_HE_join["area_intersection"] / LPA_HE_join["area_left"]
LPA_HE_join["pct_intersection"] = LPA_HE_join["area_intersection"] / (LPA_HE_join["area_left"] + LPA_HE_join["area_right"] - LPA_HE_join["area_intersection"])
LPA_HE_join["s_pct_intersect"] = LPA_HE_join["area_intersection"] / LPA_HE_join["area_right"]

nrow(LPA_HE_join)
LPA_HE_join.head()

No. of records in entity_df: 5


Unnamed: 0,entity_left,name_left,organisation_entity_left,reference_left,geometry,area_left,index_right,entity_right,name_right,organisation_entity_right,reference_right,area_right,area_intersection,p_pct_intersect,pct_intersection,s_pct_intersect
7543,44008347,St. Peter's Square,169,1,"MULTIPOLYGON (((522011.124 178302.578, 522030....",109646.284885,6777,44006887,Old Chiswick,16,1216,246275.692003,109646.284885,1.0,0.445218,0.445218
7544,44008348,The Mall,169,2,"MULTIPOLYGON (((521999.461 178296.513, 521980....",259371.802064,6777,44006887,Old Chiswick,16,1216,246275.692003,259371.802064,1.0,1.053177,1.053177
7546,44008350,Hurlingham,169,4,"MULTIPOLYGON (((525695.269 176397.528, 525672....",654460.124221,1051,44001072,Deodar Road,16,3666,62414.965719,654460.124221,1.0,10.485628,10.485628
7580,44008384,Ravenscourt & Starch Green,169,8,"MULTIPOLYGON (((521939.395 178863.421, 521939....",736149.830461,3317,44003413,Stamford Brook,16,1223,74346.529517,736149.830461,1.0,9.901603,9.901603
7580,44008384,Ravenscourt & Starch Green,169,8,"MULTIPOLYGON (((521939.395 178863.421, 521939....",736149.830461,3316,44003412,Bedford Park,16,1205,174651.440497,736149.830461,1.0,4.214966,4.214966


In [110]:
start_time = time.time()

LPA_HE_join = gpd.overlay(
    entity_gdf[entity_gdf["organisation_entity"] != "16"],
    entity_gdf[entity_gdf["organisation_entity"] == "16"],
    how = "intersection", keep_geom_type=False
)

LPA_HE_join["area_intersection"] = LPA_HE_join["geometry"].area

LPA_HE_join["p_pct_intersect"] = LPA_HE_join["area_intersection"] / LPA_HE_join["area_1"]
LPA_HE_join["pct_intersection"] = LPA_HE_join["area_intersection"] / (LPA_HE_join["area_1"] + LPA_HE_join["area_2"] - LPA_HE_join["area_intersection"])
LPA_HE_join["s_pct_intersect"] = LPA_HE_join["area_intersection"] / LPA_HE_join["area_2"]

end_time = time.time()

elapsed_time = (end_time - start_time) 
print(f"Elapsed time: {elapsed_time:.2f} ")

nrow(LPA_HE_join)
LPA_HE_join.head()

Elapsed time: 0.38 
No. of records in entity_df: 793


Unnamed: 0,entity_1,name_1,organisation_entity_1,reference_1,area_1,entity_2,name_2,organisation_entity_2,reference_2,area_2,geometry,area_intersection,p_pct_intersect,pct_intersection,s_pct_intersect
0,44000540,The Hinton St Mary Conservation Area,222,45,271632.834883,44002322,,16,1,609217700.0,"POLYGON ((378317.034 116323.476, 378314.048 11...",271632.834883,1.0,0.000446,0.000446
1,44000549,The Farnham Conservation Area,222,13,248447.273755,44002322,,16,1,609217700.0,"POLYGON ((395687.904 115525.150, 395887.261 11...",248447.273755,1.0,0.000408,0.000408
2,44000551,Stourton Caundle Conservation Area,222,15,107920.554378,44002322,,16,1,609217700.0,"POLYGON ((371523.221 114654.328, 371519.930 11...",107920.554378,1.0,0.000177,0.000177
3,44000552,The Charlton Marshall Conservation Area,222,16,143088.444037,44002322,,16,1,609217700.0,"POLYGON ((389787.452 104398.843, 389895.392 10...",143088.444037,1.0,0.000235,0.000235
4,44000553,The Stour Provost Conservation Area,222,17,138300.397606,44002322,,16,1,609217700.0,"POLYGON ((379351.667 121384.347, 379350.181 12...",138300.397606,1.0,0.000227,0.000227


In [117]:
LPA_HE_join["issue_type"] = np.select(
    [
        (LPA_HE_join["p_pct_intersect"] >= 0.9) & (LPA_HE_join["s_pct_intersect"] >= 0.9),
        (LPA_HE_join["p_pct_intersect"] <= 0.1) & (LPA_HE_join["s_pct_intersect"] <= 0.1),
        (LPA_HE_join["p_pct_intersect"] >= 0.9),
        (LPA_HE_join["s_pct_intersect"] >= 0.9)
    ],
    [
        "almost matches", "crossover", "LPA covered by HE", "LPA covers HE"
    ],
    default = "-"
)

In [118]:
LPA_HE_join[(LPA_HE_join["p_pct_intersect"] >= 0.9)].head()

Unnamed: 0,entity_1,name_1,organisation_entity_1,reference_1,area_1,entity_2,name_2,organisation_entity_2,reference_2,area_2,geometry,area_intersection,p_pct_intersect,pct_intersection,s_pct_intersect,issue_type
0,44000540,The Hinton St Mary Conservation Area,222,45,271632.834883,44002322,,16,1,609217700.0,"POLYGON ((378317.034 116323.476, 378314.048 11...",271632.834883,1.0,0.000446,0.000446,LPA covered by HE
1,44000549,The Farnham Conservation Area,222,13,248447.273755,44002322,,16,1,609217700.0,"POLYGON ((395687.904 115525.150, 395887.261 11...",248447.273755,1.0,0.000408,0.000408,LPA covered by HE
2,44000551,Stourton Caundle Conservation Area,222,15,107920.554378,44002322,,16,1,609217700.0,"POLYGON ((371523.221 114654.328, 371519.930 11...",107920.554378,1.0,0.000177,0.000177,LPA covered by HE
3,44000552,The Charlton Marshall Conservation Area,222,16,143088.444037,44002322,,16,1,609217700.0,"POLYGON ((389787.452 104398.843, 389895.392 10...",143088.444037,1.0,0.000235,0.000235,LPA covered by HE
4,44000553,The Stour Provost Conservation Area,222,17,138300.397606,44002322,,16,1,609217700.0,"POLYGON ((379351.667 121384.347, 379350.181 12...",138300.397606,1.0,0.000227,0.000227,LPA covered by HE


In [121]:
LPA_HE_join.groupby(["issue_type"]).size().reset_index()

Unnamed: 0,issue_type,0
0,-,18
1,LPA covered by HE,201
2,LPA covers HE,28
3,almost matches,173
4,crossover,373
