In [0]:
import os

import geopandas as gpd
import pandas as pd
import pyspark.databricks.sql.functions as DBF
import pyspark.sql.functions as F

### What ST_ functions are available?

In [0]:
# Read the target catalog and schema from the notebook widgets.
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")

# Bind the names as parameters through IDENTIFIER() instead of splicing them
# into the SQL text, so a widget value cannot change the shape of the statement.
# This mirrors the IDENTIFIER(:catalog) / IDENTIFIER(:schema) form used by the
# SQL cell that follows.
spark.sql("USE CATALOG IDENTIFIER(:catalog)", args={"catalog": catalog})
spark.sql("USE SCHEMA IDENTIFIER(:schema)", args={"schema": schema})

In [0]:
%sql
-- Switch the SQL session to the working catalog and schema.
-- IDENTIFIER(:name) resolves the parameter value as an object name;
-- the :catalog / :schema parameters are presumably bound to the notebook
-- widgets of the same name (verify in the widget panel).
USE CATALOG IDENTIFIER(:catalog);
USE SCHEMA IDENTIFIER(:schema);

In [0]:
%sql
-- List the functions whose names start with ST_ ('*' is the wildcard in this pattern).
SHOW FUNCTIONS LIKE 'ST_*'

### What is the definition of a given ST_ function?

In [0]:
%sql
-- Show the extended description of st_buffer; substitute any other function name to inspect it.
DESCRIBE FUNCTION EXTENDED st_buffer

### About the Dataset
- Local Government Areas - 2025 - Shapefile
- Digital boundaries are available in both the Geocentric Datum of Australia 2020 (GDA2020) and the Geocentric Datum of Australia 1994 (GDA94). GDA2020 was adopted as the new official national datum in 2017 and will be adopted gradually by organisations across Australia.



In [0]:
# Folder that holds the LGA shapefile read by a later cell.
# Update to your preferred location.
data_path = "/Volumes/pamela_lim/dev/spatial_sql_101"

# Also publish the folder as the DATA_PATH environment variable
# (presumably for shell cells or subprocesses — confirm it is still needed).
os.environ.update(DATA_PATH=data_path)

- Convert the GeoDataFrame to a Spark DataFrame
- The geometry column holds Shapely objects, which Spark does not natively understand, so convert it to WKT or WKB format before creating the Spark DataFrame.

In [0]:
# Load the 2025 Local Government Area boundaries (GDA2020 shapefile) from data_path.
gdf_lga = gpd.read_file(f"{data_path}/LGA_2025_AUST_GDA2020.shp")

# Spark cannot store Shapely objects, so encode the geometry as WKT text.
# GeoDataFrame.to_wkt() returns a plain pandas DataFrame and leaves gdf_lga
# untouched; writing the strings back into the active geometry column instead
# makes geopandas warn and leaves a GeoDataFrame without real geometries.
# NOTE(review): WKT is a rounded text encoding — confirm the default
# coordinate precision is acceptable, or switch to WKB if exact values matter.
pdf = gdf_lga.to_wkt()

# Convert to Spark DataFrame
sdf = spark.createDataFrame(pdf)

# Write to Delta table, replacing any existing contents of the lgas table
sdf.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema}.lgas")

In [0]:
%sql

-- NOTE(review): this cell holds three statements; Databricks appears to display
-- only the result of the last one, so consider moving each query to its own cell.

-- 1.0 Explore LGA data
-- geometry is stored as WKT text; ST_NPoints reports how many vertices each boundary has.
SELECT 
    lga_code25 as lga_code,
    lga_name25 as lga_name,
    ste_code21 as state_code,
    ste_name21 as state_name,
    aus_code21 as aus_code,
    aus_name21 as aus_name,
    areasqkm as area_sqkm,
    ST_NPoints(geometry) as num_vertices
FROM lgas
LIMIT 10;

-- 1.1 ST_Transform: Area Calculation in Different Coordinate Reference System (CRS)
-- The WKT is parsed as GDA2020 geographic coordinates (EPSG:7844) and reprojected
-- before measuring, because ST_Area works in the units of the geometry's CRS.
--   * EPSG:3112 = GDA94 / Geoscience Australia Lambert (metres)
--   * EPSG:3857 = Web Mercator (metres; area is inflated away from the equator,
--     so expect this column to overstate the published areasqkm)
-- Both targets are metre-based, so the result is divided by 1e6 to get km².
SELECT 
    lga_code25 as lga_code,
    lga_name25 as lga_name,
    ste_code21 as state_code,
    ste_name21 as state_name,
    areasqkm AS original_area,
    ST_Area(ST_Transform(ST_GeomFromText(geometry, 7844), 3112)) / 1e6 AS area_sqkm_ga_lambert,
    ST_Area(ST_Transform(ST_GeomFromText(geometry, 7844), 3857)) / 1e6 AS area_sqkm_web_mercator
FROM lgas
WHERE ste_name21 = 'Victoria'
ORDER BY areasqkm DESC
LIMIT 10;

-- 2.0 ST_Contains: Find all LGAs that contain a specific point (e.g., Sydney CBD coordinates)
-- Both geometries are built without an explicit SRID, so they are compared as-is;
-- ST_Point takes (longitude, latitude), i.e. x then y.
SELECT 
    lga_code25 as lga_code,
    lga_name25 as lga_name,
    ste_name21 as state_name,
    areasqkm
FROM lgas
WHERE ST_Contains(
    ST_GeomFromText(geometry), 
    ST_Point(151.2093, -33.8688) -- Sydney CBD coordinates
);

-- TODO:
-- H3 index for points and polygons
-- H3 tessellate (is_called? = true means fully contained; the flag may actually be named `core` — confirm)
