In [1]:
# sedona and pyspark imports
from sedona.spark import *
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType

In [None]:
# Maven coordinates handed to spark.jars.packages as one comma-separated string.
sedona_packages = (
    "org.apache.sedona:sedona-spark-3.3_2.12:1.7.1,"
    "org.datasyslab:geotools-wrapper:1.7.1-28.5"
)
# Additional remote repository made available for dependency resolution.
extra_repositories = "https://artifacts.unidata.ucar.edu/repository/unidata-all"

session_builder = SedonaContext.builder()
session_builder = session_builder.config("spark.jars.packages", sedona_packages)
session_builder = session_builder.config("spark.jars.repositories", extra_repositories)

# NOTE: despite its name, `config` holds whatever getOrCreate() returns, not the
# builder. It is passed to SedonaContext.create to obtain `sedona`, the object
# every later cell uses for reads and SQL.
config = session_builder.getOrCreate()
sedona = SedonaContext.create(config)

https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1
Ivy Default Cache set to: /home/cdsw/.ivy2/cache
The jars for the packages stored in: /home/cdsw/.ivy2/jars
org.apache.sedona#sedona-spark-3.3_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5ec18a16-dd2a-4912-b5bf-c3f531af4107;1.0
	confs: [default]


:: loading settings :: url = jar:file:/runtime-addons/spark330-22-p3v4jn/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-spark-3.3_2.12;1.7.1 in central
	found org.apache.sedona#sedona-common;1.7.1 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.locationtech.jts#jts-core;1.20.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.locationtech.spatial4j#spatial4j;0.8 in central
	found com.google.geometry#s2-geometry;2.0.0 in central
	found com.google.guava#guava;25.1-jre in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found org.checkerframework#checker-qual;2.0.0 in central
	found com.google.errorprone#error_prone_annotations;2.1.3 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.14 in central
	found com.uber#h3;4.1.1 in central
	found net.sf.geographiclib#GeographicLib-Java;1.52 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.2 in central
	found org.checkerframework#checker-qual;3.10.0 in central
	found com.google.error

In [None]:
import os

import geopandas as gpd
from pyspark.sql import SparkSession

from sedona.spark import *

## Geometry Constructors

### ST_Point

In [None]:
# Load the point file as headerless, comma-delimited CSV; the query below
# addresses its columns by position as _c0 and _c1.
point_reader = sedona.read.format("csv")
point_reader = point_reader.option("delimiter", ",")
point_reader = point_reader.option("header", "false")
point_csv_df = point_reader.load("data/testpoint.csv")

# Expose the raw rows to SQL under the name the query selects from.
point_csv_df.createOrReplaceTempView("pointtable")

# Cast both coordinate columns to Decimal(24,20) and build one point per row.
point_query = "select ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
point_df = sedona.sql(point_query)
point_df.show(n=5)

### ST_GeomFromText

In [None]:
# Tab-separated county file without a header row. The query below parses
# column _c0 with ST_GeomFromText and exposes column _c6 as `name`.
wkt_reader = sedona.read.format("csv").option("delimiter", "\t").option("header", "false")
polygon_wkt_df = wkt_reader.load("data/county_small.tsv")
polygon_wkt_df.createOrReplaceTempView("polygontable")

wkt_query = "select polygontable._c6 as name, ST_GeomFromText(polygontable._c0) as countyshape from polygontable"
polygon_df = sedona.sql(wkt_query)
polygon_df.show(n=5)

### ST_GeomFromWKB

In [None]:
# Same layout as the WKT example, but column _c0 is handed to ST_GeomFromWKB.
wkb_source = sedona.read.format("csv")
wkb_source = wkb_source.option("delimiter", "\t")
wkb_source = wkb_source.option("header", "false")
polygon_wkb_df = wkb_source.load("data/county_small_wkb.tsv")

# Re-registers the "polygontable" view, replacing any earlier registration.
polygon_wkb_df.createOrReplaceTempView("polygontable")

wkb_query = "select polygontable._c6 as name, ST_GeomFromWKB(polygontable._c0) as countyshape from polygontable"
polygon_df = sedona.sql(wkb_query)
polygon_df.show(n=5)

### ST_GeomFromGeoJSON

In [None]:
# The JSON file is read through the CSV reader with a tab delimiter, so each
# row's _c0 column is what the query passes to ST_GeomFromGeoJSON.
# NOTE(review): presumably the file holds one GeoJSON document per line with no
# tab characters — confirm against the data file.
geojson_reader = sedona.read.format("csv").option("delimiter", "\t").option("header", "false")
polygon_json_df = geojson_reader.load("data/testPolygon.json")
polygon_json_df.createOrReplaceTempView("polygontable")

geojson_query = "select ST_GeomFromGeoJSON(polygontable._c0) as countyshape from polygontable"
polygon_df = sedona.sql(geojson_query)
polygon_df.show(n=5)

## Spatial Operations

### Spatial Join - Distance Join

In [None]:
# --- Left side of the join: points tagged with name1 = 'abc' ---------------
left_reader = sedona.read.format("csv").option("delimiter", ",").option("header", "false")
point_csv_df_1 = left_reader.load("data/testpoint.csv")
point_csv_df_1.createOrReplaceTempView("pointtable")

left_query = "SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1, 'abc' as name1 from pointtable"
point_df1 = sedona.sql(left_query)
point_df1.createOrReplaceTempView("pointdf1")

# --- Right side: the same file loaded again, tagged with name2 = 'def' -----
right_reader = sedona.read.format("csv").option("delimiter", ",").option("header", "false")
point_csv_df2 = right_reader.load("data/testpoint.csv")
# Registers "pointtable" a second time, now backed by the second load.
point_csv_df2.createOrReplaceTempView("pointtable")

right_query = "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2, 'def' as name2 from pointtable"
point_df2 = sedona.sql(right_query)
point_df2.createOrReplaceTempView("pointdf2")

# --- Join: keep every pair whose ST_Distance is below 2 ---------------------
distance_join_query = "select * from pointdf1, pointdf2 where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2"
distance_join_df = sedona.sql(distance_join_query)
# Print the query plan first, then a five-row sample of the result.
distance_join_df.explain()
distance_join_df.show(n=5)

### Spatial Join - Range Join and RDD API Join

For range joins and RDD API joins, please refer to the airports-per-country example: https://github.com/apache/sedona/blob/master/docs/usecases/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb

### Converting GeoPandas to Apache Sedona

In [None]:
import geopandas as gpd
import pandas as pd
from sedona.register import SedonaRegistrator
from sedona.sql.types import GeometryType
from pyspark.sql.types import StructType, StructField, StringType
# Explicit import so this cell does not rely on a star import for the constructor.
from sedona.sql.st_constructors import ST_GeomFromWKT

# Read shapefile
gdf = gpd.read_file("data/gis_osm_pois_free_1.shp")

# Replace NA and convert geometry to WKT
gdf = gdf.replace(pd.NA, "")
gdf["geometry"] = gdf["geometry"].to_wkt()

# Convert to regular Pandas DataFrame
pdf = pd.DataFrame(gdf)

# Create Spark DataFrame; at this point `geometry` is a plain WKT text column.
sedona_df = sedona.createDataFrame(pdf)

# Parse the WKT text into a geometry column named `geom`.
# The original called `df.withColumn(...)`, but no `df` exists at this point in
# a fresh kernel; the frame created just above is `sedona_df`.
sedona_df = sedona_df.withColumn("geom", ST_GeomFromWKT("geometry"))
sedona_df.createOrReplaceTempView("osm_points")


In [None]:
# "osm_points" is only the name of the temp view registered from `sedona_df`;
# there is no Python variable with that name, so inspect the DataFrame itself.
sedona_df.printSchema()

In [None]:
# "osm_points" is a temp-view name, not a Python variable; the DataFrame
# registered under that name is `sedona_df`.
sedona_df.show(5)

In [None]:
# Register the OSM points DataFrame under the view name "points", which the
# ST_Transform query selects from. `osm_points` exists only as a temp-view
# name, so the call has to go through the `sedona_df` DataFrame.
sedona_df.createOrReplaceTempView("points")

In [None]:
# Reproject the OSM points from EPSG:4326 to EPSG:2180.
# The source frame keeps the WKT *text* in `geometry` (it was converted with
# to_wkt() before being sent to Spark); the parsed geometry lives in `geom`,
# so that is the column ST_Transform has to receive.
transformed_df = sedona.sql(
    """
        SELECT osm_id,
               code,
               fclass,
               name,
               ST_Transform(geom, 'epsg:4326', 'epsg:2180') as geom
        FROM points
    """
)

In [None]:
# Preview the first five reprojected rows.
transformed_df.show(n=5)

In [None]:
# Make the reprojected points queryable from SQL as "points_2180".
transformed_df.createOrReplaceTempView("points_2180")

In [None]:
# Self-join of points_2180: every (a, b) pair whose ST_Distance is below 50.
# NOTE(review): the variable name says 1000 m but the predicate uses 50
# (presumably metres, the unit of EPSG:2180 — confirm which is intended).
# The name is kept as-is because other cells reference it.
neighbours_query = """
        SELECT a.osm_id AS id_1,
               b.osm_id AS id_2,
               a.geom 
        FROM points_2180 AS a, points_2180 AS b 
        WHERE ST_Distance(a.geom,b.geom) < 50
    """
neighbours_within_1000m = sedona.sql(neighbours_query)

In [None]:
# Preview the join result using show()'s defaults, spelled out explicitly.
neighbours_within_1000m.show(n=20, truncate=True)

## Converting Apache Sedona to GeoPandas

In [None]:
# Collect the join result to the driver as a pandas DataFrame.
# NOTE(review): this pulls every matching pair into local memory — confirm the
# result is small enough before running on a larger extract.
df = neighbours_within_1000m.toPandas()

In [None]:
# Wrap the collected rows in a GeoDataFrame with `geom` as the active geometry.
# The geometries come from the ST_Transform(..., 'epsg:2180') query, so record
# that CRS here instead of leaving the GeoDataFrame without one.
gdf = gpd.GeoDataFrame(df, geometry="geom", crs="EPSG:2180")

In [None]:
# Bare last expression: the notebook renders the GeoDataFrame as the cell output.
gdf