```
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
```

In [1]:
import os

import geopandas as gpd
from pyspark.sql import SparkSession

from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

In [2]:
spark = SparkSession. \
    builder. \
    appName('appName'). \
    config("spark.serializer", KryoSerializer.getName). \
    config("spark.kryo.registrator", SedonaKryoRegistrator.getName). \
    config('spark.jars.packages',
           'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.1.0-incubating,org.datasyslab:geotools-wrapper:1.1.0-25.2'). \
    getOrCreate()

:: loading settings :: url = jar:file:/home/jovyan/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.sedona#sedona-python-adapter-3.0_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3625fe59-a0fc-45df-ae07-108b2b675db5;1.0
	confs: [default]
	found org.apache.sedona#sedona-python-adapter-3.0_2.12;1.1.0-incubating in central
	found org.locationtech.jts#jts-core;1.18.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found com.fasterxml.jackson.core#jackson-databind;2.12.2 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.12.2 in central
	found com.fasterxml.jackson.core#jackson-core;2.12.2 in central
	found org.apache.sedona#sedona-core-3.0_2.12;1.1.0-incubating in central
	found org.apache.sedona#sedona-sql-3.0_2.12;1.1.0-incubating in central
	found org.datasyslab#geotools-wrapper;1.1.0-25.2 in central
downloading https://repo1.maven.org/m

In [3]:
SedonaRegistrator.registerAll(spark)

                                                                                

True

## Geometry Constructors

### ST_Point

In [4]:
point_csv_df = spark.read.format("csv").\
    option("delimiter", ",").\
    option("header", "false").\
    load("data/testpoint.csv")

point_csv_df.createOrReplaceTempView("pointtable")

point_df = spark.sql("select ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable")
point_df.show(5)

+-----------------+
|     arealandmark|
+-----------------+
|POINT (1.1 101.1)|
|POINT (2.1 102.1)|
|POINT (3.1 103.1)|
|POINT (4.1 104.1)|
|POINT (5.1 105.1)|
+-----------------+
only showing top 5 rows



### ST_GeomFromText

In [5]:
polygon_wkt_df = spark.read.format("csv").\
    option("delimiter", "\t").\
    option("header", "false").\
    load("data/county_small.tsv")

polygon_wkt_df.createOrReplaceTempView("polygontable")
polygon_df = spark.sql("select polygontable._c6 as name, ST_GeomFromText(polygontable._c0) as countyshape from polygontable")
polygon_df.show(5)

+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows



### ST_GeomFromWKB

In [6]:
polygon_wkb_df = spark.read.format("csv").\
    option("delimiter", "\t").\
    option("header", "false").\
    load("data/county_small_wkb.tsv")

polygon_wkb_df.createOrReplaceTempView("polygontable")
polygon_df = spark.sql("select polygontable._c6 as name, ST_GeomFromWKB(polygontable._c0) as countyshape from polygontable")
polygon_df.show(5)

+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows



### ST_GeomFromGeoJSON

In [7]:
polygon_json_df = spark.read.format("csv").\
    option("delimiter", "\t").\
    option("header", "false").\
    load("data/testPolygon.json")

polygon_json_df.createOrReplaceTempView("polygontable")
polygon_df = spark.sql("select ST_GeomFromGeoJSON(polygontable._c0) as countyshape from polygontable")
polygon_df.show(5)

+--------------------+
|         countyshape|
+--------------------+
|POLYGON ((-87.621...|
|POLYGON ((-85.719...|
|POLYGON ((-86.000...|
|POLYGON ((-86.574...|
|POLYGON ((-85.382...|
+--------------------+
only showing top 5 rows



## Spatial Operations

### Spatial Join - Distance Join

In [8]:
point_csv_df_1 = spark.read.format("csv").\
    option("delimiter", ",").\
    option("header", "false").load("data/testpoint.csv")

point_csv_df_1.createOrReplaceTempView("pointtable")

point_df1 = spark.sql("SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1, \'abc\' as name1 from pointtable")
point_df1.createOrReplaceTempView("pointdf1")

point_csv_df2 = spark.read.format("csv").\
    option("delimiter", ",").\
    option("header", "false").load("data/testpoint.csv")

point_csv_df2.createOrReplaceTempView("pointtable")
point_df2 = spark.sql("select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2, \'def\' as name2 from pointtable")
point_df2.createOrReplaceTempView("pointdf2")

distance_join_df = spark.sql("select * from pointdf1, pointdf2 where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2")
distance_join_df.explain()
distance_join_df.show(5)

== Physical Plan ==
DistanceJoin pointshape1#259: geometry, pointshape2#283: geometry, 2.0, false
:- Project [st_point(cast(_c0#255 as decimal(24,20)), cast(_c1#256 as decimal(24,20))) AS pointshape1#259, abc AS name1#260]
:  +- FileScan csv [_c0#255,_c1#256] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/binder/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
+- Project [st_point(cast(_c0#279 as decimal(24,20)), cast(_c1#280 as decimal(24,20))) AS pointshape2#283, def AS name2#284]
   +- FileScan csv [_c0#279,_c1#280] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/binder/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>




22/08/28 05:00:01 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.


+-----------------+-----+-----------------+-----+
|      pointshape1|name1|      pointshape2|name2|
+-----------------+-----+-----------------+-----+
|POINT (1.1 101.1)|  abc|POINT (1.1 101.1)|  def|
|POINT (2.1 102.1)|  abc|POINT (1.1 101.1)|  def|
|POINT (1.1 101.1)|  abc|POINT (2.1 102.1)|  def|
|POINT (2.1 102.1)|  abc|POINT (2.1 102.1)|  def|
|POINT (3.1 103.1)|  abc|POINT (2.1 102.1)|  def|
+-----------------+-----+-----------------+-----+
only showing top 5 rows



### Spatial Join - Range Join and RDD API Join

Please refer to the example - airports per country: https://github.com/apache/incubator-sedona/blob/master/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb

### Converting GeoPandas to Apache Sedona

In [9]:
gdf = gpd.read_file("data/gis_osm_pois_free_1.shp")

osm_points = spark.createDataFrame(
    gdf
)

  out = from_shapely(data)


In [10]:
osm_points.printSchema()

root
 |-- osm_id: string (nullable = true)
 |-- code: long (nullable = true)
 |-- fclass: string (nullable = true)
 |-- name: string (nullable = true)
 |-- geometry: geometry (nullable = true)



In [11]:
osm_points.show(5)

[Stage 17:>                                                         (0 + 1) / 1]

+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|            geometry|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (15.3393145...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (14.8709625...|
|29947493|2402|    motel|          null|POINT (15.0946636...|
|29947498|2602|      atm|          null|POINT (15.0732014...|
|29947499|2401|    hotel|          null|POINT (15.0696777...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows



                                                                                

In [12]:
osm_points.createOrReplaceTempView("points")

In [13]:
transformed_df = spark.sql(
    """
        SELECT osm_id,
               code,
               fclass,
               name,
               ST_Transform(geometry, 'epsg:4326', 'epsg:2180') as geom 
        FROM points
    """)

In [14]:
transformed_df.show(5)

[Stage 18:>                                                         (0 + 1) / 1]

+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|                geom|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (-3288183.3...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (-3341183.9...|
|29947493|2402|    motel|          null|POINT (-3320466.5...|
|29947498|2602|      atm|          null|POINT (-3323205.7...|
|29947499|2401|    hotel|          null|POINT (-3323655.1...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows



                                                                                

In [15]:
transformed_df.createOrReplaceTempView("points_2180")

In [16]:
neighbours_within_1000m = spark.sql("""
        SELECT a.osm_id AS id_1,
               b.osm_id AS id_2,
               a.geom 
        FROM points_2180 AS a, points_2180 AS b 
        WHERE ST_Distance(a.geom,b.geom) < 50
    """)

In [17]:
neighbours_within_1000m.show()

22/08/28 05:00:09 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.

+----------+---------+--------------------+
|      id_1|     id_2|                geom|
+----------+---------+--------------------+
|  26860294| 26860294|POINT (-3341183.9...|
|  29947493| 29947493|POINT (-3320466.5...|
|4165181885| 29947498|POINT (-3323204.4...|
|5818905324| 29947498|POINT (-3323210.6...|
|  29947498| 29947498|POINT (-3323205.7...|
|  29947499| 29947499|POINT (-3323655.1...|
|  30077461| 29947499|POINT (-3323697.1...|
|  29947505| 29947505|POINT (-3330369.2...|
|  29947499| 30077461|POINT (-3323655.1...|
|  30077461| 30077461|POINT (-3323697.1...|
| 197624402|197624402|POINT (-3383818.5...|
| 197663196|197663196|POINT (-3383367.1...|
| 197953474|197953474|POINT (-3383763.3...|
| 262310516|262310516|POINT (-3384257.6...|
|1074233123|262310516|POINT (-3384262.1...|
| 270281140|270281140|POINT (-3385421.2...|
|1074232906|270281140|POINT (-3385408.6...|
| 270306609|270306609|POINT (-3383982.8...|
| 270306746|270306746|POINT (-3383898.4...|
| 273101780|273101780|POINT (-33

                                                                                

## Converting Apache Sedona to GeoPandas

In [18]:
df = neighbours_within_1000m.toPandas()

22/08/28 05:00:15 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.
                                                                                

In [19]:
gdf = gpd.GeoDataFrame(df, geometry="geom")

In [20]:
gdf

Unnamed: 0,id_1,id_2,geom
0,26860294,26860294,POINT (-3341183.976 4318356.064)
1,29947493,29947493,POINT (-3320466.547 4265941.760)
2,4165181885,29947498,POINT (-3323204.491 4266510.379)
3,5818905324,29947498,POINT (-3323210.654 4266502.772)
4,29947498,29947498,POINT (-3323205.784 4266548.416)
...,...,...,...
45314,6815618439,6815618435,POINT (-3285827.820 4250345.966)
45315,6815618435,6815618439,POINT (-3285831.862 4250347.684)
45316,6815618439,6815618439,POINT (-3285827.820 4250345.966)
45317,6815883980,6815883980,POINT (-3286165.443 4249818.008)
