# Imports

In [1]:
from pyspark.sql.types import StructType, StructField, StringType
from sedona.sql.types import GeometryType
from sedona.spark import SedonaContext

import pyspark.sql.functions as psf

# Functions

In [2]:
from sedona.spark import SedonaPyDeck
# Functions
def view_geodf(df, fill_color=None):
    """Render a DataFrame as a SedonaPyDeck choropleth map.

    Args:
        df: DataFrame forwarded unchanged to
            ``SedonaPyDeck.create_choropleth_map``.
        fill_color: Optional fill colour forwarded to the map. When omitted,
            the previously hard-coded ``[255, 12, 250]`` is used, so existing
            ``view_geodf(df)`` calls render exactly as before.

    Returns:
        The object returned by ``SedonaPyDeck.create_choropleth_map``.
    """
    if fill_color is None:
        # Build the default inside the call so no list is shared across calls.
        fill_color = [255, 12, 250]
    return SedonaPyDeck.create_choropleth_map(df=df, fill_color=fill_color)

# Session

In [3]:
# Configure the session builder through SedonaContext and name the application.
builder = SedonaContext.builder().appName('Sedona Session')

# Get (or reuse) the Spark session and hand it to SedonaContext.create.
spark = SedonaContext.create(builder.getOrCreate())

# Keep a handle on the SparkContext and set Sedona's global charset property.
sc = spark.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/23 11:09:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

# Variables

In [4]:
# census_path: GeoParquet file read in the Extract step.
# output_path: directory the dissolved result is written to in the Load step.
census_path, output_path = (
    "/opt/data/input/euskadi.parquet",
    "/opt/data/output/census/",
)

# Extract

In [5]:
# Explicit read schema: the geometry column plus the NPRO label column.
SCHEMA = StructType(
    [
        StructField("geometry", GeometryType(), nullable=False),
        StructField("NPRO", StringType(), nullable=True),
    ]
)

# Load the census GeoParquet file using the schema declared above.
df = (
    spark.read
    .schema(SCHEMA)
    .format('geoparquet')
    .load(census_path)
)

df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|            geometry|                NPRO|
+--------------------+--------------------+
|POLYGON ((-2.5053...|Araba/Álava      ...|
|MULTIPOLYGON (((-...|Gipuzkoa         ...|
|MULTIPOLYGON (((-...|Bizkaia          ...|
+--------------------+--------------------+



                                                                                

# Transform

In [6]:
# Trim the NPRO label, add a constant NCA column so that every row falls into
# the same group, and expose the result to Spark SQL as the "census" view.
(
    df
    .withColumn('NPRO', psf.trim('NPRO'))
    .withColumn('NCA', psf.lit('Euskadi'))
    .createOrReplaceTempView("census")
)

# Dissolve: union all geometries that share an NCA value into one geometry.
dissolved_census = spark.sql(
    """
    SELECT NCA, ST_Union_Aggr(geometry) AS geometry 
    FROM census
    GROUP BY NCA
"""
)
dissolved_census.show()

+-------+--------------------+
|    NCA|            geometry|
+-------+--------------------+
|Euskadi|MULTIPOLYGON (((-...|
+-------+--------------------+



# Load

In [7]:
# Write the dissolved frame as GeoParquet; "overwrite" replaces whatever is
# already stored under output_path.
(
    dissolved_census.write
    .format('geoparquet')
    .mode("overwrite")
    .save(output_path)
)


# Check

In [8]:
# Read the written GeoParquet back to confirm the output round-trips.
# NOTE: this rebinds `df`, which the Extract cell used for the raw input frame;
# re-run Extract before re-running Transform, which expects the NPRO column.
df = (
    spark.read
    .format('geoparquet')
    .load(output_path)
)
df.show()

+-------+--------------------+
|    NCA|            geometry|
+-------+--------------------+
|Euskadi|MULTIPOLYGON (((-...|
+-------+--------------------+



In [9]:
# Display the frame read back from output_path as a choropleth map.
view_geodf(df)