In [15]:
import configparser

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import (expr, broadcast)
from pyspark.sql.types import (StructType, StructField, IntegerType, StringType, FloatType, TimestampType)

from sedona.register import SedonaRegistrator
from sedona.utils import (KryoSerializer, SedonaKryoRegistrator)
from sedona.core.enums import (GridType, IndexType)
from sedona.core.spatialOperator import JoinQueryRaw
from sedona.utils.adapter import Adapter

import re

In [2]:
# Build the Spark configuration from ../sparkconf.cfg, then layer the Kryo
# serializer settings on top before creating (or reusing) the session.
sparkConf = SparkConf()

parser = configparser.ConfigParser()
# ConfigParser lower-cases option names by default; replacing optionxform with
# `str` keeps every key exactly as written in the file.
parser.optionxform = str
# Use a context manager so the file handle is closed as soon as parsing is
# done (the previous bare open() left it to the garbage collector).
with open('../sparkconf.cfg') as config_file:
    parser.read_file(config_file)

# Section names are not used: every key/value pair of every section is
# forwarded to SparkConf unchanged.
for section, config in parser.items():
    for key, value in config.items():
        sparkConf.set(key, value)

# These three are set after the file is applied, so they override any value
# the file may contain for the same keys.
sparkConf.set("spark.serializer", KryoSerializer.getName)
sparkConf.set("spark.kryo.registrator", SedonaKryoRegistrator.getName)
# No unit suffix is given; per the Spark configuration docs this value is
# interpreted as MiB — confirm if a different size was intended.
sparkConf.set("spark.kryoserializer.buffer.max", "512")

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

In [3]:
# Register Sedona on the session created above. Per the Sedona docs this makes
# its geometry types and spatial SQL functions available — verify against the
# installed Sedona version.
SedonaRegistrator.registerAll(spark)
# Bare last expression: lets the notebook render the session summary.
spark

                                                                                

In [88]:
# Read every JSON file under the OSM shapes prefix.
df_adm = spark.read.format("json").load("s3a://dutrajardim-fi/src/shapes/osm/*")

# `parents` is split on commas and exploded into one row per parent id;
# EXPLODE_OUTER keeps areas whose `parents` yields nothing (parent_id = NULL).
df_children = df_adm.selectExpr(
    "admin_level",
    "osm_id",
    "name",
    "EXPLODE_OUTER(SPLIT(parents, ',')) AS parent_id",
)

# The same areas viewed as the parent side of the relation.
parent = df_adm.selectExpr(
    "admin_level AS parent_level",
    "osm_id AS parent_id",
    "name AS parent_name",
).alias("parent")

# ...and as the child side, one row per (child, parent_id) pair.
children = df_children.selectExpr(
    "admin_level AS adm",
    "osm_id AS id",
    "parent_id",
    "name",
).alias("children")

# Right join: every child row is kept, with parent columns filled in where
# the parent_id matches an area.
df_relations = parent.join(children, "parent_id", "right")

# One row per child; the parent levels become columns. With two aliased
# aggregations the pivot yields names such as `<level>_adm` and
# `<level>_adm_name`. The `null_*` pair (presumably from children without a
# matched parent) is dropped.
pt_relations = (
    df_relations
    .groupBy(["adm", "id", "name"])
    .pivot("parent_level")
    .agg(
        expr("FIRST(parent_id) AS adm"),
        expr("FIRST(parent_name) AS adm_name"),
    )
    .drop('null_adm', 'null_adm_name')
)

# Rename `<level>_adm` -> `adm<level>` and `<level>_adm_name` -> `adm<level>_name`.
pattern = re.compile("^(([0-9]{,2})_adm(_name)?)$")
columns_to_rename = [
    found.groups()
    for found in map(pattern.match, pt_relations.columns)
    if found
]
for old_name, level, name_suffix in columns_to_rename:
    if name_suffix:
        new_name = f"adm{level}_name"
    else:
        new_name = f"adm{level}"
    pt_relations = pt_relations.withColumnRenamed(old_name, new_name)

                                                                                

In [90]:
# Persist the pivoted relations as parquet, partitioned on disk by the child
# level column `adm`; existing output at the target path is overwritten.
# Repartitioning by the same column first groups each level's rows together
# before the write.
relations_by_adm = pt_relations.repartition("adm")
relations_writer = (
    relations_by_adm.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("adm")
)
relations_writer.save("s3a://dutrajardim-fi/tables/shapes/osm/relations.parquet")

                                                                                

In [96]:
# Persist the geometries keyed by (adm, id) as parquet, partitioned on disk by
# `adm`; existing output at the target path is overwritten.
shapes_by_adm = df_adm.selectExpr(
    "geom AS geometry",
    "admin_level AS adm",
    "osm_id AS id",
).repartition("adm")

shapes_writer = (
    shapes_by_adm.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("adm")
    # Spark's `maxRecordsPerFile` writer option: at most 500 records per file.
    .option("maxRecordsPerFile", 500)
)
shapes_writer.save("s3a://dutrajardim-fi/tables/shapes/osm/shapes.parquet")

                                                                                

In [97]:
# Shut the Spark session down now that both tables are written; cells that
# use `spark` cannot be re-run after this without recreating the session.
spark.stop()

22/04/03 19:03:13 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
