In [1]:
import configparser

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import (expr, broadcast)
from pyspark.sql.types import (StructType, StructField, IntegerType, StringType, FloatType, TimestampType)

from sedona.register import SedonaRegistrator
from sedona.utils import (KryoSerializer, SedonaKryoRegistrator)
from sedona.utils.adapter import Adapter

In [2]:
# Build the Spark configuration: start from ../sparkconf.cfg, then add the
# Kryo/Sedona serializer settings and the executor dependency archive.
sparkConf = SparkConf()
parser = configparser.ConfigParser()
# ConfigParser lower-cases option names by default; keep them verbatim because
# keys such as "spark.executorEnv.PYTHONPATH" are mixed-case.
parser.optionxform = str
# Context manager so the config file handle is closed once parsing is done
# (still raises if the file is missing, like the bare open() did).
with open('../sparkconf.cfg') as cfg_file:
    parser.read_file(cfg_file)

# Copy every option of every section into the Spark configuration.
for section, config in parser.items():
    for key, value in config.items():
        sparkConf.set(key, value)

# Serialize with Kryo and register Sedona's Kryo registrator.
sparkConf.set("spark.serializer", KryoSerializer.getName)
sparkConf.set("spark.kryo.registrator", SedonaKryoRegistrator.getName)

# Python dependencies shipped to the executors; the "#deps" suffix names the
# directory the archive is unpacked into, which the two executorEnv paths below
# point at.
# NOTE(review): this is a presigned URL with an embedded credential and
# signature, valid for 604800 s (7 days) from 2022-02-28. It is a secret-like
# value committed to the notebook and it expires; load it from the config file
# or an environment variable instead.
sparkConf.set("spark.archives", "https://minio.minio-tenant/dutrajardim-etls/dependencies.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=admin%2F20220228%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220228T111424Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=a20e1526dd993354ae90c90c7cb76da4726a5ae00c9920ceec8e2a35f3482c3c#deps")
sparkConf.set("spark.executorEnv.PYTHONPATH", "/opt/spark/work-dir/deps")
sparkConf.set("spark.executorEnv.LD_LIBRARY_PATH", "/opt/spark/work-dir/deps/Shapely.libs")
# sparkConf.set("spark.sql.extensions", "org.apache.sedona.sql.SedonaSqlExtensions")

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

In [3]:
# Register Sedona with this session; the ST_* SQL functions used in the
# following cells are expected to come from this call.
SedonaRegistrator.registerAll(spark)
# Last expression of the cell: display the session object.
spark

                                                                                

In [4]:
# Location of the 2021 FIRMS (Suomi VIIRS C2) archive CSV.
bucket = "s3a://dutrajardim-fi"
s3_source = "%s/src/firms/suomi_viirs_c2/archive/{2021}.csv.gz" % (bucket,)

# No schema inference is requested, so every CSV column is read as a string.
firms_reader = spark.read.format("csv").option("header", "true")
df_firms = firms_reader.load(s3_source)

# Columns kept for the spatial join:
#   geometry - point built from longitude/latitude through WKT
#   datetime - acq_date plus acq_time with a ':' inserted after the first
#              two characters (e.g. "1836" -> "18:36")
#   year     - first four characters of acq_date
firms_exprs = [
    "ST_GeomFromWKT(CONCAT('POINT(', longitude, ' ', latitude, ')')) as geometry",
    "brightness",
    "frp",
    "scan",
    "track",
    "confidence",
    "type",
    "instrument",
    "CONCAT(acq_date, ' ', regexp_replace(acq_time, '(.{2})(.{2})', '$1:$2')) as datetime",
    "SUBSTRING(acq_date, 1, 4) as year",
]
sdf_firms = df_firms.selectExpr(*firms_exprs)

# Convert to a Sedona spatial RDD keyed on the geometry column and analyze it;
# the return value of analyze() is shown as the cell output.
rdd_firms = Adapter.toSpatialRdd(sdf_firms, "geometry")
rdd_firms.analyze()

22/03/06 11:02:22 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
22/03/06 11:02:22 WARN AmazonHttpClient: SSL Certificate checking for endpoints has been explicitly disabled.
                                                                                

True

In [5]:
# Administrative boundary tables (level 3 and level 2).
adm3_path = "s3a://dutrajardim-fi/tables/shapes/adm3.parquet"
adm2_path = "s3a://dutrajardim-fi/tables/shapes/adm2.parquet"
df_adm3 = spark.read.format("parquet").load(adm3_path)
df_adm2 = spark.read.format("parquet").load(adm2_path)

# Level-3 shapes: parse the WKT geometry and derive the parent adm2 id from the
# first three dot-separated parts of the adm3 id, suffixed with '_1'.
adm3_exprs = [
    "id as adm3",
    "ST_GeomFromWKT(geometry) as geometry_adm3",
    "CONCAT(CONCAT_WS('.', SLICE(SPLIT(id, '\\\\.'), 1, 3)), '_1') as adm2",
]
sdf_adm3 = df_adm3.selectExpr(*adm3_exprs)

# Level-2 shapes: id and parsed geometry only.
adm2_exprs = [
    "id as adm2",
    "ST_GeomFromWKT(geometry) as geometry_adm2",
]
sdf_adm2 = df_adm2.selectExpr(*adm2_exprs)

# Left join keeps every adm2 row; where an adm3 shape exists its geometry is
# used, otherwise the adm2 geometry is the fallback. adm0 and adm1 are derived
# from the adm2 id (first part, and first two parts suffixed with '_1').
# NOTE(review): the broadcast hint is on the left side of a left join; Spark
# may not be able to honor it there - confirm with sdf_adm.explain().
adm_exprs = [
    "CASE WHEN geometry_adm3 IS NOT NULL THEN geometry_adm3 ELSE geometry_adm2 END as geometry",
    "ELEMENT_AT(SPLIT(adm2, '\\\\.'), 1) as adm0",
    "CONCAT(CONCAT_WS('.', SLICE(SPLIT(adm2, '\\\\.'), 1, 2)), '_1') as adm1",
    "adm2",
    "adm3",
]
sdf_adm = (
    broadcast(sdf_adm2)
    .join(sdf_adm3, on='adm2', how='left')
    .selectExpr(*adm_exprs)
)

# Convert to a Sedona spatial RDD and analyze it; the return value of
# analyze() is shown as the cell output.
rdd_adm = Adapter.toSpatialRdd(sdf_adm, "geometry")
rdd_adm.analyze()

                                                                                

True

In [6]:
from sedona.core.enums import GridType
from sedona.core.enums import IndexType
from sedona.core.spatialOperator import JoinQueryRaw

# Partition the administrative shapes with a KDB-tree, then partition the fire
# points with that same partitioner so both RDDs share one grid.
rdd_adm.spatialPartitioning(GridType.KDBTREE)
adm_partitioner = rdd_adm.getPartitioner()
rdd_firms.spatialPartitioning(adm_partitioner)

# Build the quad-tree index on the spatially partitioned RDD (second argument),
# because the index is used by the join query below.
build_on_partitioned_rdd = True
rdd_firms.buildIndex(IndexType.QUADTREE, build_on_partitioned_rdd)

# Third argument: use the index. Fourth argument: also match geometries that
# only intersect the boundary.
use_index = True
consider_boundary_intersection = True
query_result = JoinQueryRaw.SpatialJoinQueryFlat(
    rdd_firms,
    rdd_adm,
    use_index,
    consider_boundary_intersection,
)

                                                                                

In [8]:
# Attribute names carried by each side of the join result. The printed schema
# of the result shows the adm columns next to leftgeometry and the FIRMS
# columns next to rightgeometry.
adm_columns = [
    "adm0",
    "adm1",
    "adm2",
    "adm3",
]
firms_columns = [
    "brightness",
    "frp",
    "scan",
    "track",
    "confidence",
    "type",
    "instrument",
    "datetime",
    "year",
]

# Turn the pair RDD produced by the spatial join back into a DataFrame.
sdf_firm_adm = Adapter.toDf(
    query_result,
    adm_columns,
    firms_columns,
    spark,
)

In [9]:
# Sanity check: show the first joined row (shape attributes + fire attributes).
sdf_firm_adm.first()

                                                                                

Row(leftgeometry=<shapely.geometry.polygon.Polygon object at 0x7f81fd30b580>, adm0='ARG', adm1='ARG.8_1', adm2='ARG.8.1_1', adm3='null', rightgeometry=<shapely.geometry.point.Point object at 0x7f81fd30b0d0>, brightness='353.5', frp='33.81', scan='0.56', track='0.69', confidence='n', type='0', instrument='VIIRS', datetime='2021-01-08 18:36', year='2021')

In [10]:
# Inspect the column names and types of the join result before casting.
sdf_firm_adm.printSchema()

root
 |-- leftgeometry: geometry (nullable = true)
 |-- adm0: string (nullable = true)
 |-- adm1: string (nullable = true)
 |-- adm2: string (nullable = true)
 |-- adm3: string (nullable = true)
 |-- rightgeometry: geometry (nullable = true)
 |-- brightness: string (nullable = true)
 |-- frp: string (nullable = true)
 |-- scan: string (nullable = true)
 |-- track: string (nullable = true)
 |-- confidence: string (nullable = true)
 |-- type: string (nullable = true)
 |-- instrument: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- year: string (nullable = true)



22/03/06 12:50:20 ERROR TaskSchedulerImpl: Lost executor 1 on 10.1.220.149: The executor with id 1 was deleted by a user or the framework.
22/03/06 12:50:29 ERROR TaskSchedulerImpl: Lost executor 2 on 10.1.220.144: The executor with id 2 was deleted by a user or the framework.


In [8]:
# Dynamic partition overwrite: with mode("overwrite") only the
# (adm0, adm1, year) partitions present in this DataFrame are replaced;
# partitions saved earlier for other values are left untouched.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Expected column names and types of the saved table. DataFrameWriter has no
# "schema" option, so this is checked against the DataFrame below instead of
# being passed to the writer. The nullable flags are informational only.
schema = StructType([
    StructField("geometry", StringType(), False),
    StructField("brightness", FloatType(), True),
    StructField("frp", FloatType(), True),
    StructField("scan", FloatType(), True),
    StructField("track", FloatType(), True),
    StructField("confidence", StringType(), True),
    StructField("type", IntegerType(), True),
    StructField("instrument", StringType(), True),
    StructField("datetime", TimestampType(), True),
    StructField("year", IntegerType(), True),
    StructField("adm0", StringType(), False),
    StructField("adm1", StringType(), False),
    StructField("adm2", StringType(), False),
    StructField("adm3", StringType(), True)
])

# Cast the string columns coming out of the spatial join to their final types.
# Every expression is aliased explicitly so the output column names (used by
# partitionBy below) do not depend on how Spark names a bare CAST.
sdf_firm_adm = sdf_firm_adm.selectExpr(
    "ST_AsText(rightgeometry) AS geometry",
    "CAST(brightness as FLOAT) as brightness",
    "CAST(frp as FLOAT) as frp",
    "CAST(scan as FLOAT) as scan",
    "CAST(track as FLOAT) as track",
    "confidence",
    "CAST(type as INT) as type",
    "instrument",
    "TO_TIMESTAMP(datetime, 'yyyy-MM-dd HH:mm') as datetime",
    "CAST(year as INT) as year",
    "adm0",
    "adm1",
    "adm2",
    # The join result carries a missing adm3 as the literal string 'null'
    # (see the first() output above); store a real NULL instead.
    "NULLIF(adm3, 'null') as adm3"
).repartition("adm0", "adm1", "year")

# Fail before writing if the result drifted from the expected layout.
expected_columns = [(field.name, field.dataType) for field in schema.fields]
actual_columns = [(field.name, field.dataType) for field in sdf_firm_adm.schema.fields]
assert actual_columns == expected_columns, (
    "unexpected output columns: %s" % actual_columns
)

sdf_firm_adm.write \
    .partitionBy("adm0", "adm1", "year") \
    .mode("overwrite") \
    .format("parquet") \
    .save("s3a://dutrajardim-fi/tables/firms.parquet")

                                                                                

In [9]:
# End of this notebook: stop the Spark session.
spark.stop()

22/03/02 23:27:36 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.


In [1]:
import configparser

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import (expr, broadcast, substring as s_substring, trim as s_trim)
from pyspark.sql.types import (StructType, StructField, IntegerType, StringType, FloatType)

from sedona.register import SedonaRegistrator
from sedona.utils import KryoSerializer, SedonaKryoRegistrator
from sedona.utils.adapter import Adapter
from sedona.core.formatMapper.shapefileParser import ShapefileReader 

In [2]:
# Build the Spark configuration: start from ../sparkconf.cfg, then add the
# Kryo/Sedona serializer settings and the executor dependency archive.
sparkConf = SparkConf()
parser = configparser.ConfigParser()
# ConfigParser lower-cases option names by default; keep them verbatim because
# keys such as "spark.executorEnv.PYTHONPATH" are mixed-case.
parser.optionxform = str
# Context manager so the config file handle is closed once parsing is done
# (still raises if the file is missing, like the bare open() did).
with open('../sparkconf.cfg') as cfg_file:
    parser.read_file(cfg_file)

# Copy every option of every section into the Spark configuration.
for section, config in parser.items():
    for key, value in config.items():
        sparkConf.set(key, value)

# Serialize with Kryo and register Sedona's Kryo registrator.
sparkConf.set("spark.serializer", KryoSerializer.getName)
sparkConf.set("spark.kryo.registrator", SedonaKryoRegistrator.getName)
# Raised Kryo buffer ceiling. NOTE(review): the value has no unit suffix;
# presumably read as MiB - confirm against the Spark configuration docs.
sparkConf.set("spark.kryoserializer.buffer.max", "512")

# Python dependencies shipped to the executors; the "#deps" suffix names the
# directory the archive is unpacked into, which the two executorEnv paths below
# point at.
# NOTE(review): this is a presigned URL with an embedded credential and
# signature, valid for 604800 s (7 days) from 2022-02-28. It is a secret-like
# value committed to the notebook and it expires; load it from the config file
# or an environment variable instead.
sparkConf.set("spark.archives", "https://minio.minio-tenant/dutrajardim-etls/dependencies.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=admin%2F20220228%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220228T111424Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=a20e1526dd993354ae90c90c7cb76da4726a5ae00c9920ceec8e2a35f3482c3c#deps")
sparkConf.set("spark.executorEnv.PYTHONPATH", "/opt/spark/work-dir/deps")
sparkConf.set("spark.executorEnv.LD_LIBRARY_PATH", "/opt/spark/work-dir/deps/Shapely.libs")

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()