# **Dataset Spatial Partitioning**


### Load data into Spark

In [1]:
import sys
# Add the py39 site-packages directory to the module search path so that
# packages installed there can be imported by this kernel.
sys.path.append("/usr/local/python-env/py39/lib/python3.9/site-packages")

import pyspark
# Print the version of the PySpark package that was actually imported.
print(pyspark.__version__)

# Print the path of the Python interpreter running this kernel.
print(sys.executable)

3.5.1
/usr/bin/python3.9


### Initialize a SparkSession

Initialize a test session to ensure the SparkSession is working properly. This will connect to the resource manager node that is running the YARN cluster. If we visit the YARN web portal, we can see that the Spark application is running.

Ensure that the PySpark library is loaded from the local `usr` directory.


In [2]:
import os
# PYSPARK_PYTHON tells PySpark which Python executable to use; it is set here,
# before the SparkSession is created further down.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.9'

In [3]:
import pkg_resources

# Look up the version of the installed apache-sedona distribution and print it.
sedona_version = pkg_resources.get_distribution("apache-sedona").version
print(f"Apache Sedona version: {sedona_version}")

Apache Sedona version: 1.5.1


In [4]:
# Print the SPARK_HOME and PYSPARK_PYTHON environment variables.
# Indexing os.environ raises KeyError if either variable is unset.
print(os.environ['SPARK_HOME'])
print(os.environ['PYSPARK_PYTHON'])

/usr/local/spark/latest
/usr/bin/python3.9


In [5]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, DateType
from pyspark.sql.functions import year, to_json, expr, col, split, lit
from pyspark.sql import Window
from pyspark.sql.functions import sum as pyspark_sum
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.spark import *
import geopandas as gpd
from py4j.java_gateway import java_import
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, LongType, IntegerType, DoubleType

Skipping SedonaKepler import, verify if keplergl is installed


## Now to make the app

In [6]:
# Build (or reuse) the SparkSession for this notebook: Kryo serialization with
# Sedona's registrator, plus the Sedona and GeoTools jars resolved as packages.
# NOTE(review): the master is a standalone spark:// URL on columbus-oh, while
# spark.yarn.resourcemanager.address points at columbia; the YARN setting may
# have no effect with this master — confirm.
spark = (
    SparkSession.builder
    .appName('Dataset Spatial Partitioning')
    .master('spark://columbus-oh.cs.colostate.edu:30800')
    .config("spark.yarn.resourcemanager.address", "columbia.cs.colostate.edu:30799")
    .config("spark.executor.memory", "3g")
    .config("spark.executor.memoryOverhead", "512m")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "500m")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-3.5_2.12:1.5.1,'
        'org.datasyslab:geotools-wrapper:1.5.1-28.2',
    )
    .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all')
    .getOrCreate()
)

# Restrict logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

# Set up Sedona on the session.
sedona = SedonaContext.create(spark)
SedonaRegistrator.registerAll(spark)

# JVM-side log4j logger named after this module.
# NOTE(review): with the log level set to ERROR above, this INFO message is
# presumably not emitted — confirm.
logger = spark._jvm.org.apache.log4j.LogManager.getLogger(__name__)
logger.info("Pyspark initialized...")

https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1
Ivy Default Cache set to: /s/chopin/a/grad/flarrieu/.ivy2/cache
The jars for the packages stored in: /s/chopin/a/grad/flarrieu/.ivy2/jars
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-cae7e06b-e6b3-4b7f-bd45-386b17a8f937;1.0
	confs: [default]
	found org.apache.sedona#sedona-spark-3.5_2.12;1.5.1 in central


:: loading settings :: url = jar:file:/usr/local/spark/3.5.0-with-hadoop3.3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-common;1.5.1 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.locationtech.jts#jts-core;1.19.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.locationtech.spatial4j#spatial4j;0.8 in central
	found com.google.geometry#s2-geometry;2.0.0 in central
	found com.google.guava#guava;25.1-jre in central
	found com.google.code.findbugs#jsr305;3.0.2 in user-list
	found org.checkerframework#checker-qual;2.0.0 in central
	found com.google.errorprone#error_prone_annotations;2.1.3 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.14 in central
	found com.uber#h3;4.1.1 in central
	found net.sf.geographiclib#GeographicLib-Java;1.52 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.2 in central
	found org.checkerframework#checker-qual;3.10.0 in central
	found com.google.errorprone#error_prone_annotations;2.5.1 in central
	found org.apac

In [7]:
# Import Hadoop's Path class into the JVM view and obtain a FileSystem handle
# built from the Spark context's Hadoop configuration.
java_import(spark._jvm, 'org.apache.hadoop.fs.Path')
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(
    spark._jsc.hadoopConfiguration()
)

# **Helper Functions**

In [8]:
def append_to_csv(df: DataFrame):
    """Append the rows of ``df`` to the base-graph CSV location on HDFS.

    Args:
        df: DataFrame to write; it is written as CSV with a header row.
    """
    target = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"
    # 'append' adds the new output to whatever already exists at the target.
    writer = df.write
    writer.csv(target, mode='append', header=True)

In [9]:
def load_and_display_graph() -> DataFrame:
    """Read the base-graph CSV from HDFS, print it, and return it.

    Returns:
        DataFrame with the nullable string columns ``Subject``,
        ``Relationship`` and ``Object``.
    """
    # Each column of the graph triple is a nullable string.
    triple_schema = StructType(
        [
            StructField(column_name, StringType(), True)
            for column_name in ("Subject", "Relationship", "Object")
        ]
    )

    graph_df = spark.read.csv(
        "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv",
        header=True,
        schema=triple_schema,
    )
    # Print the leading rows without truncating long cell values.
    graph_df.show(truncate=False)
    return graph_df

In [10]:
def get_file_to_df(file_name: str):
    """Load one GeoJSON file from the HDFS input directory, one row per feature.

    Args:
        file_name: Name of the file, relative to the HDFS input directory.

    Returns:
        DataFrame holding the fields of each feature struct (``type``,
        ``geometry``, ``properties``), with ``geometry`` converted by
        Sedona's ``ST_GeomFromGeoJSON``.
    """
    print(f"Processing file: {file_name}")

    source_path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/" + file_name
    feature_collection_schema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

    # multiLine=True lets a single JSON record span several lines of the file.
    collection_df = spark.read.schema(feature_collection_schema).json(source_path, multiLine=True)

    # One row per element of the features array, flattened to the struct's fields.
    features_df = collection_df.select(F.explode("features").alias("features")).select("features.*")

    # Replace the GeoJSON geometry string with a Sedona geometry object.
    return features_df.withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))

In [11]:

def get_files_recursive(path):
    """Collect paths of ``.geojson`` entries under ``path``, descending into directories.

    Args:
        path: Location to list through the module-level Hadoop ``fs`` handle.

    Returns:
        List of path strings for non-directory entries whose name ends with
        ``.geojson``. The list is also printed before it is returned, so
        every recursion level prints its own partial result.
    """
    geojson_paths = []

    for entry in fs.listStatus(spark._jvm.Path(path)):
        entry_path = entry.getPath()
        if entry.isDirectory():
            # Directories are never collected themselves, only searched.
            geojson_paths.extend(get_files_recursive(entry_path.toString()))
        elif entry_path.getName().endswith('.geojson'):
            geojson_paths.append(entry_path.toString())

    print(geojson_paths)
    return geojson_paths

In [12]:
def load_directory_into_df(directory_path: str):
    """Load every GeoJSON file found under ``directory_path`` into its own DataFrame.

    Args:
        directory_path: Root location passed to ``get_files_recursive``.

    Returns:
        Dict mapping each file's base name to a DataFrame with one row per
        feature, whose ``geometry`` column is converted by Sedona's
        ``ST_GeomFromGeoJSON``. Files that share a base name overwrite each
        other, so the last one processed wins.
    """
    geojson_schema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"
    file_dataframes = {}

    for file_path in get_files_recursive(directory_path):
        # The base name (text after the last '/') is the dictionary key.
        file_name = file_path.rsplit('/', 1)[-1]
        print(f"Processing file: {file_name}")

        collection_df = spark.read.schema(geojson_schema).json(file_path, multiLine=True)
        exploded_df = collection_df.select(F.explode("features").alias("features"))
        file_dataframes[file_name] = (
            exploded_df
            .select("features.*")
            .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )

    return file_dataframes

# **Partition Dataset Based on States**

In [13]:
df_states = get_file_to_df('States.geojson')

Processing file: States.geojson


In [14]:
df_wildfires = get_file_to_df('Wildfires.geojson')

Processing file: Wildfires.geojson


In [15]:
# Bring every state row to the driver so the states can be processed one at a time.
states = df_states.collect()
for state in states:
    # State name with spaces removed; used to build the output path below.
    state_name = state['properties']['NAME'].replace(" ", '')
    state_geometry = state['geometry']

    # Keep the wildfires whose centroid lies inside this state's geometry.
    # The state geometry is interpolated into the SQL expression as text and
    # parsed again by ST_GeomFromWKT.
    # NOTE(review): presumably the collected geometry stringifies to WKT — confirm.
    filtered_wildfires = df_wildfires.filter(
        expr(f"ST_Contains(ST_GeomFromWKT('{state_geometry}'), ST_Centroid(geometry))")
    )

    hdfs_output_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/WildfiresByState/{state_name}Wildfires.geojson"

    # Attempt to save this state's wildfires to HDFS.
    try:
        # Spark has no native GeoJSON writer, so the rows are written with the
        # JSON data source; "overwrite" replaces any existing output at the path.
        # NOTE(review): the result is presumably Spark JSON output rather than a
        # single GeoJSON FeatureCollection, despite the .geojson name — confirm.
        filtered_wildfires.write.format("json").mode("overwrite").save(hdfs_output_path)
        print(f"GeoJSON file has been successfully saved to HDFS at {hdfs_output_path}.")
    except Exception as e:
        # Report the failure and continue with the next state.
        print(f"Failed to write GeoJSON file for {state_name} to HDFS:", e)

                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/MississippiWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/NorthCarolinaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/OklahomaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/VirginiaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/WestVirginiaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/LouisianaWildfires.geojson.


24/04/22 10:52:43 ERROR TaskSchedulerImpl: Lost executor 0 on 129.82.44.144: Command exited with code 50
                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/MichiganWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/MassachusettsWildfires.geojson.


24/04/22 11:01:42 ERROR TaskSchedulerImpl: Lost executor 4 on 129.82.44.141: Command exited with code 50
                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/IdahoWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/FloridaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/NebraskaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/WashingtonWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/NewMexicoWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/PuertoRicoWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/SouthDakotaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/TexasWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/CaliforniaWildfires.geojson.


24/04/22 11:14:03 ERROR TaskSchedulerImpl: Lost executor 11 on 129.82.44.141: Command exited with code 50
                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/AlabamaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/GeorgiaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/PennsylvaniaWildfires.geojson.


24/04/22 11:15:40 ERROR TaskSchedulerImpl: Lost executor 7 on 129.82.44.143: Command exited with code 50
                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/MissouriWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/ColoradoWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/UtahWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/TennesseeWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/WyomingWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/NewYorkWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/KansasWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/AlaskaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/NevadaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/IllinoisWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/VermontWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/MontanaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/IowaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/SouthCarolinaWildfires.geojson.


                                                                                

GeoJSON file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/WildfiresByState/NewHampshireWildfires.geojson.


[Stage 42:>                                                         (0 + 1) / 1]