# **Dataset Integration**

In [39]:
import os
import sys
import pyspark
import pkg_resources

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, lit, expr
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import split
from pyspark.sql.functions import col
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, DateType
from pyspark.sql.functions import year  # used to extract year from date, could do this manually as well
from pyspark.sql import Window
from pyspark.sql.functions import sum as pyspark_sum
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.spark import *
import geopandas as gpd
from py4j.java_gateway import java_import


In [15]:
sys.path.append("/usr/local/python-env/py39/lib/python3.9/site-packages")

print(pyspark.__version__)

print(sys.executable)

3.5.1
/usr/bin/python3.9


In [16]:
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.9'

In [17]:
sedona_version = pkg_resources.get_distribution("apache-sedona").version
print(f"Apache Sedona version: {sedona_version}")

Apache Sedona version: 1.5.1


In [18]:
print(os.environ['SPARK_HOME'])
print(os.environ['PYSPARK_PYTHON'])

/usr/local/spark/latest
/usr/bin/python3.9


In [19]:
spark = SparkSession \
    .builder \
    .appName('DatasetIntegration') \
    .master('spark://columbus-oh.cs.colostate.edu:30800') \
    .config("spark.yarn.resourcemanager.address", "columbia.cs.colostate.edu:30799") \
    .config("spark.serializer", KryoSerializer.getName) \
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
    .config('spark.jars.packages',
            'org.apache.sedona:sedona-spark-3.5_2.12:1.5.1,'
            'org.datasyslab:geotools-wrapper:1.5.1-28.2') \
    .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all') \
    .getOrCreate()

# Set log level to DEBUG
spark.sparkContext.setLogLevel("ERROR")

sedona = SedonaContext.create(spark)
SedonaRegistrator.registerAll(spark)

# create a logger
logger = spark._jvm.org.apache.log4j.LogManager.getLogger(__name__)
logger.info("Pyspark initialized...")

https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1
Ivy Default Cache set to: /s/chopin/a/grad/flarrieu/.ivy2/cache
The jars for the packages stored in: /s/chopin/a/grad/flarrieu/.ivy2/jars
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ca90b1d4-2057-4406-b0ed-337f6a0595cb;1.0
	confs: [default]


:: loading settings :: url = jar:file:/usr/local/spark/3.5.0-with-hadoop3.3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-spark-3.5_2.12;1.5.1 in central
	found org.apache.sedona#sedona-common;1.5.1 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.locationtech.jts#jts-core;1.19.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.locationtech.spatial4j#spatial4j;0.8 in central
	found com.google.geometry#s2-geometry;2.0.0 in central
	found com.google.guava#guava;25.1-jre in central
	found com.google.code.findbugs#jsr305;3.0.2 in user-list
	found org.checkerframework#checker-qual;2.0.0 in central
	found com.google.errorprone#error_prone_annotations;2.1.3 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.14 in central
	found com.uber#h3;4.1.1 in central
	found net.sf.geographiclib#GeographicLib-Java;1.52 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.2 in central
	found org.checkerframework#checker-qual;3.10.0 in central
	found com.google.err

# **Helper Functions**

In [20]:
def append_to_csv(df: DataFrame):
    path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"
    # Here we use mode 'append' to add to the existing file
    df.write.csv(path=path, mode='append', header=True)

In [21]:
java_import(spark._jvm, 'org.apache.hadoop.fs.Path')
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

def get_file_to_df(file_name: str): 
    data_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/"
    geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

    print(f"Processing file: {file_name}")
    
    df = spark.read.schema(geojsonSchema).json(data_directory + file_name, multiLine=True)
    
    # Explode the features array to create a row for each feature and select the columns
    df = (df
        .select(F.explode("features").alias("features"))
        .select("features.*")
        .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )
    
    return df

In [28]:
java_import(spark._jvm, 'org.apache.hadoop.fs.Path')

def get_files_recursive(path):
    file_status_arr = fs.listStatus(spark._jvm.Path(path))
    
    file_paths = []
    
    for file_status in file_status_arr:
        if file_status.isDirectory():
            file_paths += get_files_recursive(file_status.getPath().toString())
        elif file_status.getPath().getName().endswith(('.json', '.geojson')):
            file_paths.append(file_status.getPath().toString())
    
    print(file_paths)
    return file_paths

# **Get Blocks (Leaf Nodes)**

In [29]:
# Initialize a Hadoop file system 
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

blocks_dataframes = {}

# Directory containing the files
json_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/"

# Define the schema for the GeoJSON data
geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

files = get_files_recursive(json_directory)
# Load each JSON file into a DataFrame and store it in the dictionary
for file_path in files:
    if file_path:
        file_name = file_path.split('/')[-1]
        
        # Print the file path
        print(f"Processing file: {file_path}")
        
        # Read the GeoJSON file using the defined schema using sedona into a spark dataframe
        df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)
        
        # Explode the features array to create a row for each feature and select the columns
        df = (df
            .select(F.explode("features").alias("features"))
            .select("features.*")
            # Use Sedona's ST_GeomFromGeoJSON function to convert the geometry string to a geometry object
            .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
            )
        
        blocks_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alabama.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alaska.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/AmericanSamoa.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arizona.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arkansas.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/California.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Colorado.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/CommonwealthoftheNorthernMarianaIslands.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Connecticut.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Delaware.geojson', 'hdfs://columbus-oh.cs.colostate

# **Get Dataset**

In [30]:
df_dataset = get_file_to_df('FloodZones.geojson')

Processing file: FloodZones.geojson


# **Integrate Dataset**

In [31]:
blocks_dataframes['Alabama.geojson'].show(n=1, truncate=False)
df_dataset.show(n=1, truncate=False)

                                                                                

+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|type   |geometry                                                                                                                                                                                                                                                                                                                                                                               

[Stage 7:>                                                          (0 + 1) / 1]

+-------+--------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|type   |geometry                                                                              |properties                                                                                                                                                                                                                                                                                                                                                                          |
+-------+---------------------------------------------------

                                                                                

In [40]:
def integrate_dataset(dataset_key: str, df_dataset: DataFrame):
    df_dataset = df_dataset.withColumnRenamed("geometry", "dataset_geometry")
    df_dataset = df_dataset.withColumnRenamed("properties", "dataset_properties")

    for blocks_file_name, df_blocks in blocks_dataframes.items():
        logger.info(f"Integrating blocks from {blocks_file_name} with dataset {dataset_key}")
        df_blocks = df_blocks.withColumnRenamed("geometry", "blocks_geometry")
        df_blocks = df_blocks.withColumnRenamed("properties", "blocks_properties")

        # Using a more appropriate spatial function
        df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
            expr("ST_Intersects(dataset_geometry, blocks_geometry)") |
            expr("ST_Contains(dataset_geometry, blocks_geometry)") |
            expr("ST_Contains(blocks_geometry, dataset_geometry)") |
            expr("ST_Touches(dataset_geometry, blocks_geometry)") |
            expr("ST_Overlaps(dataset_geometry, blocks_geometry)") 
        )

        # Assemble the graph relationships
        df_relationships = df_blocks_partOf_dataset.select(
            col(f"dataset_properties.{dataset_key}").alias("Subject"),
            lit("isPartOf").alias("Relationship"),
            col("blocks_properties.NAME").alias("Object")
        )
        
        #append_to_csv(df_relationships)
        return df_relationships

In [41]:
df_relationships = integrate_dataset('DFIRM_ID', df_dataset)

In [37]:
df_relationships.show(n=1, truncate=False)

24/04/19 16:13:28 ERROR TaskSchedulerImpl: Lost executor 2 on 129.82.44.141: Command exited with code 52


+-------+------------+------+
|Subject|Relationship|Object|
+-------+------------+------+
|01001C |isPartOf    |2     |
+-------+------------+------+
only showing top 1 row



                                                                                

In [45]:
df_relationships.count()

                                                                                

(6487, 3)