# **Dataset Integration**

In [1]:
import os
import sys
import pyspark
import pkg_resources

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, lit, expr, broadcast, to_json, explode, split, concat
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, DateType
from pyspark.sql import Window
from pyspark.sql.functions import sum as pyspark_sum
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.spark import *
import geopandas as gpd
from py4j.java_gateway import java_import


Skipping SedonaKepler import, verify if keplergl is installed


In [2]:
# Make the Python 3.9 site-packages directory below importable from this kernel.
sys.path.append("/usr/local/python-env/py39/lib/python3.9/site-packages")

# Report the PySpark version and the interpreter running this kernel, one per line.
print(pyspark.__version__, sys.executable, sep="\n")

3.5.1
/usr/bin/python3.9


In [3]:
# PYSPARK_PYTHON selects the Python executable PySpark launches; set it before
# the SparkSession is created further down.
os.environ.update({'PYSPARK_PYTHON': '/usr/bin/python3.9'})

In [4]:
# Look up the installed apache-sedona distribution and report its version.
sedona_dist = pkg_resources.get_distribution("apache-sedona")
sedona_version = sedona_dist.version
print(f"Apache Sedona version: {sedona_version}")

Apache Sedona version: 1.5.1


In [5]:
# Show the two environment variables the Spark session depends on.
# A missing variable raises KeyError, which is the desired early failure.
for env_name in ('SPARK_HOME', 'PYSPARK_PYTHON'):
    print(os.environ[env_name])

/usr/local/spark/latest
/usr/bin/python3.9


In [6]:
    # Create (or reuse) the Spark session used by every cell below, configured
    # for Apache Sedona (Kryo serialization plus the Sedona and GeoTools jars).
    spark = (
        SparkSession.builder
        .appName('DatasetIntegration')
        .master('spark://columbus-oh.cs.colostate.edu:30800')
        # NOTE(review): the master above is a standalone spark:// URL, so this
        # YARN setting looks unused - confirm before removing it.
        .config("spark.yarn.resourcemanager.address", "columbia.cs.colostate.edu:30799")
        .config("spark.executor.memory", "3g")
        .config("spark.executor.memoryOverhead", "512m")
        .config("spark.memory.offHeap.enabled", True)
        .config("spark.memory.offHeap.size", "500m")
        # Kryo serializer together with Sedona's Kryo registrator.
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        # Sedona and the GeoTools wrapper are resolved as packages at start-up.
        .config('spark.jars.packages',
                'org.apache.sedona:sedona-spark-3.5_2.12:1.5.1,'
                'org.datasyslab:geotools-wrapper:1.5.1-28.2')
        .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all')
        .getOrCreate()
    )

    # Restrict Spark's log output to ERROR and above.
    spark.sparkContext.setLogLevel("ERROR")

    # SedonaContext.create registers Sedona's types and ST_* SQL functions on
    # the session. The deprecated SedonaRegistrator.registerAll(spark) call that
    # used to follow it repeated the same registration and has been dropped.
    sedona = SedonaContext.create(spark)

    # JVM-side log4j logger named after this module.
    # NOTE(review): with the log level set to ERROR above, this INFO message is
    # presumably suppressed - raise the level or use print() if it should be seen.
    logger = spark._jvm.org.apache.log4j.LogManager.getLogger(__name__)
    logger.info("Pyspark initialized...")

https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1
Ivy Default Cache set to: /s/chopin/a/grad/flarrieu/.ivy2/cache
The jars for the packages stored in: /s/chopin/a/grad/flarrieu/.ivy2/jars
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bf24faaa-26a5-4785-bd22-610ac3d68eb8;1.0
	confs: [default]
	found org.apache.sedona#sedona-spark-3.5_2.12;1.5.1 in central


:: loading settings :: url = jar:file:/usr/local/spark/3.5.0-with-hadoop3.3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-common;1.5.1 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.locationtech.jts#jts-core;1.19.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.locationtech.spatial4j#spatial4j;0.8 in central
	found com.google.geometry#s2-geometry;2.0.0 in central
	found com.google.guava#guava;25.1-jre in central
	found com.google.code.findbugs#jsr305;3.0.2 in user-list
	found org.checkerframework#checker-qual;2.0.0 in central
	found com.google.errorprone#error_prone_annotations;2.1.3 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.14 in central
	found com.uber#h3;4.1.1 in central
	found net.sf.geographiclib#GeographicLib-Java;1.52 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.2 in central
	found org.checkerframework#checker-qual;3.10.0 in central
	found com.google.errorprone#error_prone_annotations;2.5.1 in central
	found org.apac

In [7]:
# Import Hadoop's Path class into the JVM view so it can be referenced as spark._jvm.Path.
java_import(spark._jvm, 'org.apache.hadoop.fs.Path')

# Hadoop FileSystem handle obtained from the session's Hadoop configuration.
hadoop_conf = spark._jsc.hadoopConfiguration()
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)

# **Helper Functions**

In [8]:
def create_csv(df: DataFrame, path: str, mode: str = 'overwrite'):
    """Write ``df`` to ``path`` as CSV with a header row.

    The previous implementation always used ``mode='append'``, so re-running a
    cell added a second copy of every row (and another header) to the same
    output directory. The default is now ``'overwrite'``, which makes the call
    idempotent; pass ``mode='append'`` explicitly to get the old behaviour.

    Args:
        df: DataFrame to write.
        path: Destination path handed to ``DataFrameWriter.csv``.
        mode: Spark save mode (for example ``'overwrite'`` or ``'append'``).
    """
    df.write.csv(path=path, mode=mode, header=True)

In [9]:
def append_to_csv(df: DataFrame, path: str):
    """Append the rows of ``df`` to the CSV output at ``path`` without a header row.

    Args:
        df: DataFrame whose rows are appended.
        path: Destination path handed to ``DataFrameWriter.csv``.
    """
    csv_writer = df.write.mode('append').option('header', False)
    csv_writer.csv(path)

In [10]:
def get_file_to_df(file_name: str):
    """Load a GeoJSON file from the input directory as one row per feature.

    Args:
        file_name: File name relative to the hard-coded HDFS input directory.

    Returns:
        DataFrame with the feature columns ``type``, ``geometry`` and
        ``properties``, where ``geometry`` has been passed through
        ``ST_GeomFromGeoJSON`` and ``properties`` is a string-to-string map.
    """
    data_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/"
    print(f"Processing file: {file_name}")

    # Explicit schema: geometry is read as a raw JSON string, properties as a map.
    geojson_schema =  "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

    # The file is read with multiLine=True, i.e. as multi-line JSON.
    feature_collection = spark.read.schema(geojson_schema).json(data_directory + file_name, multiLine=True)

    # One row per element of the features array ...
    exploded_features = feature_collection.select(explode("features").alias("features"))
    # ... flattened into the struct's own columns ...
    feature_columns = exploded_features.select("features.*")
    # ... with the geometry string converted by Sedona.
    return feature_columns.withColumn("geometry", expr("ST_GeomFromGeoJSON(geometry)"))

In [11]:
def get_files_recursive(path):
    """Collect the paths of all ``.json`` / ``.geojson`` files under ``path``.

    Directories are descended into recursively. The list gathered at each
    level is printed before it is returned.

    Args:
        path: Directory path, listed through the Hadoop ``fs`` handle.

    Returns:
        List of file path strings.
    """
    wanted_suffixes = ('.json', '.geojson')
    found = []

    for entry in fs.listStatus(spark._jvm.Path(path)):
        entry_path = entry.getPath()
        if entry.isDirectory():
            # Recurse and keep everything found below this directory.
            found.extend(get_files_recursive(entry_path.toString()))
            continue
        if entry_path.getName().endswith(wanted_suffixes):
            found.append(entry_path.toString())

    print(found)
    return found

# **Get Blocks (Leaf Nodes)**

In [12]:
# Maps each blocks file name (last path component) to its feature DataFrame.
blocks_dataframes = {}

# Directory containing the block GeoJSON files
json_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/"

files = get_files_recursive(json_directory)

for file_path in files:
    # Skip empty path strings.
    if not file_path:
        continue

    file_name = file_path.split('/')[-1]

    print(f"Processing file: {file_path}")

    geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

    df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)

    # One row per feature, flattened to the feature's own columns.
    df = df.select(explode("features").alias("features"))
    df = df.select("features.*")
    # Sedona's ST_GeomFromGeoJSON turns the geometry string into a geometry object.
    df = df.withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))

    blocks_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alabama.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alaska.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/AmericanSamoa.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arizona.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arkansas.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/California.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Colorado.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/CommonwealthoftheNorthernMarianaIslands.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Connecticut.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Delaware.geojson', 'hdfs://columbus-oh.cs.colostate

# Initialize Dataset Nodes

In [None]:
dataset = 'GeneralManufacturingFacilities'

# Output locations for this dataset's node and edge CSVs.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

df = get_file_to_df(f'{dataset}.geojson')

# Property used as the foreign id from which Node_ID is built.
dataset_foreign_id = "properties.NAME"

# Sample feature, showing which properties are available:
# "properties": {
#     "OBJECTID": 3,
#     "UNIQUE_ID": "N/A",
#     "NAME": "JWS REFRIGERATION & AIR CONDITIONING",
#     "PHONE": "(671) 646-7662",
#     "FAX": "NOT AVAILABLE",
#     "ADDRESS": "290 TUN JOSE SALAS STREET",
#     "ADDRESS2": "SUITE A",
#     "CITY": "TAMUNING",
#     "STATE": "GU",
#     "ZIP": "96913",
#     "ZIP4": "N/A",
#     "COUNTY": "GUAM",
#     "FIPS": "66010",
#     "MADDRESS": "290 TUN JOSE SALAS STREET",
#     "MCITY": "TAMUNING",
#     "MSTATE": "GU",
#     "MZIP": "96913",
#     "MZIP4": "N/A",
#     "DIRECTIONS": "NOT AVAILABLE",
#     "GEOPREC": "BLOCKFACE",
#     "EMP": 0,
#     "PRODUCT": "REFRIGERATION AND AIR-CONDITIONING",
#     "SIC": "3585",
#     "SIC2": "2542",
#     "SIC3": "N/A",
#     "SIC4": "N/A",
#     "NAICS": "N/A",
#     "NAICSDESCR": "NOT AVAILABLE",
#     "WEB": "WWW.JWSGUAM.COM",
#     "LONGITUDE": 144.7883065,
#     "LATITUDE": 13.4912295
# }

dataset_prefix = lit(dataset + "_")

# Node columns, in the order they are attached (Node_ID is the prefixed foreign id).
node_columns = [
    ("Node_ID", concat(dataset_prefix, col(dataset_foreign_id))),
    ("Type", lit("GeneralManufacturingFacility")),
    ("Name", col("properties.NAME")),
    ("Phone", col("properties.PHONE")),
    ("Fax", col("properties.FAX")),
    ("Address", col("properties.ADDRESS")),
    ("City", col("properties.CITY")),
    ("State", col("properties.STATE")),
    ("Zip", col("properties.ZIP")),
    ("County", col("properties.COUNTY")),
]

df_nodes = df
for node_column_name, node_column_expr in node_columns:
    df_nodes = df_nodes.withColumn(node_column_name, node_column_expr)

# The raw geometry and the properties map are not written to the node file.
df_nodes = df_nodes.drop("geometry", "properties")

df_nodes.show(n=5)

create_csv(df_nodes, node_path)

# Create Empty Edges

In [None]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
edge_fields = [
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
]
schema = StructType(edge_fields)

In [None]:
# Empty DataFrame that carries only the edge schema.
empty_rdd = spark.sparkContext.emptyRDD()
empty_df = spark.createDataFrame(empty_rdd, schema)

In [None]:
# Replace any existing edge output with the empty, header-enabled frame.
empty_df.write.mode('overwrite').option('header', True).csv(edge_path)

# Integrate Nodes in Base Graph

In [None]:
base_graph_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Dataset side of the join, with geometry/properties renamed so they stay
# distinguishable from the blocks' columns of the same name.
df_dataset = (
    df
    .withColumnRenamed("geometry", "dataset_geometry")
    .withColumnRenamed("properties", "dataset_properties")
)

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = (
        df_blocks
        .withColumnRenamed("geometry", "blocks_geometry")
        .withColumnRenamed("properties", "blocks_properties")
    )

    # A (block, feature) pair is kept when any one of these predicates holds.
    intersects = expr("ST_Intersects(dataset_geometry, blocks_geometry)")
    centroid_in_block = expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))")
    touches = expr("ST_Touches(dataset_geometry, blocks_geometry)")
    overlaps = expr("ST_Overlaps(dataset_geometry, blocks_geometry)")

    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        intersects | centroid_in_block | touches | overlaps
    )

    dataset_prefix = lit(dataset + "_")
    dataset_node_id = concat(dataset_prefix, col("dataset_properties.NAME"))
    block_name = col("blocks_properties.NAME")

    # feature --isPartOf--> block
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        dataset_node_id.alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        block_name.alias("Object"),
    )

    # block --Contains--> feature (built, but currently not written anywhere)
    df_contains_relationships = df_blocks_partOf_dataset.select(
        block_name.alias("Subject"),
        lit("Contains").alias("Relationship"),
        dataset_node_id.alias("Object"),
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Updating the base graph is disabled for now:
    # append_to_csv(df_contains_relationships, base_graph_path)

# HydroCarbonGasLiquidPipelines

In [13]:
dataset = 'HydroCarbonGasLiquidPipelines'

# Output locations for this dataset's node and edge CSVs.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

df = get_file_to_df(f'{dataset}.geojson')

# Sample feature, showing which properties are available:
# "properties": {
#     "FID": 28,
#     "Opername": "ENERGY TRANSFER",
#     "Pipename": "West Texas Pipeline",
#     "Shape_Leng": 9.96270805248,
#     "Shape__Length": 1148322.03802495
# }

dataset_prefix = lit(dataset + "_")

# Attach the node columns (Node_ID is the prefixed foreign id, here FID),
# then drop the raw geometry and the properties map.
df_nodes = (
    df
    .withColumn("Node_ID", concat(dataset_prefix, col("properties.FID")))
    .withColumn("Type", lit("HydroCarbonGasLiquidPipelines"))
    .withColumn("Opername", col("properties.Opername"))
    .withColumn("Pipename", col("properties.Pipename"))
    .drop("geometry", "properties")
)

df_nodes.show(n=5)

create_csv(df_nodes, node_path)

Processing file: HydroCarbonGasLiquidPipelines.geojson


                                                                                

+--------------------+--------------------+-------------------+--------------------+
|                Type|             Node_ID|           Opername|            Pipename|
+--------------------+--------------------+-------------------+--------------------+
|HydroCarbonGasLiq...|HydroCarbonGasLiq...|    ENERGY TRANSFER| West Texas Pipeline|
|HydroCarbonGasLiq...|HydroCarbonGasLiq...|ENTERPRISE PRODUCTS|Hobbs East Gathering|
|HydroCarbonGasLiq...|HydroCarbonGasLiq...|      NUSTAR ENERGY|      LPG to Reynosa|
|HydroCarbonGasLiq...|HydroCarbonGasLiq...|ENTERPRISE PRODUCTS|                ATEX|
|HydroCarbonGasLiq...|HydroCarbonGasLiq...|ENTERPRISE PRODUCTS|East Leg - East Loop|
+--------------------+--------------------+-------------------+--------------------+
only showing top 5 rows



                                                                                

In [14]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
edge_fields = [
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
]
schema = StructType(edge_fields)

# Empty DataFrame that carries only the edge schema.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

# Replace any existing edge output with the empty, header-enabled frame.
empty_df.write.mode('overwrite').option('header', True).csv(edge_path)

In [15]:
base_graph_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Dataset side of the join, with geometry/properties renamed so they stay
# distinguishable from the blocks' columns of the same name.
df_dataset = df.withColumnRenamed("geometry", "dataset_geometry") \
                    .withColumnRenamed("properties", "dataset_properties")

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = df_blocks.withColumnRenamed("geometry", "blocks_geometry") \
                            .withColumnRenamed("properties", "blocks_properties")

    # Keep every (block, pipeline) pair for which any spatial predicate holds.
    # NOTE(review): ST_Touches and ST_Overlaps are special cases of
    # ST_Intersects, so those two terms look redundant - confirm and simplify.
    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        expr("ST_Intersects(dataset_geometry, blocks_geometry)") |
        expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))") |
        expr("ST_Touches(dataset_geometry, blocks_geometry)") |
        expr("ST_Overlaps(dataset_geometry, blocks_geometry)")
    )

    dataset_prefix = lit(dataset + "_")

    # The pipeline features have no NAME property (only FID, Opername,
    # Pipename, ...), and this dataset's Node_ID is built from FID. Looking up
    # NAME in the properties map yields NULL, which made every edge Subject/Object
    # NULL. Use FID so the edge endpoints match the node ids written for this dataset.
    pipeline_node_id = concat(dataset_prefix, col("dataset_properties.FID"))

    # pipeline --isPartOf--> block
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        pipeline_node_id.alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        col("blocks_properties.NAME").alias("Object")
    )

    # block --Contains--> pipeline (built, but currently not written anywhere)
    df_contains_relationships = df_blocks_partOf_dataset.select(
        col("blocks_properties.NAME").alias("Subject"),
        lit("Contains").alias("Relationship"),
        pipeline_node_id.alias("Object")
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Updating the base graph is disabled for now:
    # append_to_csv(df_contains_relationships, base_graph_path)

                                                                                

# Oil Refineries

In [16]:
dataset = 'OilRefineries'

# Output locations for this dataset's node and edge CSVs.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

df = get_file_to_df(f'{dataset}.geojson')

# Sample feature, showing which properties are available:
# "properties": {
#     "OBJECTID": 18,
#     "REF_ID": "REF220020",
#     "NAME": "LAKE CHARLES",
#     "ADDRESS": "1601 HWY 108 E",
#     "CITY": "SULPHUR",
#     "STATE": "LA",
#     "ZIP": "70665",
#     "ZIP4": "NOT AVAILABLE",
#     "TELEPHONE": "(337) 708-8431",
#     "TYPE": "MODERN DEEP-CONVERSION FACILITY",
#     "STATUS": "IN SERVICE",
#     "POPULATION": 1183,
#     "COUNTY": "CALCASIEU",
#     "COUNTYFIPS": "22019",
#     "COUNTRY": "USA",
#     "LATITUDE": 30.17866697000005,
#     "LONGITUDE": -93.33023517799995,
#     "NAICS_CODE": "324110",
#     "NAICS_DESC": "PETROLEUM REFINERIES",
#     "SOURCE": "EIA-820; EPA TRI",
#     "SOURCEDATE": "2017/01/01 00:00:00",
#     "VAL_METHOD": "IMAGERY/OTHER",
#     "VAL_DATE": "2018/01/31 00:00:00",
#     "WEBSITE": "http://citgorefining.com",
#     "OWNER": "PDV AMERICA INC",
#     "OPERNAME": "CITGO PETROLEUM CORP",
#     "RMP_ID": "55717",
#     "EPA_ID": "100000140199",
#     "POSREL": "WITHIN 166 FEET",
#     "CAPACITY": 425000,
#     "US_RANK": 6,
#     "CRUDE": 425000,
#     "VACDIST": 230000,
#     "COKING": 85410,
#     "THERMALOP": 0,
#     "CATCRACK": 143000,
#     "CATREFORM": 103035,
#     "CATHYDCRCK": 46000,
#     "CATHYDTRT": 398200,
#     "ALKY": 26400,
#     "POLDIM": 0,
#     "AROMATIC": 20900,
#     "ISOMER": 28000,
#     "LUBES": 0,
#     "OXYGENATES": 0,
#     "HYDRGN": 0,
#     "COKE": 32820,
#     "SULFUR": 717,
#     "ASPHALT": 0
# }


dataset_prefix = lit(dataset + "_")

# Attach the node columns (Node_ID is the prefixed foreign id, here NAME),
# then drop the raw geometry and the properties map.
df_nodes = (
    df
    .withColumn("Node_ID", concat(dataset_prefix, col("properties.NAME")))
    .withColumn("Type", lit("OilRefineries"))
    .withColumn("CAPACITY", col("properties.CAPACITY"))
    .drop("geometry", "properties")
)

df_nodes.show(n=5)

create_csv(df_nodes, node_path)

Processing file: OilRefineries.geojson
+-------------+--------------------+--------+
|         Type|             Node_ID|CAPACITY|
+-------------+--------------------+--------+
|OilRefineries|OilRefineries_LAK...|  425000|
|OilRefineries|OilRefineries_PAS...|  112229|
|OilRefineries|OilRefineries_TUL...|   70300|
|OilRefineries|OilRefineries_CHA...|  190000|
|OilRefineries|OilRefineries_WIL...|  139000|
+-------------+--------------------+--------+
only showing top 5 rows



In [17]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
edge_fields = [
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
]
schema = StructType(edge_fields)

# Empty DataFrame that carries only the edge schema.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

# Replace any existing edge output with the empty, header-enabled frame.
empty_df.write.mode('overwrite').option('header', True).csv(edge_path)

In [18]:
base_graph_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Dataset side of the join, with geometry/properties renamed so they stay
# distinguishable from the blocks' columns of the same name.
df_dataset = (
    df
    .withColumnRenamed("geometry", "dataset_geometry")
    .withColumnRenamed("properties", "dataset_properties")
)

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = (
        df_blocks
        .withColumnRenamed("geometry", "blocks_geometry")
        .withColumnRenamed("properties", "blocks_properties")
    )

    # A (block, refinery) pair is kept when any one of these predicates holds.
    intersects = expr("ST_Intersects(dataset_geometry, blocks_geometry)")
    centroid_in_block = expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))")
    touches = expr("ST_Touches(dataset_geometry, blocks_geometry)")
    overlaps = expr("ST_Overlaps(dataset_geometry, blocks_geometry)")

    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        intersects | centroid_in_block | touches | overlaps
    )

    dataset_prefix = lit(dataset + "_")
    dataset_node_id = concat(dataset_prefix, col("dataset_properties.NAME"))
    block_name = col("blocks_properties.NAME")

    # refinery --isPartOf--> block
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        dataset_node_id.alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        block_name.alias("Object"),
    )

    # block --Contains--> refinery (built, but currently not written anywhere)
    df_contains_relationships = df_blocks_partOf_dataset.select(
        block_name.alias("Subject"),
        lit("Contains").alias("Relationship"),
        dataset_node_id.alias("Object"),
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Updating the base graph is disabled for now:
    # append_to_csv(df_contains_relationships, base_graph_path)

                                                                                

# NaturalGasStorageFacilities

In [19]:
dataset = 'NaturalGasStorageFacilities'

# Output locations for this dataset's node and edge CSVs.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

df = get_file_to_df(f'{dataset}.geojson')

# Sample feature, showing which properties are available:
# "properties": {
#     "FID": 12,
#     "STFID": "STF190012",
#     "NAME": "COLUMBUS CITY",
#     "ADDRESS": "120TH STREET/S AVE",
#     "CITY": "COLUMBUS CITY",
#     "STATE": "IA",
#     "ZIP": "52738",
#     "ZIP4": "NOT AVAILABLE",
#     "TELEPHONE": "NOT AVAILABLE",
#     "TYPE": "AQUIFER",
#     "STATUS": "ACTIVE",
#     "POPULATION": -999,
#     "COUNTY": "LOUISA",
#     "COUNTYFIPS": "19115",
#     "COUNTRY": "USA",
#     "LATITUDE": 41.234864,
#     "LONGITUDE": -91.350155,
#     "NAICS_CODE": "486210",
#     "NAICS_DESC": "STORAGE OF NATURAL GAS",
#     "SOURCE": "EIA, IMAGERY",
#     "SOURCEDATE": "2018/12/01 00:00:00",
#     "VAL_METHOD": "IMAGERY/OTHER",
#     "VAL_DATE": "2019/04/10 00:00:00",
#     "WEBSITE": "http://www.kindermorgan.com/",
#     "EPAID": "155321",
#     "OWNER": "KINDER MORGAN (NATURAL GAS PIPELINE CO OF AMERICA)",
#     "OPERATOR": "NATURAL GAS PIPELINE CO OF AMERICA",
#     "POSREL": "EXCEEDS 1 MILE",
#     "OWNERPCT": 100,
#     "MAXDEL": 175000,
#     "WORKCAP": 16685000,
#     "BASEGAS": 37700000,
#     "TOTALCAP": 54400193,
#     "REGION": "MIDWEST REGION",
#     "PROPMAX": -999,
#     "PROPWORK": -999,
#     "PROPTOTAL": -999,
#     "RESERVNAME": "GALESVILLE MT. SIMON ST. PETER",
#     "SEC_NAICS": "NOT APPLICABLE",
#     "SEC_N_DESC": "NOT APPLICABLE"
# }


dataset_prefix = lit(dataset + "_")

# Attach the node columns (Node_ID is the prefixed foreign id, here NAME),
# then drop the raw geometry and the properties map.
df_nodes = (
    df
    .withColumn("Node_ID", concat(dataset_prefix, col("properties.NAME")))
    .withColumn("Type", lit("NaturalGasStorageFacilities"))
    .withColumn("Total Capacity", col("properties.TOTALCAP"))
    .drop("geometry", "properties")
)

df_nodes.show(n=5)

create_csv(df_nodes, node_path)

Processing file: NaturalGasStorageFacilities.geojson
+-------------+--------------------+--------------+
|         Type|             Node_ID|Total Capacity|
+-------------+--------------------+--------------+
|OilRefineries|NaturalGasStorage...|      86000000|
|OilRefineries|NaturalGasStorage...|      54400193|
|OilRefineries|NaturalGasStorage...|        320340|
|OilRefineries|NaturalGasStorage...|        654231|
|OilRefineries|NaturalGasStorage...|      46854000|
+-------------+--------------------+--------------+
only showing top 5 rows



In [20]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
edge_fields = [
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
]
schema = StructType(edge_fields)

# Empty DataFrame that carries only the edge schema.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

# Replace any existing edge output with the empty, header-enabled frame.
empty_df.write.mode('overwrite').option('header', True).csv(edge_path)

In [21]:
base_graph_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Dataset side of the join, with geometry/properties renamed so they stay
# distinguishable from the blocks' columns of the same name.
df_dataset = (
    df
    .withColumnRenamed("geometry", "dataset_geometry")
    .withColumnRenamed("properties", "dataset_properties")
)

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = (
        df_blocks
        .withColumnRenamed("geometry", "blocks_geometry")
        .withColumnRenamed("properties", "blocks_properties")
    )

    # A (block, facility) pair is kept when any one of these predicates holds.
    intersects = expr("ST_Intersects(dataset_geometry, blocks_geometry)")
    centroid_in_block = expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))")
    touches = expr("ST_Touches(dataset_geometry, blocks_geometry)")
    overlaps = expr("ST_Overlaps(dataset_geometry, blocks_geometry)")

    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        intersects | centroid_in_block | touches | overlaps
    )

    dataset_prefix = lit(dataset + "_")
    dataset_node_id = concat(dataset_prefix, col("dataset_properties.NAME"))
    block_name = col("blocks_properties.NAME")

    # facility --isPartOf--> block
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        dataset_node_id.alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        block_name.alias("Object"),
    )

    # block --Contains--> facility (built, but currently not written anywhere)
    df_contains_relationships = df_blocks_partOf_dataset.select(
        block_name.alias("Subject"),
        lit("Contains").alias("Relationship"),
        dataset_node_id.alias("Object"),
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Updating the base graph is disabled for now:
    # append_to_csv(df_contains_relationships, base_graph_path)

                                                                                

# NaturalGasProcessingPlants

In [22]:
dataset = 'NaturalGasProcessingPlants'

# Output locations for this dataset's node and edge CSVs.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

df = get_file_to_df(f'{dataset}.geojson')

# Sample feature, showing which properties are available:
# "properties": {
#     "OBJECTID": 5,
#     "NGPPID": "NGPP010177",
#     "NAME": "DOGWOOD OAKS PLANT",
#     "ADDRESS": "21680 HWY 41",
#     "CITY": "BREWTON",
#     "STATE": "AL",
#     "ZIP": "36426",
#     "ZIP4": "NOT AVAILABLE",
#     "TELEPHONE": "(251) 248-2903",
#     "TYPE": "NATURAL GAS PROCESSING PLANT",
#     "STATUS": "ACTIVE",
#     "POPULATION": 12,
#     "COUNTY": "ESCAMBIA",
#     "COUNTYFIPS": "01053",
#     "COUNTRY": "USA",
#     "LATITUDE": 31.243471,
#     "LONGITUDE": -87.187836,
#     "NAICS_CODE": "211130",
#     "NAICS_DESC": "NATURAL GAS EXTRACTION",
#     "SOURCE": "EPA RISK MANAGEMENT PLAN (RMP) - THE RIGHT-TO-KNOW NETWORK",
#     "SOURCEDATE": "2013/03/22 00:00:00",
#     "VAL_METHOD": "IMAGERY/OTHER",
#     "VAL_DATE": "2015/06/17 00:00:00",
#     "WEBSITE": "www.plainsallamerican.com/",
#     "FACID": "100000218356",
#     "COMPNAME": "PLAINS GAS SOLUTIONS, LLC",
#     "POSREL": "WITHIN 40 FEET",
#     "OPERATOR": "CDM MAX, L.L.C. (PLAINS GAS SOLUTIONS, LLC)",
#     "OPERADDR": "333 CLAY STREET, SUITE 1600",
#     "OPERCITY": "HOUSTON",
#     "OPERSTATE": "TX",
#     "OPERCNTRY": "USA",
#     "OPERZIP": "77002",
#     "OPERPHONE": "(251) 248-2903",
#     "OPERURL": "www.plainsallamerican.com/about-us/subsidiary-websites/plains-gas-solutions/facilities",
#     "GASCAP": 4,
#     "PROCAMTBLS": 186840,
#     "BASIN": "GULF COAST COAL REGION",
#     "PLANTFLOW": 4,
#     "BTUCONTENT": 1000,
#     "GASSTORCAP": -999,
#     "LIQSTORCAP": 1000,
#     "RMP_ID": "1000032802",
#     "EPA_ID": "110055375883"
# }


dataset_prefix = lit(dataset + "_")

# Attach the node columns (Node_ID is the prefixed foreign id, here NAME),
# then drop the raw geometry and the properties map.
df_nodes = (
    df
    .withColumn("Node_ID", concat(dataset_prefix, col("properties.NAME")))
    .withColumn("Type", lit("NaturalGasProcessingPlants"))
    .withColumn("Plant Flow", col("properties.PLANTFLOW"))
    .drop("geometry", "properties")
)

df_nodes.show(n=5)

create_csv(df_nodes, node_path)

Processing file: NaturalGasProcessingPlants.geojson
+-------------+--------------------+----------+
|         Type|             Node_ID|Plant Flow|
+-------------+--------------------+----------+
|OilRefineries|NaturalGasProcess...|         6|
|OilRefineries|NaturalGasProcess...|         4|
|OilRefineries|NaturalGasProcess...|      -999|
|OilRefineries|NaturalGasProcess...|        11|
|OilRefineries|NaturalGasProcess...|        11|
+-------------+--------------------+----------+
only showing top 5 rows



In [23]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
edge_fields = [
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
]
schema = StructType(edge_fields)

# Empty DataFrame that carries only the edge schema.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

# Replace any existing edge output with the empty, header-enabled frame.
empty_df.write.mode('overwrite').option('header', True).csv(edge_path)

In [24]:
base_graph_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Dataset side of the join, with geometry/properties renamed so they stay
# distinguishable from the blocks' columns of the same name.
df_dataset = (
    df
    .withColumnRenamed("geometry", "dataset_geometry")
    .withColumnRenamed("properties", "dataset_properties")
)

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = (
        df_blocks
        .withColumnRenamed("geometry", "blocks_geometry")
        .withColumnRenamed("properties", "blocks_properties")
    )

    # A (block, plant) pair is kept when any one of these predicates holds.
    intersects = expr("ST_Intersects(dataset_geometry, blocks_geometry)")
    centroid_in_block = expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))")
    touches = expr("ST_Touches(dataset_geometry, blocks_geometry)")
    overlaps = expr("ST_Overlaps(dataset_geometry, blocks_geometry)")

    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        intersects | centroid_in_block | touches | overlaps
    )

    dataset_prefix = lit(dataset + "_")
    dataset_node_id = concat(dataset_prefix, col("dataset_properties.NAME"))
    block_name = col("blocks_properties.NAME")

    # plant --isPartOf--> block
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        dataset_node_id.alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        block_name.alias("Object"),
    )

    # block --Contains--> plant (built, but currently not written anywhere)
    df_contains_relationships = df_blocks_partOf_dataset.select(
        block_name.alias("Subject"),
        lit("Contains").alias("Relationship"),
        dataset_node_id.alias("Object"),
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Updating the base graph is disabled for now:
    # append_to_csv(df_contains_relationships, base_graph_path)

                                                                                

# GeographicRegions

In [25]:
dataset = 'GeographicRegions'

# Output locations for this dataset's node and edge CSVs.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

df = get_file_to_df(f'{dataset}.geojson')

# Sample feature, showing which properties are available (note the lower-case keys):
# "properties": { 
#     "scalerank": 7.0, 
#     "featurecla": "Island", 
#     "name": "Adak", 
#     "namealt": null, 
#     "region": "North America", 
#     "subregion": null 
# }

dataset_prefix = lit(dataset + "_")

# Attach the node columns (Node_ID is the prefixed foreign id, here name),
# then drop the raw geometry and the properties map.
df_nodes = (
    df
    .withColumn("Node_ID", concat(dataset_prefix, col("properties.name")))
    .withColumn("Type", lit("GeographicRegions"))
    .withColumn("Scale Rank", col("properties.scalerank"))
    .drop("geometry", "properties")
)

df_nodes.show(n=5)

create_csv(df_nodes, node_path)

Processing file: GeographicRegions.geojson
+-----------------+--------------------+----------+
|             Type|             Node_ID|Scale Rank|
+-----------------+--------------------+----------+
|GeographicRegions|GeographicRegions...|       7.0|
|GeographicRegions|GeographicRegions...|       7.0|
|GeographicRegions|GeographicRegions...|       7.0|
|GeographicRegions|GeographicRegions...|       7.0|
|GeographicRegions|GeographicRegions...|       7.0|
+-----------------+--------------------+----------+
only showing top 5 rows



In [26]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
edge_fields = [
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
]
schema = StructType(edge_fields)

# Empty DataFrame that carries only the edge schema.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

# Replace any existing edge output with the empty, header-enabled frame.
empty_df.write.mode('overwrite').option('header', True).csv(edge_path)

In [27]:
# Destination of the (currently disabled) "Contains" export at the end of the loop.
base_graph_path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Rename the dataset columns so they stay distinguishable from the block
# columns once both sides are combined.
df_dataset = (
    df
    .withColumnRenamed("geometry", "dataset_geometry")
    .withColumnRenamed("properties", "dataset_properties")
)

# Loop-invariant, so it is built once rather than once per blocks file.
dataset_prefix = lit(dataset + "_")

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = (
        df_blocks
        .withColumnRenamed("geometry", "blocks_geometry")
        .withColumnRenamed("properties", "blocks_properties")
    )

    # Pair every block with every dataset feature and keep the spatially
    # related pairs. ST_Touches / ST_Overlaps are deliberately not listed:
    # both imply ST_Intersects, so OR-ing them in can never add a row. The
    # centroid test is kept because a feature's centroid can fall inside a
    # block that the feature itself does not intersect.
    # NOTE(review): an OR of predicates is probably not picked up by Sedona's
    # optimized spatial join, so this likely executes as a real cartesian
    # product - confirm with .explain() if the cell is slow.
    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        expr("ST_Intersects(dataset_geometry, blocks_geometry)") |
        expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))")
    )

    # Edge: dataset feature --isPartOf--> block.
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        concat(dataset_prefix, col("dataset_properties.name")).alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        col("blocks_properties.NAME").alias("Object")
    )

    # Inverse edge: block --Contains--> dataset feature. Only consumed by the
    # disabled base-graph append below; kept so that line can be re-enabled.
    df_contains_relationships = df_blocks_partOf_dataset.select(
        col("blocks_properties.NAME").alias("Subject"),
        lit("Contains").alias("Relationship"),
        concat(dataset_prefix, col("dataset_properties.name")).alias("Object")
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Update the base graph
    # append_to_csv(df_contains_relationships, base_graph_path)

[Stage 542:>                                                        (0 + 1) / 1]

# PowerPlants

In [None]:
# Name of the dataset handled by this section; it is interpolated into every
# path below and into the GeoJSON file name.
dataset = 'PowerPlants'

# HDFS output locations for this dataset's node CSV and edge CSV.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

# Full HDFS path of the input GeoJSON. It is not referenced again in this cell.
# NOTE(review): confirm whether get_file_to_df depends on this global.
data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

# Load the dataset through the notebook's get_file_to_df helper. The rest of
# this section reads the "geometry" and "properties" columns of the result.
df = get_file_to_df(f'{dataset}.geojson')

# "properties": {
#     "PGM_SYS_AC": "EIA-860",
#     "PGM_SYS_ID": "124",
#     "REGISTRY_I": "110002569569",
#     "PRIMARY_NA": "TUCSON ELECTRIC POWER DEMOSS-PETRIE DSL",
#     "LOCATION_A": "2501 NORTH FLOWING WELLS ROAD",
#     "CITY_NAME": "TUCSON",
#     "COUNTY_NAM": "PIMA",
#     "STATE_CODE": "AZ",
#     "POSTAL_COD": "85705-4015",
#     "FEDERAL_FA": "N",
#     "TRIBAL_LAN": "",
#     "DATA_QUALI": "V",
#     "LAST_REPOR": "",
#     "CREATE_DAT": "2000-03-01",
#     "UPDATE_DAT": "2014-04-30",
#     "LATITUDE83": 32.2523193359375,
#     "LONGITUDE8": -110.99149322509766,
#     "REF_POINT_": "CENTER OF A FACILITY OR STATION",
#     "DERIVED_HU": "15050301",
#     "DERIVED_WB": "150503010906",
#     "DERIVED_CB": "040190045044008",
#     "DERIVED_CD": "02",
#     "OZONE_8HR_": "",
#     "PB_2008_AR": "",
#     "PM25_1997_": "",
#     "PM25_2006_": "",
#     "OZONE_8H_1": "",
#     "UTILITY_ID": "24211",
#     "UTILITY_NA": "Tucson Electric Power Co",
#     "PLANT_CODE": "124",
#     "PLANT_NAME": "Demoss Petrie",
#     "GENERATOR_": "GT2",
#     "PRIME_MOVE": "GT",
#     "STATUS": "OP",
#     "NAMEPLATE": 85,
#     "SUMMER_CAP": 72.19999694824219,
#     "WINTER_CAP": 83.30000305175781,
#     "UNIT_CODE": "",
#     "OPERATING_": "6",
#     "OPERATIN_1": "2001",
#     "ENERGY_SOU": "NG",
#     "ENERGY_S_1": "",
#     "ENERGY_S_2": "",
#     "ENERGY_S_3": "",
#     "ENERGY_S_4": "",
#     "ENERGY_S_5": "",
#     "MULTIPLE_F": "N",
#     "DELIVER_PO": "Y",
#     "SYNCHRONIZ": "",
#     "OWNERSHIP": "S",
#     "TURBINES": ".",
#     "COGENERATO": "N",
#     "SECTOR_NAM": "Electric Utility",
#     "SECTOR": "1",
#     "TOPPING_BO": "",
#     "DUCT_BURNE": "N",
#     "PLANNED_MO": "N",
#     "PLANNED_UP": ".",
#     "PLANNED__1": ".",
#     "PLANNED__2": ".",
#     "PLANNED__3": ".",
#     "PLANNED_DE": ".",
#     "PLANNED__4": ".",
#     "PLANNED__5": ".",
#     "PLANNED__6": ".",
#     "PLANNED_NE": "",
#     "PLANNED_EN": "",
#     "PLANNED_RE": ".",
#     "PLANNED__7": ".",
#     "OTHER_MODS": "",
#     "OTHER_MOD_": ".",
#     "OTHER_MO_1": ".",
#     "PLANNED__8": ".",
#     "PLANNED__9": ".",
#     "SFG_SYSTEM": "N",
#     "PULVERIZED": "",
#     "FLUIDIZED_": "",
#     "SUBCRITICA": "",
#     "SUPERCRITI": "",
#     "ULTRASUPER": "",
#     "CARBONCAPT": "",
#     "STARTUP_SO": "",
#     "STARTUP__1": "",
#     "STARTUP__2": "",
#     "STARTUP__3": "",
#     "ENERGY_SRC": "Natural Gas",
#     "ENERGY_S_6": ""
# }

# Node IDs are namespaced with the dataset name: "<dataset>_<PRIMARY_NA>".
dataset_prefix = lit(f"{dataset}_")

# Build the node table: Node_ID is the foreign id used by the edge files, the
# remaining columns are attributes lifted out of the nested GeoJSON properties.
# The raw geometry/properties columns are discarded once the attributes exist.
# NOTE(review): the sample record carries a GENERATOR_ field, so several rows
# may share one PRIMARY_NA and therefore one Node_ID - confirm this is intended.
df_nodes = (
    df
    .withColumn("Node_ID", concat(dataset_prefix, col("properties.PRIMARY_NA")))
    .withColumn("Type", lit("PowerPlants"))
    .withColumn("Summer Capacity", col("properties.SUMMER_CAP"))
    .withColumn("Winter Capacity", col("properties.WINTER_CAP"))
    .drop("geometry", "properties")
)

# Preview a handful of rows before persisting.
df_nodes.show(5)

create_csv(df_nodes, node_path)

In [None]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
])

# Replace whatever currently lives at edge_path with an empty DataFrame of
# that schema; the following cell adds relationships to the same path through
# append_to_csv.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
empty_df.write.mode("overwrite").option("header", True).csv(edge_path)

In [None]:
# Destination of the (currently disabled) "Contains" export at the end of the loop.
base_graph_path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Rename the dataset columns so they stay distinguishable from the block
# columns once both sides are combined.
df_dataset = (
    df
    .withColumnRenamed("geometry", "dataset_geometry")
    .withColumnRenamed("properties", "dataset_properties")
)

# Loop-invariant, so it is built once rather than once per blocks file.
dataset_prefix = lit(dataset + "_")

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = (
        df_blocks
        .withColumnRenamed("geometry", "blocks_geometry")
        .withColumnRenamed("properties", "blocks_properties")
    )

    # Pair every block with every dataset feature and keep the spatially
    # related pairs. ST_Touches / ST_Overlaps are deliberately not listed:
    # both imply ST_Intersects, so OR-ing them in can never add a row. The
    # centroid test is kept because a feature's centroid can fall inside a
    # block that the feature itself does not intersect.
    # NOTE(review): an OR of predicates is probably not picked up by Sedona's
    # optimized spatial join, so this likely executes as a real cartesian
    # product - confirm with .explain() if the cell is slow.
    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        expr("ST_Intersects(dataset_geometry, blocks_geometry)") |
        expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))")
    )

    # Edge: dataset feature --isPartOf--> block.
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        concat(dataset_prefix, col("dataset_properties.PRIMARY_NA")).alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        col("blocks_properties.NAME").alias("Object")
    )

    # Inverse edge: block --Contains--> dataset feature. Only consumed by the
    # disabled base-graph append below; kept so that line can be re-enabled.
    df_contains_relationships = df_blocks_partOf_dataset.select(
        col("blocks_properties.NAME").alias("Subject"),
        lit("Contains").alias("Relationship"),
        concat(dataset_prefix, col("dataset_properties.PRIMARY_NA")).alias("Object")
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Update the base graph
    # append_to_csv(df_contains_relationships, base_graph_path)

# NaturalGasPipelines

In [None]:
# Name of the dataset handled by this section; it is interpolated into every
# path below and into the GeoJSON file name.
dataset = 'NaturalGasPipelines'

# HDFS output locations for this dataset's node CSV and edge CSV.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

# Full HDFS path of the input GeoJSON. It is not referenced again in this cell.
# NOTE(review): confirm whether get_file_to_df depends on this global.
data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

# Load the dataset through the notebook's get_file_to_df helper. The rest of
# this section reads the "geometry" and "properties" columns of the result.
df = get_file_to_df(f'{dataset}.geojson')

# "properties": {
#     "TYPEPIPE": "Intrastate",
#     "Operator": "Crosstex Texas Systems",
#     "Shape_Leng": 0.00187974387,
#     "Shape__Len": 240.3441469695
# }

# Node IDs are namespaced with the dataset name: "<dataset>_<Operator>".
dataset_prefix = lit(f"{dataset}_")

# Build the node table: Node_ID is the foreign id used by the edge files, the
# remaining columns are attributes lifted out of the nested GeoJSON properties.
# The raw geometry/properties columns are discarded once the attributes exist.
# NOTE(review): Node_ID is derived from the operator name, so every pipeline
# feature of the same operator presumably gets the same Node_ID - confirm
# this is intended.
df_nodes = (
    df
    .withColumn("Node_ID", concat(dataset_prefix, col("properties.Operator")))
    .withColumn("Type", lit("NaturalGasPipelines"))
    .withColumn("Pipe Type", col("properties.TYPEPIPE"))
    .drop("geometry", "properties")
)

# Preview a handful of rows before persisting.
df_nodes.show(5)

create_csv(df_nodes, node_path)

In [None]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
])

# Replace whatever currently lives at edge_path with an empty DataFrame of
# that schema; the following cell adds relationships to the same path through
# append_to_csv.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
empty_df.write.mode("overwrite").option("header", True).csv(edge_path)

In [None]:
# Destination of the (currently disabled) "Contains" export at the end of the loop.
base_graph_path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Rename the dataset columns so they stay distinguishable from the block
# columns once both sides are combined.
df_dataset = (
    df
    .withColumnRenamed("geometry", "dataset_geometry")
    .withColumnRenamed("properties", "dataset_properties")
)

# Loop-invariant, so it is built once rather than once per blocks file.
dataset_prefix = lit(dataset + "_")

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = (
        df_blocks
        .withColumnRenamed("geometry", "blocks_geometry")
        .withColumnRenamed("properties", "blocks_properties")
    )

    # Pair every block with every dataset feature and keep the spatially
    # related pairs. ST_Touches / ST_Overlaps are deliberately not listed:
    # both imply ST_Intersects, so OR-ing them in can never add a row. The
    # centroid test is kept because a feature's centroid can fall inside a
    # block that the feature itself does not intersect.
    # NOTE(review): an OR of predicates is probably not picked up by Sedona's
    # optimized spatial join, so this likely executes as a real cartesian
    # product - confirm with .explain() if the cell is slow.
    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        expr("ST_Intersects(dataset_geometry, blocks_geometry)") |
        expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))")
    )

    # Edge: dataset feature --isPartOf--> block.
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        concat(dataset_prefix, col("dataset_properties.Operator")).alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        col("blocks_properties.NAME").alias("Object")
    )

    # Inverse edge: block --Contains--> dataset feature. Only consumed by the
    # disabled base-graph append below; kept so that line can be re-enabled.
    df_contains_relationships = df_blocks_partOf_dataset.select(
        col("blocks_properties.NAME").alias("Subject"),
        lit("Contains").alias("Relationship"),
        concat(dataset_prefix, col("dataset_properties.Operator")).alias("Object")
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Update the base graph
    # append_to_csv(df_contains_relationships, base_graph_path)

# Dams

In [None]:
# Name of the dataset handled by this section; it is interpolated into every
# path below and into the GeoJSON file name.
dataset = 'Dams'

# HDFS output locations for this dataset's node CSV and edge CSV.
node_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Nodes.csv"
edge_path = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/output/{dataset}Edges.csv"

# Full HDFS path of the input GeoJSON. It is not referenced again in this cell.
# NOTE(review): confirm whether get_file_to_df depends on this global.
data_directory = f"hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/{dataset}.geojson"

# Load the dataset through the notebook's get_file_to_df helper. The rest of
# this section reads the "geometry" and "properties" columns of the result.
df = get_file_to_df(f'{dataset}.geojson')

#  "properties": {
#     "OBJECTID": 9144,
#     "NGAID": "10130292",
#     "METLNKID": "",
#     "FEATTYPE": "POLYLINE",
#     "NAME": "EL PASO DAM NO 10",
#     "CITY": "EL PASO",
#     "STATE": "TX",
#     "COUNTY": "EL PASO",
#     "FIPS": "48141",
#     "DIRECTIONS": "",
#     "EMERGTITLE": "",
#     "EMERGPHONE": "",
#     "EMERGEXT": "",
#     "CONTDATE": "1899-11-30T00:00:00.000Z",
#     "CONTHOW": "",
#     "GEODATE": "2007-03-28T00:00:00.000Z",
#     "GEOHOW": "MANUAL",
#     "HSIPTHEMES": "CRITICAL INFRASTUCTURE, PDD-63; WATER SUPPLY; DAMS",
#     "SOURCE": "USACE",
#     "X": -106.4814352,
#     "Y": 31.7778363,
#     "QC_QA": "",
#     "RECORDID": "72827",
#     "OTHER_NAME": "",
#     "FORM_NAME": "",
#     "STATEID": "",
#     "NIDID": "TX07023",
#     "SECTION": "3106-432",
#     "RIVER": "OFF CH-RIO GRANDE",
#     "CITYAFFECT": "EL PASO",
#     "NIDSTATE": "TX",
#     "NIDCOUNTY": "EL PASO",
#     "DISTANCE": 0,
#     "OWN_TYPE": "L",
#     "PRIV_DAM": "",
#     "DAM_TYPE": "RE",
#     "CORE": "XX",
#     "FOUND": "U",
#     "PURPOSES": "C",
#     "YR_COMPL": "",
#     "YR_MOD": "",
#     "DAM_LENGTH": 0,
#     "DAM_HEIGHT": 30,
#     "STR_HEIGHT": 0,
#     "HYD_HEIGHT": 0,
#     "NID_HEIGHT": 30,
#     "MAX_DIS": 0,
#     "MAX_STOR": 24,
#     "NORMAL_STO": 0,
#     "NID_STOR": 24,
#     "SURF_AREA": 0,
#     "DRAIN_AREA": 0,
#     "HAZARD": "H",
#     "EAP": "N",
#     "INSP_DATE": "1996-07-17T00:00:00.000Z",
#     "INSP_FREQU": 0,
#     "ST_REG_DAM": "Y",
#     "ST_REG_AG": "",
#     "SPILL_TYPE": "U",
#     "SPILL_WIDT": 12,
#     "OUT_GATES": "",
#     "VOLUME": 0,
#     "NO_LOCKS": 0,
#     "LEN_LOCKS": 0,
#     "WID_LOCKS": 0,
#     "FED_FUND": "",
#     "FED_DESIGN": "",
#     "FED_CONSTR": "",
#     "FED_REG": "",
#     "FED_INSP": "",
#     "FED_OPER": "",
#     "FED_OWN": "",
#     "FED_OTHER": "",
#     "SOURCE_A": "TX",
#     "SUB_DATE": "20000401",
#     "URL_ADDRES": "HTTP://WWW.TCEQ.STATE.TX.US/",
#     "CONG_DIST": "TX16",
#     "SHAPE_Leng": 216.77816378547902
# }

# Node IDs are namespaced with the dataset name: "<dataset>_<NAME>".
dataset_prefix = lit(f"{dataset}_")

# Build the node table: Node_ID is the foreign id used by the edge files, the
# remaining columns are attributes lifted out of the nested GeoJSON properties.
# The raw geometry/properties columns are discarded once the attributes exist.
df_nodes = (
    df
    .withColumn("Node_ID", concat(dataset_prefix, col("properties.NAME")))
    .withColumn("Type", lit("Dams"))
    .withColumn("River", col("properties.RIVER"))
    .drop("geometry", "properties")
)

# Preview a handful of rows before persisting.
df_nodes.show(5)

create_csv(df_nodes, node_path)

In [None]:
# Edge rows are (Subject, Relationship, Object) triples of nullable strings.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("Subject", "Relationship", "Object")
])

# Replace whatever currently lives at edge_path with an empty DataFrame of
# that schema; the following cell adds relationships to the same path through
# append_to_csv.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
empty_df.write.mode("overwrite").option("header", True).csv(edge_path)

In [None]:
# Destination of the (currently disabled) "Contains" export at the end of the loop.
base_graph_path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Rename the dataset columns so they stay distinguishable from the block
# columns once both sides are combined.
df_dataset = (
    df
    .withColumnRenamed("geometry", "dataset_geometry")
    .withColumnRenamed("properties", "dataset_properties")
)

# Loop-invariant, so it is built once rather than once per blocks file.
dataset_prefix = lit(dataset + "_")

for blocks_file_name, df_blocks in blocks_dataframes.items():
    df_blocks = (
        df_blocks
        .withColumnRenamed("geometry", "blocks_geometry")
        .withColumnRenamed("properties", "blocks_properties")
    )

    # Pair every block with every dataset feature and keep the spatially
    # related pairs. ST_Touches / ST_Overlaps are deliberately not listed:
    # both imply ST_Intersects, so OR-ing them in can never add a row. The
    # centroid test is kept because a feature's centroid can fall inside a
    # block that the feature itself does not intersect.
    # NOTE(review): an OR of predicates is probably not picked up by Sedona's
    # optimized spatial join, so this likely executes as a real cartesian
    # product - confirm with .explain() if the cell is slow.
    df_blocks_partOf_dataset = df_blocks.crossJoin(df_dataset).where(
        expr("ST_Intersects(dataset_geometry, blocks_geometry)") |
        expr("ST_Contains(blocks_geometry, ST_Centroid(dataset_geometry))")
    )

    # Edge: dataset feature --isPartOf--> block.
    df_isPartOf_relationships = df_blocks_partOf_dataset.select(
        concat(dataset_prefix, col("dataset_properties.NAME")).alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        col("blocks_properties.NAME").alias("Object")
    )

    # Inverse edge: block --Contains--> dataset feature. Only consumed by the
    # disabled base-graph append below; kept so that line can be re-enabled.
    df_contains_relationships = df_blocks_partOf_dataset.select(
        col("blocks_properties.NAME").alias("Subject"),
        lit("Contains").alias("Relationship"),
        concat(dataset_prefix, col("dataset_properties.NAME")).alias("Object")
    )

    # Update the Edges.csv
    append_to_csv(df_isPartOf_relationships, edge_path)
    # Update the base graph
    # append_to_csv(df_contains_relationships, base_graph_path)