# **Knowledge Graph Generation**


### Load data into Spark

In [1]:
import sys
sys.path.append("/usr/local/python-env/py39/lib/python3.9/site-packages")

import pyspark
print(pyspark.__version__)

print(sys.executable)

3.5.1
/usr/bin/python3.9


### Initialze a SparkSession

Ensuring the pyspark library is being accessed from my local usr directory.


In [2]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.9'

In [3]:
import pkg_resources

sedona_version = pkg_resources.get_distribution("apache-sedona").version
print(f"Apache Sedona version: {sedona_version}")

Apache Sedona version: 1.5.1


In [4]:
print(os.environ['SPARK_HOME'])
print(os.environ['PYSPARK_PYTHON'])

/usr/local/spark/latest
/usr/bin/python3.9


In [5]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, lit,  split, expr
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, DateType
from pyspark.sql.functions import year 
from pyspark.sql import Window
from pyspark.sql.functions import sum as pyspark_sum
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.spark import *
import geopandas as gpd

Skipping SedonaKepler import, verify if keplergl is installed


## Now to make the app

In [6]:


spark = SparkSession \
    .builder \
    .appName('GeoSpatialQueries_Freddy') \
    .master('spark://columbus-oh.cs.colostate.edu:30800') \
    .config("spark.yarn.resourcemanager.address", "columbia.cs.colostate.edu:30799") \
    .config("spark.executor.memory", "3g") \
    .config("spark.executor.memoryOverhead", "512m") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", "500m") \
    .config("spark.serializer", KryoSerializer.getName) \
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
    .config('spark.jars.packages',
            'org.apache.sedona:sedona-spark-3.5_2.12:1.5.1,'
            'org.datasyslab:geotools-wrapper:1.5.1-28.2') \
    .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all') \
    .getOrCreate()

# Set log level to DEBUG
spark.sparkContext.setLogLevel("ERROR")

sedona = SedonaContext.create(spark)
SedonaRegistrator.registerAll(spark)

# create a logger
logger = spark._jvm.org.apache.log4j.LogManager.getLogger(__name__)
logger.info("Pyspark initialized...")

https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1
Ivy Default Cache set to: /s/chopin/a/grad/flarrieu/.ivy2/cache
The jars for the packages stored in: /s/chopin/a/grad/flarrieu/.ivy2/jars
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-022f1392-57aa-4c61-a8cc-bc896602d335;1.0
	confs: [default]


:: loading settings :: url = jar:file:/usr/local/spark/3.5.0-with-hadoop3.3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-spark-3.5_2.12;1.5.1 in central
	found org.apache.sedona#sedona-common;1.5.1 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.locationtech.jts#jts-core;1.19.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.locationtech.spatial4j#spatial4j;0.8 in central
	found com.google.geometry#s2-geometry;2.0.0 in central
	found com.google.guava#guava;25.1-jre in central
	found com.google.code.findbugs#jsr305;3.0.2 in user-list
	found org.checkerframework#checker-qual;2.0.0 in central
	found com.google.errorprone#error_prone_annotations;2.1.3 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.14 in central
	found com.uber#h3;4.1.1 in central
	found net.sf.geographiclib#GeographicLib-Java;1.52 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.2 in central
	found org.checkerframework#checker-qual;3.10.0 in central
	found com.google.err

## Load the datasets

In [7]:
# Import the necessary module from py4j to interact with JVM
from py4j.java_gateway import java_import

# Import the Path class from Hadoop. This class is used to handle file paths in Hadoop.
java_import(spark._jvm, 'org.apache.hadoop.fs.Path')

# Define a function to recursively get all .json and .geojson files in a directory and its subdirectories
def get_files_recursive(path):
    # Use the listStatus method of the FileSystem class to get an array of FileStatus objects
    # Each FileStatus object represents a file or directory in the given path
    file_status_arr = fs.listStatus(spark._jvm.Path(path))
    
    # Initialize an empty list to hold the file paths
    file_paths = []
    
    # Loop through each FileStatus object in the array
    for file_status in file_status_arr:
        # If the FileStatus object represents a directory
        if file_status.isDirectory():
            # Call the get_files_recursive function with the directory path
            # This is a recursive call, which means the function calls itself
            # Add the returned file paths to the file_paths list
            file_paths += get_files_recursive(file_status.getPath().toString())
        # If the FileStatus object represents a file that ends with .json or .geojson
        elif file_status.getPath().getName().endswith(('.json', '.geojson')):
            # Add the file path to the file_paths list
            file_paths.append(file_status.getPath().toString())
    
    print(file_paths)
    # Return the list of file paths
    return file_paths

In [8]:

# Initialize a Hadoop file system 
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

# Directory containing the files
json_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/"

# Define the schema for the GeoJSON data
geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"


# Get a list of the JSON and GeoJSON files in the directory and its subdirectories
json_files = get_files_recursive(json_directory)

# Create a dictionary to hold the DataFrames
json_dataset_dataframes = {}

# Define the current and desired EPSG codes
current_epsg = "EPSG:3857"  # Web Mercator
desired_epsg = "EPSG:4326"  # WGS84

# Load each JSON file into a DataFrame and store it in the dictionary
for file_path in json_files:
    file_name = file_path.split('/')[-1]
    
    # Print the file path
    print(f"Processing file: {file_path}")
    
    # Read the GeoJSON file using the defined schema using sedona into a spark dataframe
    df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)
    
    # Explode the features array to create a row for each feature and select the columns
    df = (df
        .select(F.explode("features").alias("features"))
        .select("features.*")
        # Use Sedona's ST_GeomFromGeoJSON function to convert the geometry string to a geometry object
        .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )
    
    json_dataset_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alabama.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alaska.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/AmericanSamoa.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arizona.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arkansas.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/California.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Colorado.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/CommonwealthoftheNorthernMarianaIslands.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Connecticut.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Delaware.geojson', 'hdfs://columbus-oh.cs.colostate

# **Helper Functions**

In [9]:
def append_to_csv(df: DataFrame):
    path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"
    df.write.csv(path=path, mode='append', header=True)

In [10]:
def load_and_display_graph() -> DataFrame:
    hdfs_path =  "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"
    
    # Define the schema for the graph data
    schema = StructType([
        StructField("Subject", StringType(), True),
        StructField("Relationship", StringType(), True),
        StructField("Object", StringType(), True)
    ])
    
    # Load the CSV file into a DataFrame with the defined schema
    graph_df = spark.read.csv(hdfs_path, header=True, schema=schema)
    
    # Show a sample of the DataFrame
    graph_df.show(truncate=False)
    
    return graph_df

# **Add Continents**

In [11]:
from pyspark.sql import functions as F

# Create a DataFrame for Earth with a single row
earth_df = spark.createDataFrame([("Earth",)], ["name"])

# Assuming df_continents already loaded
df_continents = json_dataset_dataframes['WorldContinents.geojson']

# Add columns to df_continents that establishes the 'partOf' relationship to Earth
df_continents = df_continents.withColumn("Subject", F.col("properties.CONTINENT"))
df_continents = df_continents.withColumn("Relationship", F.lit("partOf"))
df_continents = df_continents.withColumn("Object", F.lit("Earth"))

df_continents = df_continents.withColumn("Object", F.col("properties.CONTINENT"))
df_continents = df_continents.withColumn("Relationship", F.lit("Contains"))
df_continents = df_continents.withColumn("Subject", F.lit("Earth"))

# Select the new structured columns to form the triple
df_to_save = df_continents.select("Subject", "Relationship", "Object")

# Show DataFrame to verify the structure
df_to_save.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-------+------------+-------------+
|Subject|Relationship|       Object|
+-------+------------+-------------+
|  Earth|    Contains|       Africa|
|  Earth|    Contains|         Asia|
|  Earth|    Contains|    Australia|
|  Earth|    Contains|      Oceania|
|  Earth|    Contains|South America|
|  Earth|    Contains|   Antarctica|
|  Earth|    Contains|       Europe|
|  Earth|    Contains|North America|
+-------+------------+-------------+



                                                                                

In [12]:
# Define the HDFS path for the output CSV file
hdfs_output_path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv"

# Try to save the DataFrame to HDFS
try:
    df_to_save.write.csv(path=hdfs_output_path, mode="overwrite", header=True)
    print(f"CSV file has been successfully saved to HDFS at {hdfs_output_path}.")
except Exception as e:
    print("Failed to write CSV file to HDFS:", e)

CSV file has been successfully saved to HDFS at hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/base_graph.csv.


                                                                                

# **Add Countries**

In [13]:
df_continents = json_dataset_dataframes['WorldContinents.geojson']
df_countries = json_dataset_dataframes['CountryTerritories.geojson']

In [14]:
df_continents = df_continents.withColumnRenamed("geometry", "continents_geometry")
df_countries = df_countries.withColumnRenamed("geometry", "country_geometry")

df_continents = df_continents.withColumnRenamed("properties", "continent_properties")
df_countries = df_countries.withColumnRenamed("properties", "country_properties")

df_country_continent_instersects = df_continents.crossJoin(df_countries).where(
    expr("ST_Contains(continents_geometry, ST_Centroid(country_geometry))")
)


In [15]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, lit

df_isPartOf_relationships = df_country_continent_instersects.select(
    col("country_properties.name").alias("Subject"),
    lit("isPartOf").alias("Relationship"),
    col("continent_properties.CONTINENT").alias("Object")
)

df_contains_relationships = df_country_continent_instersects.select(
    col("continent_properties.CONTINENT").alias("Subject"),
    lit("Contains").alias("Relationship"),
    col("country_properties.name").alias("Object")
)

In [16]:
append_to_csv(df_isPartOf_relationships)

                                                                                

In [17]:
append_to_csv(df_contains_relationships)

                                                                                

# **Add States**

In [18]:
df_countries = json_dataset_dataframes['CountryTerritories.geojson']
df_states = json_dataset_dataframes['States.geojson']

In [19]:
df_countries = df_countries.withColumnRenamed("geometry", "country_geometry")
df_states = df_states.withColumnRenamed("geometry", "state_geometry")

df_countries = df_countries.withColumnRenamed("properties", "country_properties")
df_states = df_states.withColumnRenamed("properties", "state_properties")

# Now perform the spatial join using the renamed columns
df_state_partOf_country = df_countries.crossJoin(df_states).where(
    F.expr("ST_Contains(country_geometry, ST_Centroid(state_geometry))")
)

In [20]:
df_isPartOf_relationships = df_state_partOf_country.select(
    col("state_properties.NAME").alias("Subject"),
    lit("isPartOf").alias("Relationship"),
    col("country_properties.name").alias("Object")
)

df_contains_relationships = df_state_partOf_country.select(
    col("country_properties.name").alias("Subject"),
    lit("Contains").alias("Relationship"),
    col("state_properties.NAME").alias("Object")
)

In [21]:
append_to_csv(df_isPartOf_relationships)

                                                                                

In [22]:
append_to_csv(df_contains_relationships)

                                                                                

# **Add Counties**

In [23]:
# Initialize a Hadoop file system 
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

counties_dataframes = {}

# Directory containing the files
json_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/"

# Define the schema for the GeoJSON data
geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

json_files = get_files_recursive(json_directory)

# Load each JSON file into a DataFrame and store it in the dictionary
for file_path in json_files:
    file_name = file_path.split('/')[-1]
    
    # Print the file path
    print(f"Processing file: {file_name}")
    
    # Read the GeoJSON file using the defined schema using sedona into a spark dataframe
    df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)
    
    # Explode the features array to create a row for each feature and select the columns
    df = (df
        .select(F.explode("features").alias("features"))
        .select("features.*")
        # Use Sedona's ST_GeomFromGeoJSON function to convert the geometry string to a geometry object
        .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )
    
    counties_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/AlabamaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/AlaskaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/AmericanSamoaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/ArizonaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/ArkansasCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/CaliforniaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/ColoradoCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/CommonwealthoftheNorthernMarianaIslandsCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/ConnecticutCounties.geojson', 'hdfs://columbus-oh.cs.colostate.ed

In [24]:
df_states = json_dataset_dataframes['States.geojson']
for state in counties_dataframes.keys():
    
    print(state)
    df_counties = counties_dataframes[state]

    df_states = df_states.withColumnRenamed("geometry", "state_geometry")
    df_counties = df_counties.withColumnRenamed("geometry", "county_geometry")

    df_states = df_states.withColumnRenamed("properties", "state_properties")
    df_counties = df_counties.withColumnRenamed("properties", "county_properties")

    df_counties_partOf_states = df_states.crossJoin(df_counties).where(
        F.expr("ST_Contains(state_geometry, ST_Centroid(county_geometry))")
    )

    df_isPartOf_relationships = df_counties_partOf_states.select(
        col("county_properties.NAME").alias("Subject"),
        lit("isPartOf").alias("Relationship"),
        col("state_properties.NAME").alias("Object")
    )

    df_contains_relationships = df_counties_partOf_states.select(
        col("state_properties.NAME").alias("Subject"),
        lit("Contains").alias("Relationship"),
        col("county_properties.NAME").alias("Object")
    )

    append_to_csv(df_isPartOf_relationships)
    append_to_csv(df_contains_relationships)


AlabamaCounties.geojson


                                                                                

AlaskaCounties.geojson


                                                                                

AmericanSamoaCounties.geojson
ArizonaCounties.geojson


                                                                                

ArkansasCounties.geojson


                                                                                

CaliforniaCounties.geojson


                                                                                

ColoradoCounties.geojson


                                                                                

CommonwealthoftheNorthernMarianaIslandsCounties.geojson
ConnecticutCounties.geojson


                                                                                

DelawareCounties.geojson
DistrictofColumbiaCounties.geojson
FloridaCounties.geojson


                                                                                

GeorgiaCounties.geojson


                                                                                

GuamCounties.geojson
HawaiiCounties.geojson
IdahoCounties.geojson


                                                                                

IllinoisCounties.geojson


                                                                                

IndianaCounties.geojson


                                                                                

IowaCounties.geojson


                                                                                

KansasCounties.geojson
KentuckyCounties.geojson


                                                                                

LouisianaCounties.geojson


                                                                                

MaineCounties.geojson


                                                                                

MarylandCounties.geojson
MassachusettsCounties.geojson


                                                                                

MichiganCounties.geojson


                                                                                

MinnesotaCounties.geojson


                                                                                

MississippiCounties.geojson


                                                                                

MissouriCounties.geojson


                                                                                

MontanaCounties.geojson


                                                                                

NebraskaCounties.geojson


                                                                                

NevadaCounties.geojson


                                                                                

NewHampshireCounties.geojson


                                                                                

NewJerseyCounties.geojson
NewMexicoCounties.geojson
NewYorkCounties.geojson
NorthCarolinaCounties.geojson


                                                                                

NorthDakotaCounties.geojson


                                                                                

OhioCounties.geojson


                                                                                

OklahomaCounties.geojson


                                                                                

OregonCounties.geojson


                                                                                

PennsylvaniaCounties.geojson
PuertoRicoCounties.geojson


                                                                                

RhodeIslandCounties.geojson


                                                                                

SouthCarolinaCounties.geojson
SouthDakotaCounties.geojson


                                                                                

TennesseeCounties.geojson


                                                                                

TexasCounties.geojson


                                                                                

UnitedStatesVirginIslandsCounties.geojson
UtahCounties.geojson


                                                                                

VermontCounties.geojson


                                                                                

VirginiaCounties.geojson


                                                                                

WashingtonCounties.geojson


                                                                                

WestVirginiaCounties.geojson


                                                                                

WisconsinCounties.geojson


                                                                                

WyomingCounties.geojson


                                                                                

# **Add Tracts**

In [25]:
# Initialize a Hadoop file system 
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

tracts_dataframes = {}

# Directory containing the files
json_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/"

# Define the schema for the GeoJSON data
geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

json_files = get_files_recursive(json_directory)

# Load each JSON file into a DataFrame and store it in the dictionary
for file_path in json_files:
    file_name = file_path.split('/')[-1]
    
    # Print the file path
    print(f"Processing file: {file_path}")
    
    # Read the GeoJSON file using the defined schema using sedona into a spark dataframe
    df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)
    
    # Explode the features array to create a row for each feature and select the columns
    df = (df
        .select(F.explode("features").alias("features"))
        .select("features.*")
        # Use Sedona's ST_GeomFromGeoJSON function to convert the geometry string to a geometry object
        .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )
    
    tracts_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/AlabamaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/AlaskaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/AmericanSamoaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/ArizonaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/ArkansasTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/CaliforniaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/ColoradoTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/CommonwealthoftheNorthernMarianaIslandsTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/ConnecticutTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsBySta

In [26]:
for counties_file_name in counties_dataframes.keys():
    for tracts_file_name in tracts_dataframes.keys():  
        counties = counties_file_name.replace('Counties.geojson', '')
        tracts = tracts_file_name.replace('Tracts.geojson', '')
        if counties in tracts:
            print(counties)
            df_counties = counties_dataframes[counties_file_name]
            df_tracts = tracts_dataframes[tracts_file_name]
            
            df_counties = df_counties.withColumnRenamed("geometry", "counties_geometry")
            df_tracts = df_tracts.withColumnRenamed("geometry", "tracts_geometry")

            df_counties = df_counties.withColumnRenamed("properties", "counties_properties")
            df_tracts = df_tracts.withColumnRenamed("properties", "tracts_properties")

            df_tracts_partOf_counties = df_counties.crossJoin(df_tracts).where(
                F.expr("ST_Contains(counties_geometry, ST_Centroid(tracts_geometry))")
            )

            df_isPartOf_relationships = df_tracts_partOf_counties.select(
                col("tracts_properties.NAME").alias("Subject"),
                lit("isPartOf").alias("Relationship"),
                col("counties_properties.NAME").alias("Object")
            )

            df_contains_relationships = df_tracts_partOf_counties.select(
                col("counties_properties.NAME").alias("Subject"),
                lit("Contains").alias("Relationship"),
                col("tracts_properties.NAME").alias("Object")
            )
            
            append_to_csv(df_isPartOf_relationships)
            append_to_csv(df_contains_relationships)


Alabama


                                                                                

Alaska
AmericanSamoa
Arizona


                                                                                

Arkansas


                                                                                

California


                                                                                

Colorado


                                                                                

CommonwealthoftheNorthernMarianaIslands


                                                                                

Connecticut
Delaware
DistrictofColumbia
Florida


                                                                                

Georgia


                                                                                

Guam
Hawaii
Idaho


                                                                                

Illinois


                                                                                

Indiana


                                                                                

Iowa


                                                                                

Kansas


                                                                                

Kentucky


                                                                                

Louisiana


                                                                                

Maine


                                                                                

Maryland


                                                                                

Massachusetts


                                                                                

Michigan


                                                                                

Minnesota


                                                                                

Mississippi


                                                                                

Missouri


                                                                                

Montana


                                                                                

Nebraska


                                                                                

Nevada
NewHampshire
NewJersey


                                                                                

NewMexico
NewYork


                                                                                

NorthCarolina


                                                                                

NorthDakota
Ohio


                                                                                

Oklahoma


                                                                                

Oregon


                                                                                

Pennsylvania


                                                                                

PuertoRico


                                                                                

RhodeIsland
SouthCarolina


                                                                                

SouthDakota


                                                                                

Tennessee


                                                                                

Texas


                                                                                

UnitedStatesVirginIslands
Utah


                                                                                

Vermont
Virginia


                                                                                

Virginia


                                                                                

Washington


                                                                                

WestVirginia


                                                                                

Wisconsin


                                                                                

Wyoming


# **Add Blocks**

In [27]:
# Initialize a Hadoop file system 
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

blocks_dataframes = {}

# Directory containing the files
json_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/"

# Define the schema for the GeoJSON data
geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

json_files = get_files_recursive(json_directory)

# Load each JSON file into a DataFrame and store it in the dictionary
for file_path in json_files:
    file_name = file_path.split('/')[-1]
    
    # Print the file path
    print(f"Processing file: {file_path}")
    
    # Read the GeoJSON file using the defined schema using sedona into a spark dataframe
    df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)
    
    # Explode the features array to create a row for each feature and select the columns
    df = (df
        .select(F.explode("features").alias("features"))
        .select("features.*")
        # Use Sedona's ST_GeomFromGeoJSON function to convert the geometry string to a geometry object
        .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )
    
    blocks_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alabama.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alaska.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/AmericanSamoa.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arizona.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arkansas.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/California.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Colorado.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/CommonwealthoftheNorthernMarianaIslands.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Connecticut.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Delaware.geojson', 'hdfs://columbus-oh.cs.colostate

In [28]:
for tracts_file_name in tracts_dataframes.keys():
    for blocks_file_name in blocks_dataframes.keys():  
        blocks = blocks_file_name.replace('.geojson', '')
        tracts = tracts_file_name.replace('Tracts.geojson', '')
        if blocks in tracts:
            print(blocks)
            df_blocks = blocks_dataframes[blocks_file_name]
            df_tracts = tracts_dataframes[tracts_file_name]
            
            df_blocks = df_blocks.withColumnRenamed("geometry", "blocks_geometry")
            df_tracts = df_tracts.withColumnRenamed("geometry", "tracts_geometry")

            df_blocks = df_blocks.withColumnRenamed("properties", "blocks_properties")
            df_tracts = df_tracts.withColumnRenamed("properties", "tracts_properties")

            df_blocks_partOf_tracts = df_blocks.crossJoin(df_tracts).where(
                F.expr("ST_Contains(tracts_geometry, ST_Centroid(blocks_geometry))")
            )
           
            df_isPartOf_relationships = df_blocks_partOf_tracts.select(
                col("blocks_properties.NAME").alias("Subject"),
                lit("isPartOf").alias("Relationship"),
                col("tracts_properties.NAME").alias("Object")
            )

            df_contains_relationships = df_blocks_partOf_tracts.select(
                col("tracts_properties.NAME").alias("Subject"),
                lit("Contains").alias("Relationship"),
                col("blocks_properties.NAME").alias("Object")
            )
            
            append_to_csv(df_isPartOf_relationships)
            append_to_csv(df_contains_relationships)

Alabama


                                                                                

Alaska


                                                                                

AmericanSamoa


                                                                                

Arizona


                                                                                

Arkansas


                                                                                

California


                                                                                

Colorado


                                                                                

CommonwealthoftheNorthernMarianaIslands
Connecticut


                                                                                

Delaware
DistrictofColumbia
Florida


                                                                                

Georgia


                                                                                

Guam
Hawaii
Idaho


                                                                                

Illinois


                                                                                

Indiana


                                                                                

Iowa


                                                                                

Kansas


                                                                                

Kentucky


                                                                                

Louisiana


                                                                                

Maine


                                                                                

Maryland


                                                                                

Massachusetts


                                                                                

Michigan


                                                                                

Minnesota


                                                                                

Mississippi


                                                                                

Missouri


                                                                                

Montana


                                                                                

Nebraska


                                                                                

Nevada


                                                                                

NewHampshire
NewJersey


                                                                                

NewMexico


                                                                                

NewYork


                                                                                

NorthCarolina


                                                                                

NorthDakota
Ohio


                                                                                

Oklahoma


                                                                                

Oregon


                                                                                

Pennsylvania


                                                                                

PuertoRico


                                                                                

RhodeIsland
SouthCarolina


                                                                                

SouthDakota
Tennessee


                                                                                

Texas


                                                                                

UnitedStatesVirginIslands
Utah


                                                                                

Vermont
Virginia


                                                                                

Washington


                                                                                

Virginia


                                                                                

WestVirginia


                                                                                

Wisconsin


                                                                                

Wyoming


## More Comparitive Datasets

Temperature, Average Annual 1971 - 2000 for Wyoming at 1:250,000

Source: https://www.sciencebase.gov/catalog/item/4f4e479ee4b07f02db4927d7

World Urban Areas

Source: https://www.sciencebase.gov/catalog/item/537f6b14e4b021317a86f8dc

Land status in the Colorado Plateau coal assessment study area

Source: https://www.sciencebase.gov/catalog/item/60a6bbddd34ea221ce4ba94b

TIGER/Line Geodatabases

Source: https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-geodatabase-file.html

PAD-US 2.1 Download data by State GeoJSON

Source: https://www.sciencebase.gov/catalog/item/6025985bd34eb12031138e21