# **Knowledge Graph Generation**


### Load data into Spark

In [1]:
import sys
sys.path.append("/usr/local/python-env/py39/lib/python3.9/site-packages")

import pyspark
print(pyspark.__version__)

print(sys.executable)

3.5.1
/usr/bin/python3.9


### Initialze a SparkSession

Ensuring the pyspark library is being accessed from my local usr directory.


In [2]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.9'

In [3]:
import pkg_resources

sedona_version = pkg_resources.get_distribution("apache-sedona").version
print(f"Apache Sedona version: {sedona_version}")


Apache Sedona version: 1.5.1


In [4]:
print(os.environ['SPARK_HOME'])
print(os.environ['PYSPARK_PYTHON'])

/usr/local/spark/latest
/usr/bin/python3.9


In [5]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, lit,  split, expr, concat, when, explode
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DateType
from pyspark.sql.functions import year 
from pyspark.sql import Window
from pyspark.sql.functions import sum as pyspark_sum
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.spark import *
import geopandas as gpd

Skipping SedonaKepler import, verify if keplergl is installed


## Now to make the app

In [6]:
spark = SparkSession \
    .builder \
    .appName('GeoSpatialQueries_Freddy') \
    .master('spark://columbus-oh.cs.colostate.edu:30800') \
    .config("spark.yarn.resourcemanager.address", "columbia.cs.colostate.edu:30799") \
    .config("spark.executor.memory", "3g") \
    .config("spark.executor.memoryOverhead", "512m") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", "500m") \
    .config("spark.serializer", KryoSerializer.getName) \
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
    .config('spark.jars.packages',
            'org.apache.sedona:sedona-spark-3.5_2.12:1.5.1,'
            'org.datasyslab:geotools-wrapper:1.5.1-28.2') \
    .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all') \
    .getOrCreate()

# Set log level to DEBUG
spark.sparkContext.setLogLevel("ERROR")

sedona = SedonaContext.create(spark)
SedonaRegistrator.registerAll(spark)

# create a logger
logger = spark._jvm.org.apache.log4j.LogManager.getLogger(__name__)
logger.info("Pyspark initialized...")

https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1
Ivy Default Cache set to: /s/chopin/a/grad/flarrieu/.ivy2/cache
The jars for the packages stored in: /s/chopin/a/grad/flarrieu/.ivy2/jars
org.apache.sedona#sedona-spark-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-209a4d85-873b-4c53-9a18-3505a6c7e02b;1.0
	confs: [default]


:: loading settings :: url = jar:file:/usr/local/spark/3.5.0-with-hadoop3.3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-spark-3.5_2.12;1.5.1 in central
	found org.apache.sedona#sedona-common;1.5.1 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found org.locationtech.jts#jts-core;1.19.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.locationtech.spatial4j#spatial4j;0.8 in central
	found com.google.geometry#s2-geometry;2.0.0 in central
	found com.google.guava#guava;25.1-jre in central
	found com.google.code.findbugs#jsr305;3.0.2 in user-list
	found org.checkerframework#checker-qual;2.0.0 in central
	found com.google.errorprone#error_prone_annotations;2.1.3 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.14 in central
	found com.uber#h3;4.1.1 in central
	found net.sf.geographiclib#GeographicLib-Java;1.52 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.2 in central
	found org.checkerframework#checker-qual;3.10.0 in central
	found com.google.err

# **Helper Functions**

In [7]:
# Import the necessary module from py4j to interact with JVM
from py4j.java_gateway import java_import

# Import the Path class from Hadoop. This class is used to handle file paths in Hadoop.
java_import(spark._jvm, 'org.apache.hadoop.fs.Path')

fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

def get_files_recursive(path):
    file_status_arr = fs.listStatus(spark._jvm.Path(path))
    file_paths = []
    
    for file_status in file_status_arr:
        if file_status.isDirectory():
            file_paths += get_files_recursive(file_status.getPath().toString())
        elif file_status.getPath().getName().endswith(('.geojson')):
            file_paths.append(file_status.getPath().toString())
    
    print(file_paths)
    return file_paths

In [8]:
def create_csv(df: DataFrame, path: str):
    df.write.csv(path=path, mode='append', header=True)

In [26]:
def append_to_csv(df: DataFrame, path: str):
    df.write.csv(path=path, mode='append', header=True)

In [10]:
def get_file_to_df(file_name: str): 
    data_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/"
    print(f"Processing file: {file_name}")

    geojson_schema =  "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

    df = spark.read.schema(geojson_schema).json(data_directory + file_name, multiLine=True)
    
    df = (df
        .select(explode("features").alias("features"))
        .select("features.*")
        .withColumn("geometry", expr("ST_GeomFromGeoJSON(geometry)"))
    )
    
    return df

In [11]:
def create_edge_df(df_parent: DataFrame, subject, relationship, object):
    df_edges = df_parent.withColumn("Subject", subject) \
        .withColumn("Relationship", relationship) \
        .withColumn("Object", object) \
        .select("Subject", "Relationship", "Object")
    return df_edges

In [12]:
def create_node_df(df_edges: DataFrame, type):
    nodes_subject = df_edges.select(col("Subject").alias("Node_ID")).distinct()
    nodes_object = df_edges.select(col("Object").alias("Node_ID")).distinct()

    nodes_df = nodes_subject.union(nodes_object).distinct()

    nodes_df = nodes_df.withColumn("Type", type)
    
    return nodes_df

In [13]:
def create_contains_df(df_parent: DataFrame, df_child: DataFrame, ):
    df_parent = df_parent.withColumnRenamed("geometry", "parent_geometry")
    df_child = df_child.withColumnRenamed("geometry", "child_geometry")
    df_parent = df_parent.withColumnRenamed("properties", "parent_properties")
    df_child = df_child.withColumnRenamed("properties", "child_properties")

    df_parent.createOrReplaceTempView("parents")
    df_child.createOrReplaceTempView("children")

    df_country_continent_contains = spark.sql("""
        SELECT *
        FROM parents, children
        WHERE ST_Contains(parent_geometry, ST_Centroid(child_geometry))
    """)

    return df_country_continent_contains.select("parent_geometry", "child_geometry", "parent_properties", "child_properties")

In [46]:
from pyspark.sql import DataFrame
from functools import reduce

def union_all(dfs_dict):
    dfs = list(dfs_dict.values())
    if dfs:
        return reduce(DataFrame.unionByName, dfs)
    else:
        return None


# Global Variables

In [27]:
base_edges_path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/BaseEdges.csv"
base_nodes_path = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/BaseNodes.csv"

# Global Dataframes

In [28]:
final_edge_dataframes = {}
final_node_dataframes = {}

# **Add Continents**

In [29]:
earth_schema = StructType([
    StructField("Node_ID", StringType(), True),
    StructField("Type", StringType(), True),
])

empty_earth_df = spark.createDataFrame([Row(Node_ID="Planet_Earth", Type="Planet")], earth_schema)

final_node_dataframes["Earth"] = empty_earth_df

In [30]:
empty_earth_df.show()

                                                                                

+------------+------+
|     Node_ID|  Type|
+------------+------+
|Planet_Earth|Planet|
+------------+------+



In [31]:
df_continents = get_file_to_df('WorldContinents.geojson')
continent_prefix = "Continent_"

df_continent_isPartOf_earth_edges = create_edge_df(df_continents, subject=concat(lit(continent_prefix), col("properties.CONTINENT")), relationship=lit("isPartOf"), object=lit("Planet_Earth"))

df_continent_isPartOf_earth_edges.show()

final_edge_dataframes['ContinentIsPartOfEarth'] = df_continent_isPartOf_earth_edges

df_earth_contains_edges = create_edge_df(df_continents, lit("Planet_Earth"),  lit("Contains"), concat(lit(continent_prefix), col("properties.CONTINENT")))

df_earth_contains_edges.show()

final_edge_dataframes['EarthContainsContinent'] = df_earth_contains_edges

Processing file: WorldContinents.geojson
+--------------------+------------+------------+
|             Subject|Relationship|      Object|
+--------------------+------------+------------+
|    Continent_Africa|    isPartOf|Planet_Earth|
|      Continent_Asia|    isPartOf|Planet_Earth|
| Continent_Australia|    isPartOf|Planet_Earth|
|   Continent_Oceania|    isPartOf|Planet_Earth|
|Continent_South A...|    isPartOf|Planet_Earth|
|Continent_Antarctica|    isPartOf|Planet_Earth|
|    Continent_Europe|    isPartOf|Planet_Earth|
|Continent_North A...|    isPartOf|Planet_Earth|
+--------------------+------------+------------+

+------------+------------+--------------------+
|     Subject|Relationship|              Object|
+------------+------------+--------------------+
|Planet_Earth|    Contains|    Continent_Africa|
|Planet_Earth|    Contains|      Continent_Asia|
|Planet_Earth|    Contains| Continent_Australia|
|Planet_Earth|    Contains|   Continent_Oceania|
|Planet_Earth|    Contains|

In [32]:
df_nodes = create_node_df(df_continent_isPartOf_earth_edges, lit("Continent"))
df_nodes = df_nodes.filter(df_nodes["Node_ID"] != "Planet_Earth")

final_node_dataframes['Continents'] = df_nodes

df_nodes.show()

+--------------------+---------+
|             Node_ID|     Type|
+--------------------+---------+
|    Continent_Europe|Continent|
|    Continent_Africa|Continent|
|   Continent_Oceania|Continent|
|      Continent_Asia|Continent|
|Continent_South A...|Continent|
|Continent_Antarctica|Continent|
| Continent_Australia|Continent|
|Continent_North A...|Continent|
+--------------------+---------+



In [33]:
create_csv(df_nodes, base_nodes_path)
create_csv(df_continent_isPartOf_earth_edges, base_edges_path)
append_to_csv(df_earth_contains_edges, base_edges_path)

                                                                                

# **Add Countries**

In [34]:
df_continents = get_file_to_df('WorldContinents.geojson')
df_countries = get_file_to_df('CountryTerritories.geojson')

Processing file: WorldContinents.geojson
Processing file: CountryTerritories.geojson


In [35]:
df_continent_contains_country = create_contains_df(df_continents, df_countries)
df_continent_contains_country.show()

+--------------------+--------------------+--------------------+--------------------+
|     parent_geometry|      child_geometry|   parent_properties|    child_properties|
+--------------------+--------------------+--------------------+--------------------+
|MULTIPOLYGON (((3...|POLYGON ((55.7219...|{FID -> 1, OBJECT...|{geo_point_2d -> ...|
|MULTIPOLYGON (((3...|MULTIPOLYGON (((4...|{FID -> 1, OBJECT...|{geo_point_2d -> ...|
|MULTIPOLYGON (((3...|MULTIPOLYGON (((4...|{FID -> 1, OBJECT...|{geo_point_2d -> ...|
|MULTIPOLYGON (((3...|POLYGON ((47.3030...|{FID -> 1, OBJECT...|{geo_point_2d -> ...|
|MULTIPOLYGON (((3...|POLYGON ((28.6863...|{FID -> 1, OBJECT...|{geo_point_2d -> ...|
|MULTIPOLYGON (((3...|POLYGON ((31.2975...|{FID -> 1, OBJECT...|{geo_point_2d -> ...|
|MULTIPOLYGON (((3...|POLYGON ((31.9685...|{FID -> 1, OBJECT...|{geo_point_2d -> ...|
|MULTIPOLYGON (((3...|POLYGON ((25.2644...|{FID -> 1, OBJECT...|{geo_point_2d -> ...|
|MULTIPOLYGON (((3...|POLYGON ((30.4157...|{FID -> 1, 

In [36]:
continent_prefix = "Continent_"
country_prefix = "Country_"

df_country_isPartOf_continent_edges = create_edge_df(df_continent_contains_country, concat(lit(country_prefix), col("child_properties.name")), lit("isPartOf"), concat(lit(continent_prefix), col("parent_properties.CONTINENT")))
df_country_isPartOf_continent_edges.show()
final_edge_dataframes['CountryIsPartOfContinent'] = df_country_isPartOf_continent_edges

df_continent_contains_country_edges = create_edge_df(df_continent_contains_country, concat(lit(continent_prefix), col("parent_properties.CONTINENT")), lit("Contains"), concat(lit(country_prefix), col("child_properties.name")))
df_continent_contains_country_edges.show()
final_edge_dataframes['ContinentContainsContry'] = df_continent_contains_country_edges

                                                                                

+--------------------+------------+----------------+
|             Subject|Relationship|          Object|
+--------------------+------------+----------------+
|     Country_Reunion|    isPartOf|Continent_Africa|
|  Country_Madagascar|    isPartOf|Continent_Africa|
|     Country_Mayotte|    isPartOf|Continent_Africa|
|Country_Glorioso ...|    isPartOf|Continent_Africa|
|     Country_Lesotho|    isPartOf|Continent_Africa|
|Country_South Africa|    isPartOf|Continent_Africa|
|   Country_Swaziland|    isPartOf|Continent_Africa|
|    Country_Botswana|    isPartOf|Continent_Africa|
|    Country_Zimbabwe|    isPartOf|Continent_Africa|
|  Country_Mozambique|    isPartOf|Continent_Africa|
|      Country_Zambia|    isPartOf|Continent_Africa|
|      Country_Malawi|    isPartOf|Continent_Africa|
|Country_United Re...|    isPartOf|Continent_Africa|
|     Country_Burundi|    isPartOf|Continent_Africa|
|Country_Democrati...|    isPartOf|Continent_Africa|
|      Country_Rwanda|    isPartOf|Continent_A

                                                                                

+----------------+------------+--------------------+
|         Subject|Relationship|              Object|
+----------------+------------+--------------------+
|Continent_Africa|    Contains|     Country_Reunion|
|Continent_Africa|    Contains|  Country_Madagascar|
|Continent_Africa|    Contains|     Country_Mayotte|
|Continent_Africa|    Contains|Country_Glorioso ...|
|Continent_Africa|    Contains|     Country_Lesotho|
|Continent_Africa|    Contains|Country_South Africa|
|Continent_Africa|    Contains|   Country_Swaziland|
|Continent_Africa|    Contains|    Country_Botswana|
|Continent_Africa|    Contains|    Country_Zimbabwe|
|Continent_Africa|    Contains|  Country_Mozambique|
|Continent_Africa|    Contains|      Country_Zambia|
|Continent_Africa|    Contains|      Country_Malawi|
|Continent_Africa|    Contains|Country_United Re...|
|Continent_Africa|    Contains|     Country_Burundi|
|Continent_Africa|    Contains|Country_Democrati...|
|Continent_Africa|    Contains|      Country_R

In [37]:
df_nodes = create_node_df(df_country_isPartOf_continent_edges, lit("Country"))
df_nodes = df_nodes.filter(~col("Node_ID").contains("Continent_"))
final_node_dataframes['Countries'] = df_nodes
df_nodes.show()

[Stage 61:>                                                         (0 + 1) / 1]

+--------------------+-------+
|             Node_ID|   Type|
+--------------------+-------+
|     Country_Bahrain|Country|
|Country_Jarvis Is...|Country|
|     Country_Belarus|Country|
|Country_Jammu-Kas...|Country|
|Country_South Africa|Country|
|Country_Glorioso ...|Country|
|     Country_Ecuador|Country|
|  Country_Madagascar|Country|
|Country_The forme...|Country|
| Country_Saint Lucia|Country|
|     Country_Hungary|Country|
|    Country_Dominica|Country|
|Country_Iran (Isl...|Country|
|       Country_China|Country|
|       Country_Malta|Country|
|Country_Trinidad ...|Country|
|Country_Heard Isl...|Country|
|     Country_Iceland|Country|
|    Country_Guernsey|Country|
|       Country_Gabon|Country|
+--------------------+-------+
only showing top 20 rows



                                                                                

In [38]:
append_to_csv(df_nodes, base_nodes_path)
append_to_csv(df_country_isPartOf_continent_edges, base_edges_path)
append_to_csv(df_continent_contains_country_edges, base_edges_path)

                                                                                

# **Add States**

In [39]:
df_countries = get_file_to_df('CountryTerritories.geojson')
df_states = get_file_to_df('States.geojson')

Processing file: CountryTerritories.geojson
Processing file: States.geojson


In [40]:
df_countries_contains_states = create_contains_df(df_countries, df_states)
df_countries_contains_states.show()

+--------------------+--------------------+--------------------+--------------------+
|     parent_geometry|      child_geometry|   parent_properties|    child_properties|
+--------------------+--------------------+--------------------+--------------------+
|MULTIPOLYGON (((-...|MULTIPOLYGON (((-...|{geo_point_2d -> ...|{STATEFP -> 28, S...|
|MULTIPOLYGON (((-...|MULTIPOLYGON (((-...|{geo_point_2d -> ...|{STATEFP -> 37, S...|
|MULTIPOLYGON (((-...|POLYGON ((-103.00...|{geo_point_2d -> ...|{STATEFP -> 40, S...|
|MULTIPOLYGON (((-...|MULTIPOLYGON (((-...|{geo_point_2d -> ...|{STATEFP -> 51, S...|
|MULTIPOLYGON (((-...|POLYGON ((-82.643...|{geo_point_2d -> ...|{STATEFP -> 54, S...|
|MULTIPOLYGON (((-...|MULTIPOLYGON (((-...|{geo_point_2d -> ...|{STATEFP -> 22, S...|
|MULTIPOLYGON (((-...|MULTIPOLYGON (((-...|{geo_point_2d -> ...|{STATEFP -> 26, S...|
|MULTIPOLYGON (((-...|MULTIPOLYGON (((-...|{geo_point_2d -> ...|{STATEFP -> 25, S...|
|MULTIPOLYGON (((-...|POLYGON ((-117.24...|{geo_point_

In [41]:
country_prefix = "Country_"
state_prefix = "State_"

df_state_isPartOf_country_edges = create_edge_df(df_countries_contains_states, concat(lit(state_prefix), col("child_properties.NAME")), lit("isPartOf"), concat(lit(country_prefix), col("parent_properties.name")))
df_state_isPartOf_country_edges.show()
final_edge_dataframes['StateIsPartOfCountry'] = df_state_isPartOf_country_edges

df_country_contains_state_edges = create_edge_df(df_countries_contains_states, concat(lit(country_prefix), col("parent_properties.name")), lit("Contains"), concat(lit(state_prefix), col("child_properties.NAME")))
df_country_contains_state_edges.show()
final_edge_dataframes['CountryContainsState'] = df_country_contains_state_edges

+--------------------+------------+--------------------+
|             Subject|Relationship|              Object|
+--------------------+------------+--------------------+
|   State_Mississippi|    isPartOf|Country_United St...|
|State_North Carolina|    isPartOf|Country_United St...|
|      State_Oklahoma|    isPartOf|Country_United St...|
|      State_Virginia|    isPartOf|Country_United St...|
| State_West Virginia|    isPartOf|Country_United St...|
|     State_Louisiana|    isPartOf|Country_United St...|
|      State_Michigan|    isPartOf|Country_United St...|
| State_Massachusetts|    isPartOf|Country_United St...|
|         State_Idaho|    isPartOf|Country_United St...|
|       State_Florida|    isPartOf|Country_United St...|
|      State_Nebraska|    isPartOf|Country_United St...|
|    State_Washington|    isPartOf|Country_United St...|
|    State_New Mexico|    isPartOf|Country_United St...|
|   State_Puerto Rico|    isPartOf| Country_Puerto Rico|
|  State_South Dakota|    isPar

[Stage 81:>                                                         (0 + 1) / 1]

+--------------------+------------+--------------------+
|             Subject|Relationship|              Object|
+--------------------+------------+--------------------+
|Country_United St...|    Contains|   State_Mississippi|
|Country_United St...|    Contains|State_North Carolina|
|Country_United St...|    Contains|      State_Oklahoma|
|Country_United St...|    Contains|      State_Virginia|
|Country_United St...|    Contains| State_West Virginia|
|Country_United St...|    Contains|     State_Louisiana|
|Country_United St...|    Contains|      State_Michigan|
|Country_United St...|    Contains| State_Massachusetts|
|Country_United St...|    Contains|         State_Idaho|
|Country_United St...|    Contains|       State_Florida|
|Country_United St...|    Contains|      State_Nebraska|
|Country_United St...|    Contains|    State_Washington|
|Country_United St...|    Contains|    State_New Mexico|
| Country_Puerto Rico|    Contains|   State_Puerto Rico|
|Country_United St...|    Conta

                                                                                

In [42]:
df_nodes = create_node_df(df_state_isPartOf_country_edges, lit("State"))
df_nodes = df_nodes.filter(~col("Node_ID").contains("Country_"))
final_node_dataframes['States'] = df_nodes
df_nodes.show()

[Stage 83:>                                                         (0 + 1) / 1]

+--------------------+-----+
|             Node_ID| Type|
+--------------------+-----+
|  State_Pennsylvania|State|
| State_West Virginia|State|
|      State_Maryland|State|
|      State_Colorado|State|
|      State_Illinois|State|
|          State_Guam|State|
|   State_Connecticut|State|
|      State_Michigan|State|
|     State_Wisconsin|State|
|       State_Indiana|State|
|        State_Oregon|State|
| State_New Hampshire|State|
|          State_Ohio|State|
|      State_Oklahoma|State|
|        State_Kansas|State|
|  State_North Dakota|State|
|    State_Washington|State|
|       State_Wyoming|State|
|         State_Maine|State|
|State_North Carolina|State|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [43]:
append_to_csv(df_nodes, base_nodes_path)
append_to_csv(df_state_isPartOf_country_edges, base_edges_path)
append_to_csv(df_country_contains_state_edges, base_edges_path)

                                                                                

# **Add Counties**

In [44]:
counties_dataframes = {}
files_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/"
files = get_files_recursive(files_directory)

for file_path in files:
    file_name = file_path.split('/')[-1]
    
    # Print the file path
    print(f"Processing file: {file_name}")

    geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

    df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)
    
    df = (df
        .select(F.explode("features").alias("features"))
        .select("features.*")
        .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )
    
    counties_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/AlabamaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/AlaskaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/AmericanSamoaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/ArizonaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/ArkansasCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/CaliforniaCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/ColoradoCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/CommonwealthoftheNorthernMarianaIslandsCounties.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/CountiesByState/ConnecticutCounties.geojson', 'hdfs://columbus-oh.cs.colostate.ed

In [47]:
df_states = get_file_to_df('States.geojson')
final_county_edges = {}
final_county_nodes = {}
for state in counties_dataframes.keys():
    print(state)
    df_counties = counties_dataframes[state]

    df_states_contains_counties = create_contains_df(df_states, df_counties)

    county_prefix = "County_"
    state_prefix = "State_"

    df_county_isPartOf_state_edges = create_edge_df(df_states_contains_counties, concat(lit(county_prefix), col("child_properties.NAME")), lit("isPartOf"), concat(lit(state_prefix), col("parent_properties.NAME")))
    # df_county_isPartOf_state_edges.show(1)
    final_edge_dataframes[f'{state}CountyIsPartOfState'] = df_county_isPartOf_state_edges
    final_county_edges[f'{state}CountyIsPartOfState'] = df_county_isPartOf_state_edges

    df_state_contains_county_edges = create_edge_df(df_states_contains_counties, concat(lit(state_prefix), col("parent_properties.NAME")), lit("Contains"), concat(lit(county_prefix), col("child_properties.NAME")))
    # df_state_contains_county_edges.show(1)
    final_edge_dataframes[f'StateContains{state}County'] = df_state_contains_county_edges
    final_county_edges[f'StateContains{state}County'] = df_state_contains_county_edges

    df_nodes = create_node_df(df_county_isPartOf_state_edges, lit("County"))
    df_nodes = df_nodes.filter(~col("Node_ID").contains("State_"))
    final_node_dataframes[f'{state}Counties'] = df_nodes
    final_county_nodes[f'{state}Counties'] = df_nodes

    # df_nodes.show(1)

Processing file: States.geojson
AlabamaCounties.geojson
AlaskaCounties.geojson
AmericanSamoaCounties.geojson
ArizonaCounties.geojson
ArkansasCounties.geojson
CaliforniaCounties.geojson
ColoradoCounties.geojson
CommonwealthoftheNorthernMarianaIslandsCounties.geojson
ConnecticutCounties.geojson
DelawareCounties.geojson
DistrictofColumbiaCounties.geojson
FloridaCounties.geojson
GeorgiaCounties.geojson
GuamCounties.geojson
HawaiiCounties.geojson
IdahoCounties.geojson
IllinoisCounties.geojson
IndianaCounties.geojson
IowaCounties.geojson
KansasCounties.geojson
KentuckyCounties.geojson
LouisianaCounties.geojson
MaineCounties.geojson
MarylandCounties.geojson
MassachusettsCounties.geojson
MichiganCounties.geojson
MinnesotaCounties.geojson
MississippiCounties.geojson
MissouriCounties.geojson
MontanaCounties.geojson
NebraskaCounties.geojson
NevadaCounties.geojson
NewHampshireCounties.geojson
NewJerseyCounties.geojson
NewMexicoCounties.geojson
NewYorkCounties.geojson
NorthCarolinaCounties.geojson


In [48]:
combined_county_edges_df = union_all(final_county_edges)
combined_county_nodes_df = union_all(final_county_nodes)

In [50]:
append_to_csv(combined_county_nodes_df, base_nodes_path)
append_to_csv(combined_county_edges_df, base_edges_path)

                                                                                

# **Add Tracts**

In [51]:
tracts_dataframes = {}
files_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/"

files = get_files_recursive(files_directory)

for file_path in files:
    file_name = file_path.split('/')[-1]
    
    print(f"Processing file: {file_path}")

    geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"

    df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)
    
    df = (df
        .select(F.explode("features").alias("features"))
        .select("features.*")
        .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )
    
    tracts_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/AlabamaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/AlaskaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/AmericanSamoaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/ArizonaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/ArkansasTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/CaliforniaTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/ColoradoTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/CommonwealthoftheNorthernMarianaIslandsTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsByState/ConnecticutTracts.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/TractsBySta

In [52]:
county_prefix = "County_"
tract_prefix = "Tract_"

final_tract_edges = {}
final_tract_nodes = {}

for counties_file_name in counties_dataframes.keys():
    for tracts_file_name in tracts_dataframes.keys():  
        counties = counties_file_name.replace('Counties.geojson', '')
        tracts = tracts_file_name.replace('Tracts.geojson', '')
        if counties in tracts:
            print(counties)
            df_counties = counties_dataframes[counties_file_name]
            df_tracts = tracts_dataframes[tracts_file_name]

            df_counties_contains_tracts = create_contains_df(df_counties, df_tracts)

            df_tract_isPartOf_county_edges = create_edge_df(df_counties_contains_tracts, concat(lit(tract_prefix), col("child_properties.GEOID")), lit("isPartOf"), concat(lit(county_prefix), col("parent_properties.NAME")))
            # df_tract_isPartOf_county_edges.show(1)
            final_edge_dataframes[f'{tracts}TractIsPartOf{counties}County'] = df_tract_isPartOf_county_edges
            final_tract_edges[f'{tracts}TractIsPartOf{counties}County'] = df_tract_isPartOf_county_edges

            df_county_contains_tract_edges = create_edge_df(df_counties_contains_tracts, concat(lit(county_prefix), col("parent_properties.NAME")), lit("Contains"), concat(lit(tract_prefix), col("child_properties.GEOID")))
            # df_county_contains_tract_edges.show(1)
            final_edge_dataframes[f'{counties}CountyContains{tracts}Tract'] = df_county_contains_tract_edges
            final_tract_edges[f'{counties}CountyContains{tracts}Tract'] = df_county_contains_tract_edges

            df_nodes = create_node_df(df_tract_isPartOf_county_edges, lit("Tract"))
            df_nodes = df_nodes.filter(~col("Node_ID").contains("County_"))
            final_node_dataframes[f'{tracts}Tracts'] = df_nodes
            final_tract_nodes[f'{tracts}Tracts'] = df_nodes

            # df_nodes.show(1)

Alabama
Alaska
AmericanSamoa
Arizona
Arkansas
California
Colorado
CommonwealthoftheNorthernMarianaIslands
Connecticut
Delaware
DistrictofColumbia
Florida
Georgia
Guam
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
NewHampshire
NewJersey
NewMexico
NewYork
NorthCarolina
NorthDakota
Ohio
Oklahoma
Oregon
Pennsylvania
PuertoRico
RhodeIsland
SouthCarolina
SouthDakota
Tennessee
Texas
UnitedStatesVirginIslands
Utah
Vermont
Virginia
Virginia
Washington
WestVirginia
Wisconsin
Wyoming


In [53]:
combined_tract_edges_df = union_all(final_tract_edges)
combined_tract_nodes_df = union_all(final_tract_nodes)

In [54]:
append_to_csv(combined_tract_nodes_df, base_nodes_path)
append_to_csv(combined_tract_edges_df, base_edges_path)

                                                                                

# **Add Blocks**

In [55]:
blocks_dataframes = {}
files_directory = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/"

files = get_files_recursive(files_directory)

for file_path in files:
    file_name = file_path.split('/')[-1]
    
    print(f"Processing file: {file_path}")

    geojsonSchema = "type string, crs string, totalFeatures long, features array<struct<type string, geometry string, properties map<string, string>>>"
    
    df = spark.read.schema(geojsonSchema).json(file_path, multiLine=True)
    
    df = (df
        .select(F.explode("features").alias("features"))
        .select("features.*")
        .withColumn("geometry", F.expr("ST_GeomFromGeoJSON(geometry)"))
        )
    
    blocks_dataframes[file_name] = df

['hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alabama.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Alaska.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/AmericanSamoa.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arizona.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Arkansas.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/California.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Colorado.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/CommonwealthoftheNorthernMarianaIslands.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Connecticut.geojson', 'hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/input/BlocksByState/Delaware.geojson', 'hdfs://columbus-oh.cs.colostate

In [56]:
tract_prefix = "Tract_"
block_prefix = "BlockGroup_"

final_block_edges = {}
final_block_nodes = {}

for tracts_file_name in tracts_dataframes.keys():
    for blocks_file_name in blocks_dataframes.keys():  
        blocks = blocks_file_name.replace('.geojson', '')
        tracts = tracts_file_name.replace('Tracts.geojson', '')
        if blocks in tracts:
            print(blocks)
            df_blocks = blocks_dataframes[blocks_file_name]
            df_tracts = tracts_dataframes[tracts_file_name]
            
            df_tracts_contains_blocks = create_contains_df(df_tracts, df_blocks)

            df_block_isPartOf_tract_edges = create_edge_df(df_tracts_contains_blocks, concat(lit(block_prefix), col("child_properties.GEOID")), lit("isPartOf"), concat(lit(tract_prefix), col("parent_properties.GEOID")))
            # df_block_isPartOf_tract_edges.show(1)
            final_edge_dataframes[f'{blocks}BlockIsPartOfTract{tracts}'] = df_block_isPartOf_tract_edges
            final_block_edges[f'{blocks}BlockIsPartOfTract{tracts}'] = df_block_isPartOf_tract_edges

            df_tract_contains_block_edges = create_edge_df(df_tracts_contains_blocks, concat(lit(tract_prefix), col("parent_properties.GEOID")), lit("Contains"), concat(lit(block_prefix), col("child_properties.GEOID")))
            # df_tract_contains_block_edges.show(1)
            final_edge_dataframes[f'{tracts}TractContainsBlock{blocks}'] = df_tract_contains_block_edges
            final_block_edges[f'{tracts}TractContainsBlock{blocks}'] = df_tract_contains_block_edges

            df_nodes = create_node_df(df_block_isPartOf_tract_edges, lit("Block"))
            df_nodes = df_nodes.filter(~col("Node_ID").contains("Tract_"))
            final_node_dataframes[f'{blocks}Blocks'] = df_nodes
            final_block_nodes[f'{blocks}Blocks'] = df_nodes

            # df_nodes.show(1)

Alabama
Alaska
AmericanSamoa
Arizona
Arkansas
California
Colorado
CommonwealthoftheNorthernMarianaIslands
Connecticut
Delaware
DistrictofColumbia
Florida
Georgia
Guam
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
NewHampshire
NewJersey
NewMexico
NewYork
NorthCarolina
NorthDakota
Ohio
Oklahoma
Oregon
Pennsylvania
PuertoRico
RhodeIsland
SouthCarolina
SouthDakota
Tennessee
Texas
UnitedStatesVirginIslands
Utah
Vermont
Virginia
Washington
Virginia
WestVirginia
Wisconsin
Wyoming


In [57]:
combined_block_edges_df = union_all(final_block_edges)
combined_block_nodes_df = union_all(final_block_nodes)

In [58]:
append_to_csv(combined_block_nodes_df, base_nodes_path)
append_to_csv(combined_block_edges_df, base_edges_path)

                                                                                

## More Comparitive Datasets

Temperature, Average Annual 1971 - 2000 for Wyoming at 1:250,000

Source: https://www.sciencebase.gov/catalog/item/4f4e479ee4b07f02db4927d7

World Urban Areas

Source: https://www.sciencebase.gov/catalog/item/537f6b14e4b021317a86f8dc

Land status in the Colorado Plateau coal assessment study area

Source: https://www.sciencebase.gov/catalog/item/60a6bbddd34ea221ce4ba94b

TIGER/Line Geodatabases

Source: https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-geodatabase-file.html

PAD-US 2.1 Download data by State GeoJSON

Source: https://www.sciencebase.gov/catalog/item/6025985bd34eb12031138e21