In [1]:
import sys

# Extra site-packages directory appended to the module search path ahead of the
# pyspark import (presumably where pyspark is installed — confirm).
EXTRA_SITE_PACKAGES = "/usr/local/python-env/py39/lib/python3.9/site-packages"
sys.path.append(EXTRA_SITE_PACKAGES)

import pyspark

# Report the PySpark version and the interpreter running this kernel, one per line.
print(pyspark.__version__, sys.executable, sep="\n")

3.5.1
/usr/bin/python3.9


In [2]:
import os

# Interpreter that PySpark should launch for its Python processes.
os.environ.update({"PYSPARK_PYTHON": "/usr/bin/python3.9"})

In [3]:
import pkg_resources

# Read the installed distribution's version through the standard library:
# pkg_resources is deprecated by setuptools, while importlib.metadata ships
# with Python 3.8+ and returns the same version string.
from importlib.metadata import version as distribution_version

sedona_version = distribution_version("apache-sedona")
print(f"Apache Sedona version: {sedona_version}")

Apache Sedona version: 1.5.1


In [4]:
# Echo the Spark-related environment variables, one per line.
# A missing variable raises KeyError, exactly as direct indexing would.
print(os.environ['SPARK_HOME'], os.environ['PYSPARK_PYTHON'], sep="\n")

/usr/local/spark/latest
/usr/bin/python3.9


In [5]:
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from pyspark.sql.functions import col


In [6]:
# Create (or reuse) the Spark session against the standalone master and ask
# Spark to resolve the GraphFrames package at startup.
spark = (
    SparkSession.builder
    .appName("GraphFramesExample")
    .master('spark://columbus-oh.cs.colostate.edu:30800')
    .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.4-s_2.12")
    .config("spark.yarn.resourcemanager.address", "columbia.cs.colostate.edu:30799")
    .getOrCreate()
)

# Only surface errors from Spark in the notebook output.
spark.sparkContext.setLogLevel("ERROR")

# JVM-side log4j logger named after this module.
logger = spark._jvm.org.apache.log4j.LogManager.getLogger(__name__)


:: loading settings :: url = jar:file:/usr/local/spark/3.5.0-with-hadoop3.3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /s/chopin/a/grad/flarrieu/.ivy2/cache
The jars for the packages stored in: /s/chopin/a/grad/flarrieu/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2decc6c3-8899-427c-9579-8dada6759f1c;1.0
	confs: [default]
	found graphframes#graphframes;0.8.3-spark3.4-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in spark-list
:: resolution report :: resolve 125ms :: artifacts dl 6ms
	:: modules in use:
	graphframes#graphframes;0.8.3-spark3.4-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from spark-list in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-

In [7]:
# py4j helper used to pull Java classes into the JVM view.
from py4j.java_gateway import java_import

# Register Hadoop's Path class on the session's JVM view.
jvm = spark._jvm
java_import(jvm, 'org.apache.hadoop.fs.Path')

# Hadoop FileSystem handle built from the session's Hadoop configuration.
hadoop_conf = spark._jsc.hadoopConfiguration()
fs = jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)

In [8]:
def get_csv_df(
    file_name: str,
    data_directory: str = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/",
):
    """Load a graph CSV file as a Spark DataFrame with GraphFrame column names.

    The kind of file is decided by a substring match on ``file_name``:

    * contains ``"Nodes"`` -> columns ``Node_ID, Type``; ``Node_ID`` is renamed to ``id``.
    * contains ``"Edges"`` -> columns ``Subject, Relationship, Object``; ``Subject`` and
      ``Object`` are renamed to ``src`` and ``dst``.

    Args:
        file_name: Name of the CSV file, appended verbatim to ``data_directory``.
        data_directory: Directory prefix (including the trailing slash) the file
            is read from. Defaults to the graph directory on HDFS.

    Returns:
        The loaded DataFrame, or ``None`` when ``file_name`` contains neither
        ``"Nodes"`` nor ``"Edges"``. Callers should check for ``None``.
    """
    print(f"Processing file: {file_name}")

    # Pick the explicit schema and the renames GraphFrame expects for this file kind.
    if "Nodes" in file_name:
        schema = "Node_ID string, Type string"
        renames = {"Node_ID": "id"}
    elif "Edges" in file_name:
        schema = "Subject string, Relationship string, Object string"
        renames = {"Subject": "src", "Object": "dst"}
    else:
        # Unrecognised file kind: nothing is read.
        return None

    # Single read path shared by both file kinds; the first row is the header.
    df = (
        spark.read.format("csv")
        .option("header", "true")
        .schema(schema)
        .load(data_directory + file_name)
    )
    for old_name, new_name in renames.items():
        df = df.withColumnRenamed(old_name, new_name)

    return df


In [9]:
# Import the one name this notebook uses instead of a wildcard, so it is clear
# where GraphFrame comes from and the namespace is not polluted.
from graphframes import GraphFrame

In [10]:
# Read the vertex file first, then the edge file, via get_csv_df.
nodes_df, edges_df = (
    get_csv_df(csv_name) for csv_name in ("BaseNodes.csv", "BaseEdges.csv")
)

Processing file: BaseNodes.csv
Processing file: BaseEdges.csv


In [11]:
# Column names of the vertex and edge frames, one list per line.
print(nodes_df.columns, edges_df.columns, sep="\n")

['id', 'Type']
['src', 'Relationship', 'dst']


In [12]:
# Preview the vertices, then the edges, with show()'s default row limit.
for _frame in (nodes_df, edges_df):
    _frame.show()

                                                                                

+--------------------+-----+
|                  id| Type|
+--------------------+-----+
|BlockGroup_060871...|Block|
|BlockGroup_060375...|Block|
|BlockGroup_060133...|Block|
|BlockGroup_060372...|Block|
|BlockGroup_060014...|Block|
|BlockGroup_060855...|Block|
|BlockGroup_061070...|Block|
|BlockGroup_060830...|Block|
|BlockGroup_060133...|Block|
|BlockGroup_060374...|Block|
|BlockGroup_060371...|Block|
|BlockGroup_060590...|Block|
|BlockGroup_060190...|Block|
|BlockGroup_060855...|Block|
|BlockGroup_060770...|Block|
|BlockGroup_060470...|Block|
|BlockGroup_060910...|Block|
|BlockGroup_060816...|Block|
|BlockGroup_060371...|Block|
|BlockGroup_060730...|Block|
+--------------------+-----+
only showing top 20 rows



[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+------------+-----------------+
|                 src|Relationship|              dst|
+--------------------+------------+-----------------+
|BlockGroup_060730...|    isPartOf|Tract_06073018102|
|BlockGroup_060730...|    isPartOf|Tract_06073007200|
|BlockGroup_060730...|    isPartOf|Tract_06073017702|
|BlockGroup_060759...|    isPartOf|Tract_06075980200|
|BlockGroup_060350...|    isPartOf|Tract_06035040600|
|BlockGroup_060750...|    isPartOf|Tract_06075010500|
|BlockGroup_060590...|    isPartOf|Tract_06059099239|
|BlockGroup_060373...|    isPartOf|Tract_06037301206|
|BlockGroup_060590...|    isPartOf|Tract_06059099405|
|BlockGroup_060290...|    isPartOf|Tract_06029002002|
|BlockGroup_061130...|    isPartOf|Tract_06113010704|
|BlockGroup_060650...|    isPartOf|Tract_06065046900|
|BlockGroup_060770...|    isPartOf|Tract_06077004905|
|BlockGroup_060374...|    isPartOf|Tract_06037408900|
|BlockGroup_060790...|    isPartOf|Tract_06079012302|
|BlockGroup_061110...|    is

                                                                                

In [13]:
# Assemble the graph from the vertex frame (`id`) and the edge frame (`src`/`dst`).
g = GraphFrame(v=nodes_df, e=edges_df)


In [14]:
# Keep only the "Contains" edges whose source vertex is the state of Colorado.
from_colorado = col("src") == "State_Colorado"
is_contains = col("Relationship") == "Contains"
counties_in_state = edges_df.filter(from_colorado & is_contains)

counties_in_state.show()



+--------------+------------+-----------------+
|           src|Relationship|              dst|
+--------------+------------+-----------------+
|State_Colorado|    Contains| County_Archuleta|
|State_Colorado|    Contains|  County_La Plata|
|State_Colorado|    Contains| County_Montezuma|
|State_Colorado|    Contains|County_Rio Grande|
|State_Colorado|    Contains|   County_Mineral|
|State_Colorado|    Contains|   County_Dolores|
|State_Colorado|    Contains|  County_San Juan|
|State_Colorado|    Contains|  County_Hinsdale|
|State_Colorado|    Contains|County_San Miguel|
|State_Colorado|    Contains|     County_Ouray|
|State_Colorado|    Contains|   County_Conejos|
|State_Colorado|    Contains|  County_Costilla|
|State_Colorado|    Contains|   County_Alamosa|
|State_Colorado|    Contains|  County_Huerfano|
|State_Colorado|    Contains|  County_Saguache|
|State_Colorado|    Contains|    County_Custer|
|State_Colorado|    Contains|    County_Pueblo|
|State_Colorado|    Contains|   County_F

                                                                                

# Aggregation Queries

In [None]:
# Spark's column aggregate functions. The unqualified `sum` used here before was
# Python's builtin, which raises TypeError when handed the string "Population".
from pyspark.sql import functions as F

demographics_df = spark.read.csv("demographics.csv", header=True, inferSchema=True)

# Aggregate data for each tract: attach each demographics row to the edge whose
# `dst` equals its BlockGroupID, then total Population per edge `src`.
# NOTE(review): the edge sample shown earlier has BlockGroup as `src` and Tract as
# `dst` (isPartOf); if those are the only BlockGroup/Tract edges, this join should
# match on `src` and group by `dst` instead — confirm against the edge data.
tract_data = (
    demographics_df
    .join(edges_df, edges_df["dst"] == demographics_df["BlockGroupID"])
    .groupBy("src")
    .agg(F.sum("Population").alias("TotalPopulation"))
)

# Proximity Queries

In [None]:
# Radius query: keep the nodes whose (lon, lat) point lies within `radius` of a
# reference point.
# NOTE(review): nodes_df as loaded by get_csv_df only has `id` and `Type`; this
# filter needs numeric `lat` and `lon` columns — confirm they are added first.
# NOTE(review): ST_Point / ST_Distance are Sedona SQL functions and nothing visible
# in this notebook registers them on `spark` — confirm before running.
from pyspark.sql.functions import expr

# Query parameters. These names were previously undefined, so the cell failed
# with NameError. The values below are examples only — replace with the real query.
point_lat = 40.5734
point_lon = -105.0844
radius = 0.5  # presumably in coordinate units (degrees for lon/lat) — confirm

radius_query = nodes_df.filter(
    expr(f"ST_Distance(ST_Point(lon, lat), ST_Point({point_lon}, {point_lat})) <= {radius}")
)

# In-degree

In [None]:
# Optional: a simple graph metric — the in-degree GraphFrames reports per vertex.
print("In-degree of each vertex:")
in_degrees = g.inDegrees
in_degrees.show()