In [None]:
import sys

# Make the py39 environment's site-packages importable from this kernel.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
SITE_PACKAGES = "/usr/local/python-env/py39/lib/python3.9/site-packages"
if SITE_PACKAGES not in sys.path:
    sys.path.append(SITE_PACKAGES)

import pyspark
print(pyspark.__version__)

# Show which interpreter the kernel itself is running on.
print(sys.executable)

In [None]:
import os

# Record the interpreter path under PYSPARK_PYTHON in this process's
# environment; later cells read the value back from os.environ.
os.environ.update(PYSPARK_PYTHON='/usr/bin/python3.9')

In [None]:
# pkg_resources is deprecated by setuptools; importlib.metadata is the
# standard-library way to look up an installed distribution's version.
from importlib.metadata import version as distribution_version

# Raises importlib.metadata.PackageNotFoundError if apache-sedona is not installed.
sedona_version = distribution_version("apache-sedona")
print(f"Apache Sedona version: {sedona_version}")

In [None]:
# Echo the Spark-related environment variables; a missing one raises KeyError.
for env_name in ('SPARK_HOME', 'PYSPARK_PYTHON'):
    print(os.environ[env_name])

In [None]:
# NOTE(review): this cell repeats the preceding cell's environment check
# verbatim (same two variables, same order); one of the two can likely be dropped.
print(os.environ['SPARK_HOME'])
print(os.environ['PYSPARK_PYTHON'])

In [None]:
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from pyspark.sql.functions import col


In [None]:
# Build (or reuse) the SparkSession used by every later cell.
spark = (
    SparkSession.builder
    .appName("GraphFramesExample")
    .master('spark://columbus-oh.cs.colostate.edu:30800')
    # Requests the GraphFrames package by its Maven coordinates.
    .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.4-s_2.12")
    # NOTE(review): a YARN resource-manager address is set although the master
    # URL above uses the spark:// scheme -- confirm this setting is needed.
    .config("spark.yarn.resourcemanager.address", "columbia.cs.colostate.edu:30799")
    .getOrCreate()
)

# Only ERROR-level messages are logged from here on.
spark.sparkContext.setLogLevel("ERROR")
# JVM-side log4j logger named after this module.
logger = spark._jvm.org.apache.log4j.LogManager.getLogger(__name__)


In [None]:
# py4j's java_import makes a JVM class reachable through the gateway view.
from py4j.java_gateway import java_import

# Register Hadoop's Path class (used for file paths) on the session's JVM view.
java_import(spark._jvm, 'org.apache.hadoop.fs.Path')

# Get a Hadoop FileSystem handle from the session's Hadoop configuration.
hadoop_conf = spark._jsc.hadoopConfiguration()
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)

In [None]:
# Default location of the graph CSV files.
GRAPH_DATA_DIR = "hdfs://columbus-oh.cs.colostate.edu:30785/geospatial/graph/"


def get_csv_df(file_name: str, data_directory: str = GRAPH_DATA_DIR):
    """Load a graph CSV file as a DataFrame with GraphFrame-compatible column names.

    The kind of file is inferred from its name:

    * a name containing "Nodes" is read with the schema ``Node_ID, Type`` and
      ``Node_ID`` is renamed to ``id``;
    * otherwise, a name containing "Edges" is read with the schema
      ``Subject, Relationship, Object`` and ``Subject``/``Object`` are renamed
      to ``src``/``dst``.

    Uses the notebook-level ``spark`` session.

    Args:
        file_name: Name of the CSV file inside ``data_directory``.
        data_directory: Directory holding the file. A trailing "/" is added
            when missing; an empty string means ``file_name`` is used as-is.

    Returns:
        The loaded DataFrame, or ``None`` when ``file_name`` contains neither
        "Nodes" nor "Edges" (a message is printed in that case).
    """
    print(f"Processing file: {file_name}")

    # Pick the schema and the column renames for this kind of file.
    if "Nodes" in file_name:
        schema = "Node_ID string, Type string"
        renames = {"Node_ID": "id"}
    elif "Edges" in file_name:
        schema = "Subject string, Relationship string, Object string"
        renames = {"Subject": "src", "Object": "dst"}
    else:
        # Make the unrecognized case visible instead of returning None silently.
        print(f"Skipping file: {file_name} (name contains neither 'Nodes' nor 'Edges')")
        return None

    # Join directory and file name without doubling or dropping the separator.
    if data_directory and not data_directory.endswith("/"):
        data_directory = data_directory + "/"

    df = (
        spark.read.format("csv")
        .option("header", "true")
        .schema(schema)
        .load(data_directory + file_name)
    )
    for old_name, new_name in renames.items():
        df = df.withColumnRenamed(old_name, new_name)

    return df


In [None]:
from graphframes import *

In [None]:
# Read the vertex file first, then the edge file, via get_csv_df.
nodes_df, edges_df = (
    get_csv_df(csv_name) for csv_name in ("BaseNodes.csv", "BaseEdges.csv")
)

In [None]:
# Show the column names of both frames, nodes first.
for frame in (nodes_df, edges_df):
    print(frame.columns)

In [None]:
# Assemble the graph: nodes_df supplies the vertices, edges_df the edges.
g = GraphFrame(v=nodes_df, e=edges_df)


In [None]:
# Optional sanity check: display the graph's in-degree table.
print("In-degree of each vertex:")
in_degree_df = g.inDegrees
in_degree_df.show()