In [1]:
import pyspark     # Call this only after findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

df = spark.read.csv("/home/sysadm/Downloads/check.csv", header = True, inferSchema = True)
df.printSchema()

root
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DISTANCE: double (nullable = true)



In [2]:
df_vertices = df.select('ORIGIN', 'ORIGIN_AIRPORT_ID').dropDuplicates(['ORIGIN']).withColumnRenamed('ORIGIN', 'id')

In [3]:
df_edges = df.withColumnRenamed('ORIGIN', 'src').withColumnRenamed('DEST', 'dst')

In [4]:
from graphframes import *

g = GraphFrame(df_vertices, df_edges)

In [5]:
df_edges.show()

+-----------------+---------------+---+---+--------+
|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|src|dst|DISTANCE|
+-----------------+---------------+---+---+--------+
|            11953|          10397|GNV|ATL|   300.0|
|            13487|          11193|MSP|CVG|   596.0|
|            11433|          11193|DTW|CVG|   229.0|
|            15249|          10397|TLH|ATL|   223.0|
|            10397|          11778|ATL|FSM|   579.0|
|            11267|          13487|DAY|MSP|   574.0|
|            12448|          10397|JAN|ATL|   341.0|
|            12953|          11193|LGA|CVG|   585.0|
|            12451|          12953|JAX|LGA|   833.0|
|            10397|          10685|ATL|BMI|   533.0|
+-----------------+---------------+---+---+--------+



In [6]:
from graphframes import GraphFrame

# Assuming you have created the GraphFrame object 'g' using df_vertices and df_edges
num_edges = g.edges.count()
print("Number of edges:", num_edges)


Number of edges: 10


In [7]:
from graphframes import GraphFrame

# Assuming you have created the GraphFrame object 'g' using df_vertices and df_edges
vertices_df = g.vertices

# Count the number of vertices
num_vertices = vertices_df.count()
print("Number of vertices:", num_vertices)

# Show the vertices DataFrame
vertices_df.show()


Number of vertices: 9
+---+-----------------+
| id|ORIGIN_AIRPORT_ID|
+---+-----------------+
|TLH|            15249|
|LGA|            12953|
|JAN|            12448|
|ATL|            10397|
|DAY|            11267|
|DTW|            11433|
|JAX|            12451|
|MSP|            13487|
|GNV|            11953|
+---+-----------------+



In [8]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [9]:
from graphframes import *

g = GraphFrame(df_vertices, df_edges)

In [10]:
from pyspark.sql.functions import desc
distance = g.edges.select('src', 'dst', 'DISTANCE').distinct().sort(desc('DISTANCE'))
distance.show()

+---+---+--------+
|src|dst|DISTANCE|
+---+---+--------+
|JAX|LGA|   833.0|
|MSP|CVG|   596.0|
|LGA|CVG|   585.0|
|ATL|FSM|   579.0|
|DAY|MSP|   574.0|
|ATL|BMI|   533.0|
|JAN|ATL|   341.0|
|GNV|ATL|   300.0|
|DTW|CVG|   229.0|
|TLH|ATL|   223.0|
+---+---+--------+



In [11]:
import pyspark.sql.functions as F
sub = GraphFrame(g.vertices, distance)

In [12]:
# Find chains of two flights
result = sub.edges.alias("ab") \
    .join(sub.edges.alias("bc"), col("ab.dst") == col("bc.src")) \
    .select(col("ab.src").alias("a"), col("ab.dst").alias("b"), col("bc.dst").alias("c"))

# Display the result
result.count()

8

In [13]:
# Find chains of two flights where the origin and destination are not directly connected
result = sub.edges.alias("ab") \
    .join(sub.edges.alias("bc"), (col("ab.dst") == col("bc.src")) & (col("ab.src") != col("bc.dst"))) \
    .select(col("ab.src").alias("a"), col("ab.dst").alias("b"), col("bc.dst").alias("c"))

# Display the result
result.count()

8