## PROJECT 3

In [1]:
import pyspark
from delta import configure_spark_with_delta_pip

# Prepare the Spark builder
builder = pyspark.sql.SparkSession.builder.appName("Project3") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder,extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"]).getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", spark._sc.defaultParallelism)

spark.conf.set("spark.sql.repl.eagerEval.enabled",True) # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

import graphframes as gf
import pyspark.sql.functions as F

### Load data and Data preprocessing

In [2]:
# Let's load in some sample data
flight_df = spark.read.csv("input/2009.csv", header=True, inferSchema=True)
display(flight_df)

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
2009-01-01,XE,1204,DCA,EWR,1100,1058.0,-2.0,18.0,1116.0,1158.0,8.0,1202,1206.0,4.0,0.0,,0.0,62.0,68.0,42.0,199.0,,,,,,
2009-01-01,XE,1206,EWR,IAD,1510,1509.0,-1.0,28.0,1537.0,1620.0,4.0,1632,1624.0,-8.0,0.0,,0.0,82.0,75.0,43.0,213.0,,,,,,
2009-01-01,XE,1207,EWR,DCA,1100,1059.0,-1.0,20.0,1119.0,1155.0,6.0,1210,1201.0,-9.0,0.0,,0.0,70.0,62.0,36.0,199.0,,,,,,
2009-01-01,XE,1208,DCA,EWR,1240,1249.0,9.0,10.0,1259.0,1336.0,9.0,1357,1345.0,-12.0,0.0,,0.0,77.0,56.0,37.0,199.0,,,,,,
2009-01-01,XE,1209,IAD,EWR,1715,1705.0,-10.0,24.0,1729.0,1809.0,13.0,1900,1822.0,-38.0,0.0,,0.0,105.0,77.0,40.0,213.0,,,,,,
2009-01-01,XE,1212,ATL,EWR,1915,1913.0,-2.0,19.0,1932.0,2108.0,15.0,2142,2123.0,-19.0,0.0,,0.0,147.0,130.0,96.0,745.0,,,,,,
2009-01-01,XE,1212,CLE,ATL,1645,1637.0,-8.0,12.0,1649.0,1820.0,5.0,1842,1825.0,-17.0,0.0,,0.0,117.0,108.0,91.0,554.0,,,,,,
2009-01-01,XE,1214,DCA,EWR,1915,1908.0,-7.0,9.0,1917.0,1953.0,34.0,2035,2027.0,-8.0,0.0,,0.0,80.0,79.0,36.0,199.0,,,,,,
2009-01-01,XE,1215,EWR,DCA,1715,1710.0,-5.0,28.0,1738.0,1819.0,4.0,1838,1823.0,-15.0,0.0,,0.0,83.0,73.0,41.0,199.0,,,,,,
2009-01-01,XE,1217,EWR,DCA,1300,1255.0,-5.0,15.0,1310.0,1349.0,7.0,1408,1356.0,-12.0,0.0,,0.0,68.0,61.0,39.0,199.0,,,,,,


In [3]:
flight_df.printSchema()

root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: double (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- TAXI_OUT: double (nullable = true)
 |-- WHEELS_OFF: double (nullable = true)
 |-- WHEELS_ON: double (nullable = true)
 |-- TAXI_IN: double (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- ARR_TIME: double (nullable = true)
 |-- ARR_DELAY: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- ACTUAL_ELAPSED_TIME: double (nullable = true)
 |-- AIR_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- CARRIER_DELAY: double (nullable = true)
 |-- WEATHER_DELAY: doub

In [4]:
#Counting the null values
null_counts = flight_df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in flight_df.columns])
null_counts

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,0,0,0,0,0,82867,82867,85787,85787,89322,89321,0,89322,102361,0,6342300,0,0,102362,102361,0,5258837,5258837,5258837,5258837,5258837,6429338


In [7]:
# Creating vertices
# Get unique airport codes from both origin and destination.
origins = flight_df.select(F.col("ORIGIN").alias("id"))
destinations = flight_df.select(F.col("DEST").alias("id"))
flight_vertices = origins.union(destinations).distinct()

display(flight_vertices)

id
GRR
MFE
DAL
ICT
AVL
PBI
XNA
BFL
JFK
LGB


In [8]:
# Creating edges
flight_edges = (flight_df.filter(
        F.column("ORIGIN").isNotNull() & F.column("DEST").isNotNull())
    .select(
        F.column("ORIGIN").alias("src"), 
        F.column("DEST").alias("dst"), 
        F.column("FL_DATE"), 
        F.column("CANCELLED"), 
        F.column("ARR_TIME"), 
        F.column("DISTANCE"), 
    )
)
display(flight_edges)

src,dst,FL_DATE,CANCELLED,ARR_TIME,DISTANCE
DCA,EWR,2009-01-01,0.0,1206.0,199.0
EWR,IAD,2009-01-01,0.0,1624.0,213.0
EWR,DCA,2009-01-01,0.0,1201.0,199.0
DCA,EWR,2009-01-01,0.0,1345.0,199.0
IAD,EWR,2009-01-01,0.0,1822.0,213.0
ATL,EWR,2009-01-01,0.0,2123.0,745.0
CLE,ATL,2009-01-01,0.0,1825.0,554.0
DCA,EWR,2009-01-01,0.0,2027.0,199.0
EWR,DCA,2009-01-01,0.0,1823.0,199.0
EWR,DCA,2009-01-01,0.0,1356.0,199.0


#### Create a GraphFrame

In [9]:
#Creating GraphFrame
flight_graph = gf.GraphFrame(flight_vertices, flight_edges)

flight_vertices.cache()
flight_edges.cache()

display(flight_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 4 more fields])

In [8]:
#flight_vertices.count()
flight_edges.count()

6429338

### Query 1: Compute different statistics : in-degree, out-degree, total degree and triangle count.

#### 1A. in-degree, out-degree, total degree

In [10]:
# Compute out-degrees: count of edges where airport is the source.
out_degrees = flight_edges.groupBy("src").agg(F.count("dst").alias("out_degree"))

# Compute in-degrees: count of edges where airport is the destination.
in_degrees = flight_edges.groupBy("dst").agg(F.count("src").alias("in_degree"))

# Join these counts with the vertices so that every airport gets degree values.
total_degree = (flight_vertices
                  .join(in_degrees, flight_vertices.id == in_degrees.dst, "left")
                  .join(out_degrees, flight_vertices.id == out_degrees.src, "left")
                  .select(flight_vertices.id,
                          F.coalesce(in_degrees["in_degree"], F.lit(0)).alias("in_degree"),
                          F.coalesce(out_degrees["out_degree"], F.lit(0)).alias("out_degree")))
                          
# Compute total degree (simply the sum of in_degree and out_degree).
total_degree = total_degree.withColumn("total_degree", F.col("in_degree") + F.col("out_degree"))

display(total_degree)

id,in_degree,out_degree,total_degree
GRR,13970,13973,27943
MFE,4528,4529,9057
DAL,48209,48207,96416
ICT,13263,13266,26529
AVL,4585,4584,9169
PBI,25496,25500,50996
XNA,13764,13755,27519
BFL,3411,3412,6823
JFK,119571,119574,239145
LGB,14533,14529,29062


In [11]:
# Let's verify the results using inbuilt functions
out_degrees_inbuilt = flight_graph.outDegrees

in_degrees_inbuilt = flight_graph.inDegrees

total_degrees_inbuilt = flight_graph.degrees

degree_df = (flight_graph.vertices
    .join(in_degrees_inbuilt, "id", "left")
    .join(out_degrees_inbuilt, "id", "left")
    .join(total_degrees_inbuilt, "id", "left")
    .select(
        "id",
        F.coalesce(in_degrees_inbuilt["inDegree"], F.lit(0)).alias("in_degree"),
        F.coalesce(out_degrees_inbuilt["outDegree"], F.lit(0)).alias("out_degree"),
        F.coalesce(total_degrees_inbuilt["degree"], F.lit(0)).alias("total_degree")
    )
)

display(degree_df)


id,in_degree,out_degree,total_degree
GRR,13970,13973,27943
MFE,4528,4529,9057
DAL,48209,48207,96416
ICT,13263,13266,26529
AVL,4585,4584,9169
PBI,25496,25500,50996
XNA,13764,13755,27519
BFL,3411,3412,6823
JFK,119571,119574,239145
LGB,14533,14529,29062


#### 1B. triangle count

In [12]:
# Create an undirected view of the edges by ordering the airport codes.
undirected_edges = (flight_edges
                    .select(
                        F.when(F.col("src") < F.col("dst"), F.col("src")).otherwise(F.col("dst")).alias("v1"),
                        F.when(F.col("src") < F.col("dst"), F.col("dst")).otherwise(F.col("src")).alias("v2")
                    )
                    .dropDuplicates())
#display(undirected_edges)

In [13]:
# Alias the undirected edges for the self-join.
e1 = undirected_edges.alias("e1")
e2 = undirected_edges.alias("e2")
e3 = undirected_edges.alias("e3")

# Join e1 and e2 to form potential chains: v1 - v2 - v3
triangle_candidates = e1.join(e2, F.col("e1.v2") == F.col("e2.v1"))

# Now join with e3 to check if an edge exists between v1 and v3 to close the triangle.
triangles = triangle_candidates.join(
    e3,
    (F.col("e1.v1") == F.col("e3.v1")) & (F.col("e2.v2") == F.col("e3.v2"))
)

# Filter triangles to remove duplicates (v1 < v2 < v3)
triangles = triangles.filter(
    (F.col("e1.v1") < F.col("e1.v2")) &
    (F.col("e1.v2") < F.col("e2.v2"))
)


In [14]:
# Compute per-vertex triangle counts: each triangle involves three airports.
# Union the vertices from each triangle and then count how many times each airport appears.
triangle_vertices = (triangles.select(F.col("e1.v1").alias("id"))
                     .union(triangles.select(F.col("e1.v2").alias("id")))
                     .union(triangles.select(F.col("e2.v2").alias("id"))))

vertex_triangle_counts = triangle_vertices.groupBy("id").agg(F.count("*").alias("triangle_count"))

# Join with the vertex degrees so you have one table with all stats.
vertex_stats = total_degree.join(vertex_triangle_counts, "id", "left").fillna(0, subset=["triangle_count"])
display(vertex_stats)

id,in_degree,out_degree,total_degree,triangle_count
GRR,13970,13973,27943,96
MFE,4528,4529,9057,3
DAL,48209,48207,96416,73
ICT,13263,13266,26529,52
AVL,4585,4584,9169,27
PBI,25496,25500,50996,168
XNA,13764,13755,27519,97
BFL,3411,3412,6823,6
JFK,119571,119574,239145,942
LGB,14533,14529,29062,89


In [15]:
#Let's verify the above results with inbuilt function triangleCount
tcount = flight_graph.triangleCount()
display(tcount)

count,id
96,GRR
3,MFE
73,DAL
52,ICT
27,AVL
168,PBI
97,XNA
6,BFL
942,JFK
89,LGB


### Query 2 – Total Number of Triangles in the Graph

In [16]:
# Note: This we computed already in the above query 1B
# Count of triangles in the graph (each triangle counted once).
triangle_total_count = triangles.count()
print("Total number of triangles (overall):", triangle_total_count)

Total number of triangles (overall): 16015


### Query 3 –  centrality measure of your choice natively on Spark using Graphframes.


In [16]:
# Initialize each vertex with a rank (centrality) of 1.0.
ranks = flight_vertices.select("id").withColumn("rank", F.lit(1.0))

# Set parameters for the power iteration.
max_iter = 10
epsilon = 1e-4

for i in range(max_iter):
    # Each vertex's rank is distributed to its destination neighbors.
    contribs = flight_edges.join(ranks, flight_edges.src == ranks.id) \
                    .groupBy("dst") \
                    .agg(F.sum("rank").alias("contrib"))
    
    # Create new ranks by joining these contributions to the vertices.
    new_ranks = flight_vertices.join(contribs, flight_vertices.id == contribs.dst, "left") \
                        .select(flight_vertices.id,
                                F.coalesce(F.col("contrib"), F.lit(0.0)).alias("new_rank"))
    
    # Normalize the new ranks so that the sum of ranks equals 1.
    total_rank = new_ranks.agg(F.sum("new_rank")).first()[0]
    new_ranks = new_ranks.withColumn("new_rank", F.col("new_rank") / total_rank)
    
    # Check convergence.
    joined = ranks.join(new_ranks, "id")
    diff = joined.withColumn("diff", F.abs(F.col("rank") - F.col("new_rank")))
    max_diff_value = diff.agg(F.max("diff")).first()[0]
    print("Iteration", i, "max change:", max_diff_value)
    
    if max_diff_value < epsilon:
        break
    
    # Update the ranks for the next iteration.
    ranks = new_ranks.withColumnRenamed("new_rank", "rank")

# Display the computed eigenvector centrality scores.
print("Eigenvector Centrality Scores:")
ranks.orderBy(F.col("rank").desc()).show(10)


Iteration 0 max change: 0.9999995333889741
Iteration 1 max change: 0.03363152333320294
Iteration 2 max change: 0.008680665250809683
Iteration 3 max change: 0.003152686947788093
Iteration 4 max change: 0.0009518548396991663
Iteration 5 max change: 0.0003445798118187002
Iteration 6 max change: 0.00010000117637908035
Iteration 7 max change: 3.9087763740001946e-05
Eigenvector Centrality Scores:
+---+--------------------+
| id|                rank|
+---+--------------------+
|ATL| 0.03753374418630434|
|ORD| 0.03420041245905581|
|DFW| 0.02919545387978359|
|DEN| 0.02846066233421178|
|LAX| 0.02757145710733077|
|PHX| 0.02469136002748761|
|LAS|  0.0233244439405244|
|IAH|0.020739376543622005|
|SFO| 0.01988898809777288|
|BOS|0.018799291422729483|
+---+--------------------+
only showing top 10 rows



### Query 4 –  Implement the PageRank algorithm natively on Spark using Graphframes

In [17]:
damping = 0.85
max_iter = 10
epsilon = 1e-4

# Pre-compute out-degrees for normalization.
out_degrees = flight_edges.groupBy("src").agg(F.count("dst").alias("out_degree"))

# Initialize PageRank values.
pranks = flight_vertices.select("id").withColumn("rank", F.lit(1.0))

for i in range(max_iter):
    # Calculate contributions from each vertex to its neighbors.
    edge_contribs = (flight_edges
                     .join(pranks, flight_edges.src == pranks.id)
                     .join(out_degrees, flight_edges.src == out_degrees.src)
                     .select(flight_edges.dst,
                             (F.col("rank") / F.col("out_degree")).alias("contrib")))
    
    # Sum contributions by destination vertex.
    summed_contribs = edge_contribs.groupBy("dst") \
                                   .agg(F.sum("contrib").alias("total_contrib"))
    
    # Apply the PageRank update rule.
    new_pranks = (flight_vertices
                  .join(summed_contribs, flight_vertices.id == summed_contribs.dst, "left")
                  .select(flight_vertices.id,
                          ((1 - damping) + damping * F.coalesce(F.col("total_contrib"), F.lit(0.0)))
                           .alias("new_rank")))
    
    # Check convergence.
    joined_ranks = pranks.join(new_pranks, "id")
    diff = joined_ranks.withColumn("diff", F.abs(F.col("rank") - F.col("new_rank")))
    max_diff_value = diff.agg(F.max("diff")).first()[0]
    print("Iteration", i, "max change:", max_diff_value)
    
    if max_diff_value < epsilon:
        break
        
    pranks = new_pranks.withColumnRenamed("new_rank", "rank")

print("PageRank Scores:")
pranks.orderBy(F.col("rank").desc()).show(10)


Iteration 0 max change: 37.871000762263726
Iteration 1 max change: 26.092607763304223
Iteration 2 max change: 9.466411593974177
Iteration 3 max change: 4.944223546565226
Iteration 4 max change: 2.2046881070030047
Iteration 5 max change: 0.998699902937723
Iteration 6 max change: 0.51081928935427
Iteration 7 max change: 0.19801791533111057
Iteration 8 max change: 0.12155779308112358
Iteration 9 max change: 0.03591800430051251
PageRank Scores:
+---+------------------+
| id|              rank|
+---+------------------+
|ATL| 18.90501041323751|
|ORD|12.927110422702702|
|DFW|11.735613068886071|
|DEN| 9.998483451764905|
|LAX| 7.726101869875372|
|IAH| 7.159305038124591|
|PHX| 7.065371562599747|
|SLC| 7.038754134167395|
|DTW| 7.019968536748612|
|SFO| 5.904248175402509|
+---+------------------+
only showing top 10 rows



In [None]:
# !!DO NOT RUN THIS CELL IT MIGHT CRASH SPARK SESSION
#Let's verify the above results with inbuilt function pageRank 
#g_pagerank = flight_graph.pageRank(resetProbability=0.15, maxIter=10)
#display(g_pagerank.vertices)
#g_pagerank.vertices.orderBy('pagerank', ascending=False).show(10)

### Query 5 – Finding the Group of the Most Connected Airports

In [17]:
# Create an Undirected Edges DataFrame 
# For connectivity, treat the graph as undirected by unioning edges with their reverse.
undirected_edges = (flight_edges.select("src", "dst")
                    .union(flight_edges.select(F.col("dst").alias("src"), F.col("src").alias("dst")))
                    .dropDuplicates())

print("Undirected edges sample:")
undirected_edges.show(5)

Undirected edges sample:
+---+---+
|src|dst|
+---+---+
|EWR|ATL|
|IAH|CVG|
|OKC|IAH|
|PIT|EWR|
|IAH|BHM|
+---+---+
only showing top 5 rows



In [18]:
# Initialize Vertices with Component Labels
# Initially, each vertex is its own component.
vertices_df = flight_vertices.withColumn("component", F.col("id"))

print("Initial vertices with component labels:")
vertices_df.show(5)


Initial vertices with component labels:
+---+---------+
| id|component|
+---+---------+
|GRR|      GRR|
|MFE|      MFE|
|DAL|      DAL|
|ICT|      ICT|
|AVL|      AVL|
+---+---------+
only showing top 5 rows



In [19]:
# Iterative Union-Find (Label Propagation)
max_iter = 10  
iteration = 0
converged = False

while not converged and iteration < max_iter:
    # For each vertex, get the minimum component label among its neighbors.
    # Join undirected_edges with vertices_df so we can retrieve neighbor's current component.
    neighbor_components = undirected_edges.join(
        vertices_df, undirected_edges.dst == vertices_df.id, "left"
    ).groupBy("src") \
     .agg(F.min("component").alias("min_neighbor_component"))
    
    # Now, for each vertex, update its component to the minimum of its current value and the minimum found among neighbors.
    updated_vertices = vertices_df.join(
        neighbor_components, vertices_df.id == neighbor_components.src, "left"
    ).withColumn(
        "new_component",
        F.when(F.col("min_neighbor_component").isNotNull(),
               F.least(F.col("component"), F.col("min_neighbor_component"))
              ).otherwise(F.col("component"))
    ).select("id", "new_component")
    
    # Check for convergence: count how many vertices change their component label.
    diff_count = updated_vertices.filter(F.col("new_component") != vertices_df.component).count()
    print("Iteration", iteration, "number of component updates:", diff_count)
    
    if diff_count == 0:
        converged = True
    else:
        # Prepare for next iteration: rename new_component to component.
        vertices_df = updated_vertices.withColumnRenamed("new_component", "component")
        iteration += 1

# Show the Connected Components

print("Final vertices with computed connected component labels (using custom union-find):")
vertices_df.orderBy("component").show(20, truncate=False)


Iteration 0 number of component updates: 246
Iteration 1 number of component updates: 283
Iteration 2 number of component updates: 96
Iteration 3 number of component updates: 4
Iteration 4 number of component updates: 0
Final vertices with computed connected component labels (using custom union-find):
+---+---------+
|id |component|
+---+---------+
|CLE|ABE      |
|MAF|ABE      |
|ORD|ABE      |
|DTW|ABE      |
|DCA|ABE      |
|LGA|ABE      |
|IAH|ABE      |
|DAY|ABE      |
|ATL|ABE      |
|PNS|ABE      |
|COS|ABE      |
|SDF|ABE      |
|LFT|ABE      |
|ROC|ABE      |
|LIT|ABE      |
|COD|ABE      |
|DFW|ABE      |
|GCC|ABE      |
|OMA|ABE      |
|KOA|ABE      |
+---+---------+
only showing top 20 rows



In [20]:
# Group vertices by component and count the number of vertices in each component.
components_group = vertices_df.groupBy("component").count().orderBy("count", ascending=False)
print("Connected component groups:")
components_group.show(truncate=False)

Connected component groups:
+---------+-----+
|component|count|
+---------+-----+
|ABE      |296  |
+---------+-----+

