In [8]:
!pip3 install graphframes

Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
Installing collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7


In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql import SparkSession
from delta import *

import pyspark.sql.functions as F
import os
import time

## Spark Session

In [5]:
# Create SparkSession with Delta Lake support
# Prepare the Spark builder
builder = (
    SparkSession.builder.appName("project_3")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# The GraphFrames JAR is requested as an extra package next to the Delta setup.
spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"],
).getOrCreate()

# Match the shuffle partition count to the cluster's default parallelism.
# Use the public `sparkContext` accessor instead of the private `_sc` attribute.
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)  # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

# NOTE(review): kept after session creation as in the original cell; confirm
# whether this import can move up to the shared imports cell.
import graphframes as gf


## Dataset read

In [6]:
# Load the 2009 flights CSV: the first row holds the column names and
# Spark infers each column's type from the data.
flight_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/2009.csv")
)
display(flight_df)

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
2009-01-01,XE,1204,DCA,EWR,1100,1058.0,-2.0,18.0,1116.0,1158.0,8.0,1202,1206.0,4.0,0.0,,0.0,62.0,68.0,42.0,199.0,,,,,,
2009-01-01,XE,1206,EWR,IAD,1510,1509.0,-1.0,28.0,1537.0,1620.0,4.0,1632,1624.0,-8.0,0.0,,0.0,82.0,75.0,43.0,213.0,,,,,,
2009-01-01,XE,1207,EWR,DCA,1100,1059.0,-1.0,20.0,1119.0,1155.0,6.0,1210,1201.0,-9.0,0.0,,0.0,70.0,62.0,36.0,199.0,,,,,,
2009-01-01,XE,1208,DCA,EWR,1240,1249.0,9.0,10.0,1259.0,1336.0,9.0,1357,1345.0,-12.0,0.0,,0.0,77.0,56.0,37.0,199.0,,,,,,
2009-01-01,XE,1209,IAD,EWR,1715,1705.0,-10.0,24.0,1729.0,1809.0,13.0,1900,1822.0,-38.0,0.0,,0.0,105.0,77.0,40.0,213.0,,,,,,
2009-01-01,XE,1212,ATL,EWR,1915,1913.0,-2.0,19.0,1932.0,2108.0,15.0,2142,2123.0,-19.0,0.0,,0.0,147.0,130.0,96.0,745.0,,,,,,
2009-01-01,XE,1212,CLE,ATL,1645,1637.0,-8.0,12.0,1649.0,1820.0,5.0,1842,1825.0,-17.0,0.0,,0.0,117.0,108.0,91.0,554.0,,,,,,
2009-01-01,XE,1214,DCA,EWR,1915,1908.0,-7.0,9.0,1917.0,1953.0,34.0,2035,2027.0,-8.0,0.0,,0.0,80.0,79.0,36.0,199.0,,,,,,
2009-01-01,XE,1215,EWR,DCA,1715,1710.0,-5.0,28.0,1738.0,1819.0,4.0,1838,1823.0,-15.0,0.0,,0.0,83.0,73.0,41.0,199.0,,,,,,
2009-01-01,XE,1217,EWR,DCA,1300,1255.0,-5.0,15.0,1310.0,1349.0,7.0,1408,1356.0,-12.0,0.0,,0.0,68.0,61.0,39.0,199.0,,,,,,


In [7]:
# Inspect the column names and the types Spark inferred while reading the CSV.
flight_df.printSchema()

root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: double (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- TAXI_OUT: double (nullable = true)
 |-- WHEELS_OFF: double (nullable = true)
 |-- WHEELS_ON: double (nullable = true)
 |-- TAXI_IN: double (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- ARR_TIME: double (nullable = true)
 |-- ARR_DELAY: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- ACTUAL_ELAPSED_TIME: double (nullable = true)
 |-- AIR_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- CARRIER_DELAY: double (nullable = true)
 |-- WEATHER_DELAY: doub

## Filtering data

### Edges

In [8]:
# Edge list for the graph: one row per flight whose origin and destination
# are both present, with the endpoints renamed to src/dst.
has_both_airports = F.col("ORIGIN").isNotNull() & F.col("DEST").isNotNull()

flight_edgs_df = flight_df.where(has_both_airports).select(
    F.col("ORIGIN").alias("src"),
    F.col("DEST").alias("dst"),
    "FL_DATE",
    "CANCELLED",
    "ARR_TIME",
    "DISTANCE",
)
display(flight_edgs_df)

src,dst,FL_DATE,CANCELLED,ARR_TIME,DISTANCE
DCA,EWR,2009-01-01,0.0,1206.0,199.0
EWR,IAD,2009-01-01,0.0,1624.0,213.0
EWR,DCA,2009-01-01,0.0,1201.0,199.0
DCA,EWR,2009-01-01,0.0,1345.0,199.0
IAD,EWR,2009-01-01,0.0,1822.0,213.0
ATL,EWR,2009-01-01,0.0,2123.0,745.0
CLE,ATL,2009-01-01,0.0,1825.0,554.0
DCA,EWR,2009-01-01,0.0,2027.0,199.0
EWR,DCA,2009-01-01,0.0,1823.0,199.0
EWR,DCA,2009-01-01,0.0,1356.0,199.0


### Vertices

In [9]:
# Vertex list for the graph: every distinct airport code that appears as an
# origin or a destination of a flight with both endpoints present.
#
# The null filter is applied once, before either side of the union is built.
# Previously only the ORIGIN branch was filtered, so the DEST branch could
# contribute a null id or an airport that only occurs on rows dropped from
# the edge list.
valid_flights_df = flight_df.filter(
    F.column("ORIGIN").isNotNull() & F.column("DEST").isNotNull()
)

flight_vertex_df = (valid_flights_df
    .select(
        F.column("ORIGIN").alias("id"),
    )
    .union(valid_flights_df
        .select(
            F.column("DEST").alias("id"),
        )
    )
    .distinct()
)
display(flight_vertex_df)

id
JAN
JAX
ABQ
ORF
TUL
STL
GPT
SYR
CID
FAT


In [11]:
# Assemble the GraphFrame from the vertex and edge frames.
flight_graph = gf.GraphFrame(flight_vertex_df, flight_edgs_df)

# Mark both underlying frames for caching, since every query below reuses them.
for graph_input_df in (flight_vertex_df, flight_edgs_df):
    graph_input_df.cache()

display(flight_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 4 more fields])

In [13]:
# Show the vertex frame held by the graph.
display(flight_graph.vertices)

id
JAN
JAX
ABQ
ORF
TUL
STL
GPT
SYR
CID
FAT


In [17]:
# Show the edge frame held by the graph.
display(flight_graph.edges)

src,dst,FL_DATE,CANCELLED,ARR_TIME,DISTANCE
DCA,EWR,2009-01-01,0.0,1206.0,199.0
EWR,IAD,2009-01-01,0.0,1624.0,213.0
EWR,DCA,2009-01-01,0.0,1201.0,199.0
DCA,EWR,2009-01-01,0.0,1345.0,199.0
IAD,EWR,2009-01-01,0.0,1822.0,213.0
ATL,EWR,2009-01-01,0.0,2123.0,745.0
CLE,ATL,2009-01-01,0.0,1825.0,554.0
DCA,EWR,2009-01-01,0.0,2027.0,199.0
EWR,DCA,2009-01-01,0.0,1823.0,199.0
EWR,DCA,2009-01-01,0.0,1356.0,199.0


# QUERY 1

## Expected results

In [21]:
# Reference results from GraphFrames' built-in properties; the hand-written
# queries below are checked against these.
(
    expected_degrees_df,
    expected_inDegrees_df,
    expected_outDegrees_df,
    expected_triplets_df,
) = (
    flight_graph.degrees,
    flight_graph.inDegrees,
    flight_graph.outDegrees,
    flight_graph.triplets,
)

In [43]:
# Adapted from https://stackoverflow.com/questions/31197353/dataframe-equality-in-apache-spark
def are_dfs_equal(df1, df2):
    """Return True when two DataFrames have the same schema and the same rows.

    Rows are compared as a multiset: order is ignored but duplicates count.
    Spark does not guarantee row order after a shuffle, so comparing the raw
    ``collect()`` lists could report a difference for identical results.

    Both frames are collected to the driver, so this is only suitable for
    small results.

    Args:
        df1: First DataFrame (anything exposing ``.schema`` and ``.collect()``).
        df2: Second DataFrame to compare against ``df1``.

    Returns:
        bool: True if schema and row contents match; otherwise False, after
        printing which check failed.
    """
    if df1.schema != df2.schema:
        print("schema is not the same")
        return False

    # Sort by repr so the ordering is total even when a column holds None or
    # mixed types; equal rows have equal reprs and end up in the same position.
    rows1 = sorted(df1.collect(), key=repr)
    rows2 = sorted(df2.collect(), key=repr)
    if rows1 != rows2:
        print("data is not the same")
        return False
    return True

In [67]:
# Show the reference triplets frame taken from flight_graph.triplets.
# NOTE(review): the output recorded under this cell has `id,degree` columns,
# which looks like a degrees result rather than triplets; re-run to confirm.
display(expected_triplets_df)

id,degree
JAN,25058
JAX,57623
ABQ,71159
ORF,30483
TUL,41466
GPT,13176
STL,117393
SYR,18666
CID,18096
RKS,3531


## inDegrees

In [56]:
# In-degree per airport: match every edge by its destination vertex `a`,
# count the matches per vertex id and expose the count as an integer column.
in_edges_df = (
    flight_graph.find("()-[edge]->(a)")
    .groupBy("a.id")
    .count()
    .select(
        "id",
        F.col("count").cast(IntegerType()).alias("inDegree"),
    )
)

display(in_edges_df)

id,inDegree
JAX,28813
ABQ,35577
ORF,15245
JAN,12528
TUL,20731
GPT,6588
STL,58691
SYR,9330
RKS,1764
PWM,6510


In [57]:
# Check the motif-based in-degree counts against GraphFrames' built-in inDegrees.
are_dfs_equal(in_edges_df, expected_inDegrees_df)

True

## outDegrees

In [52]:
# Out-degree per airport: match every edge by its source vertex `a`,
# count the matches per vertex id and expose the count as an integer column.
outgoing_edges_df = (
    flight_graph.find("(a)-[edge]->()")
    .groupBy("a.id")
    .count()
    .select(
        "id",
        F.col("count").cast(IntegerType()).alias("outDegree"),
    )
)

display(outgoing_edges_df)

id,outDegree
JAN,12530
JAX,28810
ABQ,35582
ORF,15238
TUL,20735
STL,58702
GPT,6588
SYR,9336
CID,9049
FAT,12319


In [53]:
# Check the motif-based out-degree counts against GraphFrames' built-in outDegrees.
are_dfs_equal(outgoing_edges_df, expected_outDegrees_df)

True

## Degrees

In [69]:
# Total degree per airport = outDegree + inDegree.
#
# A full outer join keeps airports that appear on only one side (only
# departures or only arrivals); an inner join would silently drop them.
# The missing count on the other side is treated as 0.
degrees_df = (outgoing_edges_df
    .join(in_edges_df, "id", "outer")
    .withColumn(
        "degree",
        F.coalesce(F.col("outDegree"), F.lit(0)) + F.coalesce(F.col("inDegree"), F.lit(0)),
    )
    .select(F.col("id"), F.col("degree"))
)
display(degrees_df)

id,degree
JAX,57623
ABQ,71159
ORF,30483
JAN,25058
TUL,41466
GPT,13176
STL,117393
SYR,18666
RKS,3531
PWM,13018


In [80]:
# Row counts of the built-in degrees and the hand-computed degrees, side by side.
expected_degrees_df.count(), degrees_df.count()

(296, 296)

In [85]:
# List every airport whose hand-computed degree disagrees with the built-in one.
#
# A full outer join plus a null-safe comparison also reports ids that exist in
# only one of the two frames (their `diff` is null); an inner join would hide
# them. An empty result means both frames agree on every id.
letsCheck_df = (expected_degrees_df
    .withColumnRenamed("degree", "expected_degree")
    .join(degrees_df, "id", "outer")
    .withColumn("diff", F.col("expected_degree") - F.col("degree"))
    .filter(~F.col("expected_degree").eqNullSafe(F.col("degree")))
    .select(F.col("id"), F.col("diff"), F.col("expected_degree"), F.col("degree"))
)
display(letsCheck_df)

id,diff,expected_degree,degree


## Triplets