In [1]:
# Base directory holding the Lab 9 input files; change this single value to
# point the notebook at a different copy of the dataset.
dataDir = "/data/students/bigdata-01QYD/Lab9_DBD"

# Input CSV files located under dataDir.
airportsPath = dataDir + "/airports.csv"
airlinesPath = dataDir + "/airlines.csv"
routesPath = dataDir + "/routes.csv"
from graphframes import GraphFrame

# Step 1 - Create the graph of flight connections

In [2]:
# Read the airports CSV (first line is a header) and let Spark infer the
# column types. This DataFrame is later used as the graph's vertex set.
airports_vDF = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(airportsPath)
)

In [3]:
# Read the routes CSV (first line is a header) and let Spark infer the
# column types. After filtering, this DataFrame supplies the graph's edges.
routes_eDF = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(routesPath)
)

In [None]:
# Print the column names and inferred types of the airports DataFrame.
airports_vDF.printSchema()

In [None]:
# Print the column names and inferred types of the routes DataFrame.
routes_eDF.printSchema()

In [6]:
# Drop every route that has a null source/destination airport or a null
# source/destination airport id, then rename the two id columns to "src" and
# "dst" so the DataFrame can be used as a GraphFrame edge set.
routes_filtered_eDF = (
    routes_eDF
    .where(
        "airport_source IS NOT NULL AND "
        "airport_source_id IS NOT NULL AND airport_destination IS NOT NULL AND "
        "airport_destination_id IS NOT NULL"
    )
    .withColumnRenamed("airport_source_id", "src")
    .withColumnRenamed("airport_destination_id", "dst")
)

In [7]:
# Build the flight graph: airports are the vertices, the filtered routes the edges.
g = GraphFrame(airports_vDF, routes_filtered_eDF)

# STEP 2 - Analyze and process the graph

In [100]:
# Task 1
# Register the checkpoint directory on the Spark context once, ahead of the
# graph algorithms executed further down in the notebook.
sc.setCheckpointDir("checkpointDir_Lab9/")

# In-degree of every vertex, ordered from the highest to the lowest.
vertexesInDegreesDF = g.inDegrees.orderBy("inDegree", ascending=False)

In [29]:
# Collect the ten rows with the highest in-degree to the driver (with the
# "id" column renamed to "airportID") and wrap them in a small DataFrame.
firstTenDF = spark.createDataFrame(
    vertexesInDegreesDF.withColumnRenamed("id", "airportID").head(10)
)

In [31]:
# Display the ten airport ids with the highest in-degree.
firstTenDF.show()

+---------+--------+
|airportID|inDegree|
+---------+--------+
|     3682|     911|
|     3830|     550|
|     3364|     534|
|      507|     522|
|     1382|     517|
|     3484|     498|
|      340|     493|
|     3670|     467|
|     3797|     455|
|      580|     450|
+---------+--------+



In [33]:
# Attach the airport name to each of the ten airports with the highest
# in-degree.
# The DataFrame is assigned first and displayed in a separate statement:
# chaining .show() onto the assignment would store show()'s return value
# (None) in joinedDF instead of the DataFrame.
joinedDF = (
    firstTenDF
    .join(airports_vDF, firstTenDF.airportID == airports_vDF.id)
    .select("id", "name", "inDegree")
    # A join does not guarantee row order, so sort again to keep the ranking.
    .orderBy("inDegree", ascending=False)
)
joinedDF.show()

+----+--------------------+--------+
|  id|                name|inDegree|
+----+--------------------+--------+
|3682|Hartsfield Jackso...|     911|
|3830|Chicago O'Hare In...|     550|
|3364|Beijing Capital I...|     534|
| 507|London Heathrow A...|     522|
|1382|Charles de Gaulle...|     517|
|3484|Los Angeles Inter...|     498|
| 340|Frankfurt am Main...|     493|
|3670|Dallas Fort Worth...|     467|
|3797|John F Kennedy In...|     455|
| 580|Amsterdam Airport...|     450|
+----+--------------------+--------+



In [44]:
# Task 2
# The vertex "id" column and the edge "src"/"dst" columns are cast to string
# before any graph algorithm is applied; the graph is then rebuilt from the
# converted DataFrames.
airports_vDF = airports_vDF.withColumn("id", airports_vDF["id"].cast("string"))
routes_filtered_eDF = (
    routes_filtered_eDF
    .withColumn("src", routes_filtered_eDF["src"].cast("string"))
    .withColumn("dst", routes_filtered_eDF["dst"].cast("string"))
)
fixedGraph = GraphFrame(airports_vDF, routes_filtered_eDF)

In [49]:
# Motif matching every directed edge of the graph: the source vertex is bound
# to "Turin" and the destination vertex to "dest" (no filter applied yet).
motif1 = fixedGraph.find("(Turin)-[]->(dest)")

In [55]:
# Keep only the matches whose source vertex is 1526 (Turin Airport), i.e. the
# routes that leave Turin with a single flight.
turinDestinationsOneF = motif1.filter("Turin.id = 1526")

In [None]:
# Display a sample of the one-flight matches from Turin.
turinDestinationsOneF.show()

In [63]:
# Number of one-flight matches (one row per matching edge) leaving Turin.
turinDestinationsOneF.count()

44

In [103]:
# ALTERNATIVE: count the edges whose source is vertex 1526 directly on the
# edge DataFrame, without going through motif finding.
# (The closing parenthesis of print() was missing, which made the cell a
# SyntaxError.)
print(fixedGraph.edges.filter("src=1526").count())

44


In [104]:
# Two-edge chains Turin -> a1 -> a2 across the whole graph.
motif2 = fixedGraph.find("(Turin)-[]->(a1);(a1)-[]->(a2)")

# Restrict the chains to those starting at vertex 1526 (Turin) and ending at
# a vertex other than Turin itself; keep only the start and end vertices.
turinDestinationsTwoF = (
    motif2
    .where("Turin.id = 1526 AND a2.id <> 1526")
    .select("Turin", "a2")
)

In [105]:
# Display a sample of the two-flight (Turin, a2) pairs; rows may repeat
# because several intermediate airports can lead to the same destination.
turinDestinationsTwoF.show()

+--------------------+--------------------+
|               Turin|                  a2|
+--------------------+--------------------+
|[1526, Turin Airp...|[1741, Podgorica ...|
|[1526, Turin Airp...|[1688, Antalya In...|
|[1526, Turin Airp...|[1706, Adnan Mend...|
|[1526, Turin Airp...|[1226, Jerez Airp...|
|[1526, Turin Airp...|[1056, Tenerife S...|
|[1526, Turin Airp...|[1472, Diagoras A...|
|[1526, Turin Airp...|[3998, Palma De M...|
|[1526, Turin Airp...|[1054, Gran Canar...|
|[1526, Turin Airp...|[1458, Kos Airpor...|
|[1526, Turin Airp...|[1130, Hurghada I...|
|[1526, Turin Airp...|[1452, Heraklion ...|
|[1526, Turin Airp...|[1473, Araxos Air...|
|[1526, Turin Airp...|[1051, Fuertevent...|
|[1526, Turin Airp...|[1626, Faro Airpo...|
|[1526, Turin Airp...|[1715, Dalaman In...|
|[1526, Turin Airp...|[1460, Ioannis Ka...|
|[1526, Turin Airp...|[1688, Antalya In...|
|[1526, Turin Airp...|[4330, Imam Khome...|
|[1526, Turin Airp...|[1218, Barcelona ...|
|[1526, Turin Airp...|[3830, Chi

In [106]:
# Number of distinct end vertices (a2) among the two-flight chains from Turin.
turinDestinationsTwoF.select("a2.id").distinct().count()

589

In [69]:
# Three-edge chains Turin -> a1 -> a2 -> a3 across the whole graph.
motif3 = fixedGraph.find("(Turin)-[]->(a1);(a1)-[]->(a2);(a2)-[]->(a3)")

# Restrict the chains to those starting at vertex 1526 (Turin); keep only the
# start and end vertices.
# NOTE(review): unlike the two-flight query, this filter does not exclude
# chains that end back at Turin itself -- confirm that is intended.
turinDestinationsThreeF = (
    motif3
    .where("Turin.id = 1526")
    .select("Turin", "a3")
)

In [70]:
# Display a sample of the three-flight (Turin, a3) pairs; rows may repeat
# because different intermediate airports can lead to the same destination.
turinDestinationsThreeF.show()

+--------------------+--------------------+
|               Turin|                  a3|
+--------------------+--------------------+
|[1526, Turin Airp...|[1678, Zürich Air...|
|[1526, Turin Airp...|[1613, Vienna Int...|
|[1526, Turin Airp...|[1569, Ljubljana ...|
|[1526, Turin Airp...|[340, Frankfurt a...|
|[1526, Turin Airp...|[1555, Leonardo d...|
|[1526, Turin Airp...|[345, Düsseldorf ...|
|[1526, Turin Airp...|[4029, Domodedovo...|
|[1526, Turin Airp...|[1382, Charles de...|
|[1526, Turin Airp...|[1739, Belgrade N...|
|[1526, Turin Airp...|[1701, Atatürk In...|
|[1526, Turin Airp...|[1613, Vienna Int...|
|[1526, Turin Airp...|[1739, Belgrade N...|
|[1526, Turin Airp...|[1569, Ljubljana ...|
|[1526, Turin Airp...|[548, London Stan...|
|[1526, Turin Airp...|[304, Brussels So...|
|[1526, Turin Airp...|[1555, Leonardo d...|
|[1526, Turin Airp...|[478, Manchester ...|
|[1526, Turin Airp...|[492, London Luto...|
|[1526, Turin Airp...|[502, London Gatw...|
|[1526, Turin Airp...|[517, Leed

In [72]:
# Number of distinct end vertices (a3) among the three-flight chains from Turin.
turinDestinationsThreeF.select("a3.id").distinct().count()

2210

In [76]:
# Task 3
# Shortest-path computation with vertex '1526' (Turin) as the only landmark.
# The result carries a "distances" map column on every vertex row.
shPathsToTurin = fixedGraph.shortestPaths(landmarks=['1526'])

In [78]:
# Inspect the result schema: the vertex columns plus the "distances" map.
shPathsToTurin.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- iata: string (nullable = true)
 |-- icao: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- altitude: integer (nullable = true)
 |-- timezone: string (nullable = true)
 |-- DST: string (nullable = true)
 |-- tz_timezone: string (nullable = true)
 |-- type: string (nullable = true)
 |-- source: string (nullable = true)
 |-- distances: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)



In [86]:
# Extract, for every vertex, the entry of the "distances" map stored under
# key '1526' and expose it as a single column named numHops.
hops = shPathsToTurin.select(
    shPathsToTurin["distances"]["1526"].alias("numHops")
)

In [94]:
# Largest numHops value over all vertices, fetched as a single Row whose only
# field is named "max(numHops)".
maxHops = hops.groupBy().max("numHops").first()

In [93]:
# Display the Row holding the maximum number of hops.
maxHops

[Row(max(numHops)=8)]

In [98]:
# Farthest airport(s) from Turin: the vertices whose hop count for landmark
# '1526' equals the maximum found above, shown with name, city and country.
(
    shPathsToTurin
    .where("distances['1526'] =" + str(maxHops['max(numHops)']))
    .selectExpr("name", "city", "country", "distances['1526'] AS numHops")
    .show()
)

+-----------------+---------+-------+-------+
|             name|     city|country|numHops|
+-----------------+---------+-------+-------+
|Peawanuck Airport|Peawanuck| Canada|      8|
+-----------------+---------+-------+-------+



In [101]:
# TASK 4 - Connected Components
# Run connected components on the graph; the result is grouped by its
# "component" column in the next cell.
# NOTE(review): presumably relies on the checkpoint directory registered with
# sc.setCheckpointDir in the Task 1 cell -- confirm before moving that call.
connectedComponents = fixedGraph.connectedComponents()

In [110]:
# Number of connected components that contain at least 2 vertices:
# count the rows per component id, keep the groups of size >= 2, count them.
(
    connectedComponents
    .groupBy("component")
    .count()
    .where("count >= 2")
    .count()
)

7