In [1]:
# Input locations (wasb:// URIs on the sparklabdata container).
# AirportFilePath points at a single reference CSV; FlightFilePath is a glob
# pattern that is handed as-is to the CSV reader.
AirportFilePath = 'wasb://sparklabdata@sparkclusterlab.blob.core.windows.net/References/Airports.csv'
FlightFilePath = 'wasb://sparklabdata@sparkclusterlab.blob.core.windows.net/Flight/*/*.csv'

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3,application_1522764435524_0007,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf

def clean(x):
    """Normalise every field of an already-split row, in place.

    For each element: double quotes are removed, single quotes are replaced
    by a space, and leading/trailing whitespace is stripped.

    Args:
        x: mutable sequence of strings (e.g. the list produced by ``str.split``).

    Returns:
        The same object ``x``, with its elements rewritten.
    """
    for position, raw_field in enumerate(x):
        unquoted = raw_field.replace('"', '')
        x[position] = unquoted.replace("'", ' ').strip()
    return x

In [3]:
# RDD creation
import csv  # stdlib; imported in this cell because only this cell needs it


def parse_airport_line(line):
    """Split one line of the airports file into its fields.

    Uses the csv module instead of ``line.split(",")`` so that a comma inside
    a double-quoted field (e.g. an airport name) does not shift every
    following column by one.

    Args:
        line: a single text line from the airports file.

    Returns:
        list of str: the fields of the line; an empty list for an empty line.
    """
    return next(csv.reader([line]), [])


# Read the file as an RDD with one element per text line.
airportData = sc.textFile(AirportFilePath)

# Parse each line, normalise the fields with clean(), then keep only rows
# whose 4th field is 'United States' and whose 5th field is not the literal
# \N placeholder. (Those positions are named "Country" and "IATA" by the
# schema defined later in the notebook.)
# The length guard skips blank or truncated lines, which would otherwise
# raise IndexError inside the filter.
USAirportDataFinal = (
    airportData
    .map(parse_airport_line)
    .map(clean)
    .filter(lambda c: len(c) > 4 and c[3] == 'United States' and c[4] != '\\N')
)

In [4]:
# One StructField per column of the airports file, in file order.
# Every column is declared as StringType with the third argument set to True.
airportDataFields = [
    StructField(column_name, StringType(), True)
    for column_name in (
        "AirportId",
        "Name",
        "City",
        "Country",
        "IATA",
        "ICAO",
        "Latitude",
        "Longitude",
        "Altitude",
        "Timezone",
        "DST",
        "TzDatabase",
        "Type",
        "Source",
    )
]

# Wrap the field list in a StructType so it can be applied to the RDD
airportDataSchema = StructType(airportDataFields)

In [5]:
# Build the airports DataFrame from the RDD, applying the explicit schema
airportData_DataFrame = USAirportDataFinal.toDF(airportDataSchema)

# Load the flight CSV files matched by FlightFilePath; the first line of each
# file is treated as the header row
flight_df = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .options(header='true')
    .load(FlightFilePath)
)

In [7]:
## Expose both DataFrames to Spark SQL as temporary views
## (view name -> DataFrame; registered in the order listed)
for view_name, frame in (
    ("airports_na", airportData_DataFrame),
    ("departureDelays", flight_df),
):
    frame.createOrReplaceTempView(view_name)

In [9]:
# Aggregate the flights per (origin airport, destination airport) pair.
# departureDelays is inner-joined to airports_na twice on the IATA code, once
# for the origin (alias O) and once for the destination (alias Dest). Flights
# whose origin or destination has no match in airports_na are therefore not
# counted.
# Per pair the query returns the state/city/airport code of both ends, their
# latitude/longitude cast to double, the number of flights, and the average
# DEP_DELAY and ARR_DELAY.
# The SQL text is built from adjacent string literals; each literal keeps its
# own trailing space so the concatenated statement stays well-formed.
airport_traffic = sqlContext.sql(
    "SELECT "
    "ORIGIN_STATE_NM as origin_state, "
    "ORIGIN_CITY_NAME as origin_city, "
    "ORIGIN as origin_airport, "
    "cast(O.Latitude as double) as origin_latitude,"
    "cast(O.Longitude as double) as origin_longitude, "
    "DEST_STATE_NM as destination_state, "
    "DEST_CITY_NAME as destination_city, "
    "DEST as destination_airport, "
    "cast(Dest.Latitude as double) as dest_latitude, "
    "cast(Dest.Longitude as double) as dest_longitude, "
    "COUNT(*) as FlightCount, AVG(DEP_DELAY) as dep_delay, "
    "AVG(ARR_DELAY) as arr_delay "
    "FROM departureDelays D "
    "JOIN airports_na O ON D.ORIGIN = O.IATA "
    "JOIN airports_na Dest ON D.DEST = Dest.IATA "
    "GROUP BY ORIGIN_STATE_NM, ORIGIN_CITY_NAME, ORIGIN, O.Latitude, O.Longitude, DEST_CITY_NAME, DEST, "
    "DEST_STATE_NM, Dest.Latitude, Dest.Longitude"
)

# Persist the result as the table 'airports_traffic'.
# mode("overwrite") makes this cell re-runnable: with the default save mode the
# write raises an error as soon as the table exists, i.e. on every run after
# the first one. NOTE: an existing 'airports_traffic' table is replaced.
airport_traffic.write.mode("overwrite").saveAsTable('airports_traffic')