In [76]:
"""
TODO List:
[] Arrange delayed flights and delay times by arrival/departure hub
"""

'\nTODO List:\n[] Find the same flight schedule and see if they are similarly delayed\n[] Arrange delayed flights and delayed time by arrival/departure hub\n'

In [16]:
from datetime import datetime, timezone
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType,IntegerType
from pyspark.sql.functions import UserDefinedFunction, to_timestamp
from pyspark.sql.functions import collect_list,split,regexp_replace,col,round,concat,lit,avg,when,length,abs
from pyspark.sql.functions import udf

# Formats a timestamp string into a more readable 12-hour form.
@udf(returnType = StringType())
def modify_time(time):
    """Reformat a timestamp string as ``MM/DD/YY hh:mm:ss AM|PM``.

    The input is sliced positionally: characters 0-3 are the year, 5-6 the
    month, 8-9 the day, 11-12 the 24-hour hour and 13-18 the ``:MM:SS`` tail
    (i.e. a layout like ``YYYY-MM-DDTHH:MM:SS...``). No time-zone conversion
    is performed.

    Args:
        time: timestamp string in the layout above, or None.

    Returns:
        The reformatted string, or None when the input is None or empty so a
        stray null cannot fail the whole Spark job.
    """
    if not time:
        return None

    year = time[2:4]  # two-digit year, keeps the leading zero (e.g. "05")
    month = time[5:7]
    day = time[8:10]
    hour_24 = int(time[11:13])

    # 12-hour clock: 00:xx is 12 AM and 12:xx is 12 PM.
    day_night = "AM" if hour_24 < 12 else "PM"
    hour_12 = hour_24 % 12
    if hour_12 == 0:
        hour_12 = 12
    # Zero-pad so AM and PM hours have the same width.
    hour = str(hour_12).zfill(2)

    return month + "/" + day + "/" + year + " " + hour + time[13:19] + " " + day_night



In [21]:

# Build (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('flight-data')
    # Adds the jar to the driver classpath (presumably the PostgreSQL JDBC
    # driver, judging by the file name).
    .config("spark.driver.extraClassPath", "/home/ubuntu/postgresql-42.5.0.jar")
    .getOrCreate()
)
# Read every flight file in the directory. For a quicker trial run, point the
# loader at a single file such as '../part1/flights/flights_000.jsonl'.
raw_flights = spark.read.format("json").options(inferschema='true',header='true').load('../part1/flights/')

# Source fields to keep, listed in the same order as the display names below.
source_fields = [
    '@acid',
    '@airline',
    '@arrArpt',
    '@depArpt',
    'fdm:trackInformation.nxcm:qualifiedAircraftId.nxce:igtd',
    'fdm:trackInformation.nxcm:ncsmTrackData.nxcm:eta.@timeValue',
    'fdm:trackInformation.nxcm:ncsmTrackData.nxcm:arrivalFixAndTime.@arrTime',
]

# Human-readable column names applied positionally to the selected fields.
new_column_names = [
    "Flight Number",
    "Airline",
    "Arrival Airport",
    "Departure Airport",
    "Departure Time",
    "Scheduled Arrival Time",
    "Actual Arrival Time",
]

# Select, rename, then discard any row that has a null in any column
# (this is what removes flights with missing departure/arrival times).
flight_data = raw_flights.select(*source_fields).toDF(*new_column_names).na.drop()

# Arrival timestamps as epoch seconds. These are built from the raw columns
# before the chain below reformats them into display strings.
scheduled_secs = to_timestamp(flight_data["Scheduled Arrival Time"]).cast('long')
actual_secs = to_timestamp(flight_data["Actual Arrival Time"]).cast('long')
arrived_late = scheduled_secs < actual_secs


def strip_airport_prefix(column_name):
    """Return a column that drops the first character of 4-character airport codes.

    Codes of any other length are passed through unchanged.
    """
    code = col(column_name)
    return when(length(code) == 4, code.substr(2, 3)).otherwise(code)


flight_data = (
    flight_data
    # removes placeholder letter in front of airport code, if it exists
    .withColumn('Arrival Airport', strip_airport_prefix('Arrival Airport'))
    .withColumn('Departure Airport', strip_airport_prefix('Departure Airport'))
    # A flight counts as delayed when it arrives after its scheduled arrival.
    .withColumn('Delayed', arrived_late)
    # Delay in seconds for late flights; the literal "N/A" for all others.
    .withColumn('Delay Time', when(arrived_late, abs(scheduled_secs - actual_secs)).otherwise("N/A"))
    # Reformat the three time columns for display (done last, after the
    # delay columns have been derived from the raw values).
    .withColumn('Departure Time', modify_time(col('Departure Time')))
    .withColumn('Scheduled Arrival Time', modify_time(col('Scheduled Arrival Time')))
    .withColumn('Actual Arrival Time', modify_time(col('Actual Arrival Time')))
)


# combine with ticket price data
fares = spark.read.parquet('./ticket_fares/output.parquet/')

# The per-itinerary columns are discarded after the join anyway, so drop them
# first and collapse the repeated route rows; this keeps the join small
# without changing the final result.
route_fares = fares.drop('ItinID', 'Passengers', 'ItinFare').dropDuplicates()

# Match each flight to the fare route flown in the same direction:
# the flight's departure airport is the fare's Origin and its arrival airport
# is the fare's Dest. (The previous version had these two crossed, pairing
# every flight with the fare for the reverse route.)
# NOTE(review): assumes fares.Origin/Dest follow the usual origin -> destination
# meaning - confirm against how output.parquet was produced.
flight_data = (
    flight_data
    .join(
        route_fares,
        (flight_data.Airline == route_fares.Airline)
        & (flight_data['Departure Airport'] == route_fares.Origin)
        & (flight_data['Arrival Airport'] == route_fares.Dest),
    )
    # Keep a single Airline column and remove the now-redundant join keys.
    .drop(route_fares.Airline)
    .drop('Origin', 'Dest')
    .dropDuplicates()
)


                                                                                

In [22]:
# Preview the first 10 joined rows.
flight_data.show(n=10)



+-------------+-------+---------------+-----------------+--------------------+----------------------+--------------------+-------+----------+--------+
|Flight Number|Airline|Arrival Airport|Departure Airport|      Departure Time|Scheduled Arrival Time| Actual Arrival Time|Delayed|Delay Time|Distance|
+-------------+-------+---------------+-----------------+--------------------+----------------------+--------------------+-------+----------+--------+
|       ASA790|    ASA|            EWR|              SEA|10/22/22 04:50:00 AM|  10/22/22 09:50:30 AM|10/22/22 09:32:10 AM|  false|       N/A|  2402.0|
|      DAL2077|    DAL|            ATL|              BOI|10/22/22 05:58:00 AM|  10/22/22 09:51:55 AM|10/22/22 09:34:24 AM|  false|       N/A|  1838.0|
|       ASA315|    ASA|            DFW|              SEA|10/22/22 06:20:00 AM|  10/22/22 09:43:51 AM|10/22/22 09:30:41 AM|  false|       N/A|  1660.0|
|       DAL881|    DAL|            CVG|              LAX|10/22/22 06:50:00 AM|  10/22/22 10:48

                                                                                