In [76]:
"""
TODO List:
[] Arrange delayed flights and delayed time by arrival/departure hub
"""

'\nTODO List:\n[] Find the same flight schedule and see if they are similarly delayed\n[] Arrange delayed flights and delayed time by arrival/departure hub\n'

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType
from datetime import datetime

def convert_seconds(seconds):
    """Format a duration given in seconds as a human-readable string.

    The duration is first wrapped to a single day (modulo 24 hours), then
    broken down into hours, minutes and seconds. Leading units that are zero
    are left out of the result:

    * "H hours M minutes S seconds" when the hour part is non-zero,
    * "M minutes S seconds" when only the minute part is non-zero,
    * "S seconds" otherwise.
    """
    within_day = seconds % (24 * 3600)
    hour, remainder = divmod(within_day, 3600)
    minutes, secs = divmod(remainder, 60)

    if hour:
        return "%d hours %d minutes %d seconds" % (hour, minutes, secs)
    if minutes:
        return "%d minutes %d seconds" % (minutes, secs)
    return "%d seconds" % (secs)

# determine if delayed or not
@udf(returnType = StringType())
def determine_delay(estimated_time, actual_time):
    """Tell whether a flight arrived later than its estimated arrival time.

    Both arguments are timestamp strings that are read by fixed character
    positions: year [0:4], month [5:7], day [8:10], hour [11:13],
    minute [14:16] and second [17:19].

    Returns:
        'YES' when the actual arrival is strictly later than the estimate,
        'NO' when it is on time or early, and None when either timestamp
        is missing.
    """
    if estimated_time is None or actual_time is None:
        return None

    def _fields(stamp):
        # (year, month, day, hour, minute, second) as integers; tuples of
        # these compare in chronological order.
        positions = ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19))
        return tuple(int(stamp[start:end]) for start, end in positions)

    # Compare the complete date and time. Looking only at the day of month
    # misclassifies arrivals that cross a month or year boundary (e.g.
    # estimated on the 31st, actual on the 1st of the next month).
    if _fields(actual_time) > _fields(estimated_time):
        delay = 'YES'
    else:
        delay = 'NO'

    return delay

# calculate delayed time(sec)
@udf(returnType = StringType())
def calculate_delay(estimated_time, actual_time):
    """Return the gap between estimated and actual arrival as readable text.

    Both arguments are timestamp strings that are read by fixed character
    positions: year [0:4], month [5:7], day [8:10], hour [11:13],
    minute [14:16] and second [17:19].

    The absolute difference between the two instants is computed in seconds
    and formatted with convert_seconds(), which wraps the value to a single
    day (modulo 24 hours). Returns None when either timestamp is missing.
    """
    if estimated_time is None or actual_time is None:
        return None

    def _absolute_seconds(stamp):
        # Calendar day number (proleptic Gregorian ordinal) so that month and
        # year rollovers are handled; a plain day-of-month comparison gets
        # e.g. "Oct 31 23:50 -> Nov 1 00:10" wrong.
        day_number = datetime(int(stamp[0:4]), int(stamp[5:7]), int(stamp[8:10])).toordinal()
        seconds_of_day = int(stamp[11:13]) * 3600 + int(stamp[14:16]) * 60 + int(stamp[17:19])
        return day_number * 24 * 3600 + seconds_of_day

    delay_time = abs(_absolute_seconds(estimated_time) - _absolute_seconds(actual_time))
    return convert_seconds(delay_time)

@udf(returnType = StringType())
def modify_time(time):
    """Reformat a timestamp string as 'MM/DD/YY hh:MM:SS AM' or '... PM'.

    The input is read by fixed character positions: year [0:4], month [5:7],
    day [8:10], hour [11:13]; characters [13:19] (':MM:SS') are copied as-is.

    The hour is converted from 24-hour to 12-hour notation: 00 becomes
    12 AM, 01-11 stay AM, 12 is 12 PM and 13-23 become 01-11 PM. The year is
    reduced to its last two digits. Returns None when the timestamp is
    missing.
    """
    if time is None:
        return None

    # Two-digit, zero-padded year (2022 -> "22", 2005 -> "05").
    year = "%02d" % (int(time[0:4]) % 100)
    month = time[5:7]
    day = time[8:10]

    # Work with the hour as an integer: subtracting 12 from the raw string
    # slice raises TypeError for every afternoon timestamp.
    hour_24 = int(time[11:13])
    day_night = "PM" if hour_24 >= 12 else "AM"
    hour_12 = hour_24 % 12 or 12
    hour = "%02d" % hour_12

    return month + "/" + day + "/" + year + " " + hour + time[13:19] + " " + day_night

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType,IntegerType
from pyspark.sql.functions import collect_list,split,regexp_replace,col,round,concat,lit,avg,when,length
spark = SparkSession.builder.master("local[*]").appName('flight-data').getOrCreate()

# Accumulates the delayed flights of every processed file; stays None until
# the first file has been handled.
delayed_flight = None

# for all jsonl file
for i in range(0,1):
    # File names carry a zero-padded, three-digit index (flights_000.jsonl,
    # flights_012.jsonl, flights_123.jsonl, ...).
    file_idx = '%03d' % i
    file_loc = '/home/ubuntu/cs179g_project/part1/flights/flights_' + file_idx +'.jsonl'

    flight_data = spark.read.format("json").options(inferschema = 'true',header = 'true').load(file_loc)
    flight_data = flight_data.select(
        '@acid', '@airline', '@arrArpt', '@depArpt',
        'fdm:trackInformation.nxcm:ncsmTrackData.nxcm:departureFixAndTime.@arrTime',
        'fdm:trackInformation.nxcm:ncsmTrackData.nxcm:eta.@timeValue',
        'fdm:trackInformation.nxcm:ncsmTrackData.nxcm:arrivalFixAndTime.@arrTime',
    )

    # renames the table columns (positionally, in the order selected above)
    new_column_names = ["Flight Number", "Airline", "Arrival Airport", "Departure Airport", "Departure Time", "Estimated Arrival Time", "Actual Arrival Time"]
    flight_data = flight_data.toDF(*new_column_names)
    # removing invalid departure and arrival times, null values
    flight_data = flight_data.na.drop()
    # removes placeholder letter in front of airport code, if it exists and delete duplicates
    flight_data = (
        flight_data
        .withColumn('Arrival Airport',
                    when(length(col('Arrival Airport')) == 4, col('Arrival Airport').substr(2,3))
                    .otherwise(col('Arrival Airport')))
        .withColumn('Departure Airport',
                    when(length(col('Departure Airport')) == 4, col('Departure Airport').substr(2,3))
                    .otherwise(col('Departure Airport')))
        .distinct()
    )

    # The UDFs already return Column expressions, so no lit() wrapper is needed.
    flight_data = flight_data.withColumn('Delay', determine_delay(col('Estimated Arrival Time'), col('Actual Arrival Time')))
    fd = flight_data.withColumn('Delayed Time', calculate_delay(col('Estimated Arrival Time'), col('Actual Arrival Time')))

    # Reformat the timestamps only after the delay columns were derived from
    # the raw values.
    fd = fd.withColumn('Departure Time', modify_time(col('Departure Time')))
    fd = fd.withColumn('Estimated Arrival Time', modify_time(col('Estimated Arrival Time')))
    fd = fd.withColumn('Actual Arrival Time', modify_time(col('Actual Arrival Time')))

    delayed_in_file = fd.filter(fd.Delay == "YES")

    # combine every delayed flight of jsonl files; the first file seeds the
    # result instead of being unioned with itself (which would duplicate it)
    if delayed_flight is None:
        delayed_flight = delayed_in_file
    else:
        delayed_flight = delayed_flight.union(delayed_in_file)

    # print(str(i)+".jsonl finished")
delayed_flight.show(10)




+-------------+-------+---------------+-----------------+--------------------+----------------------+--------------------+-----+--------------------+
|Flight Number|Airline|Arrival Airport|Departure Airport|      Departure Time|Estimated Arrival Time| Actual Arrival Time|Delay|        Delayed Time|
+-------------+-------+---------------+-----------------+--------------------+----------------------+--------------------+-----+--------------------+
|      AAL2308|    AAL|            ORD|              LAX|10/22/22 08:30:00 AM|  10/22/22 11:42:24 AM|10/22/22 11:43:19 AM|  YES|          55 seconds|
|       DAL850|    DAL|            RDU|              LAS|10/22/22 05:55:00 AM|  10/22/22 09:18:16 AM|10/22/22 09:23:20 AM|  YES| 5 minutes 4 seconds|
|       DAL881|    DAL|            CVG|              LAX|10/22/22 07:38:00 AM|  10/22/22 10:49:08 AM|10/22/22 10:58:54 AM|  YES|9 minutes 46 seconds|
|      ROU1850|    ROU|            YUL|              LAS|10/22/22 06:49:56 AM|  10/22/22 10:58:35 AM

                                                                                