In [1]:
import os

from pyspark import SparkConf, SparkContext

# Work from the assignment folder so the data file can be located via os.getcwd().
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
os.chdir('/Users/chkapsalis/Documents/GitHub/Big_Data_Architectures/Assignments/my_assignment_3')

# The author found that JAVA_HOME has to be set on every run for Spark to work,
# so it is assigned here, before the context is created.
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home"

# getOrCreate() keeps this cell re-runnable: constructing SparkContext(...) a second
# time in the same kernel raises "Cannot run multiple SparkContexts at once".
# Same master ("local[1]") and application name ("app") as before.
spark = SparkContext.getOrCreate(
    SparkConf().setMaster("local[1]").setAppName("app")
)


25/03/28 17:08:34 WARN Utils: Your hostname, ChristoorossAir resolves to a loopback address: 127.0.0.1; using 192.168.1.18 instead (on interface en0)
25/03/28 17:08:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/28 17:08:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Names of the 18 columns of the flights CSV (presumably in file order).
cols = [
    'Date', 'AirlineCode', 'Airline', 'ORG', 'DST',
    'DeptTime', 'DeptDelta', 'DeptDelay',
    'ArrTime', 'ArrDelta', 'ArrDelay',
    'Cancelled', 'Distance',
    'CarDelay', 'WeaDelay', 'NASDelay', 'SecDelay', 'LaADelay',
]

# We are not using Spark DataFrames, so the CSV's header line was removed from the file by hand before ingestion.

In [3]:
# First pass over the file: build the airline_id -> airline symbol mapping.
datafile = spark.textFile('file:///' + os.getcwd() + '/ontime_flights.csv')

# Each line is split on commas and reduced to an ('airline_id', 'airline_symbol') pair.
# The same pair appears once per flight, so distinct() is applied before collect()
# to retrieve only the unique key-value pairs.
airlines = (
    datafile
    .map(lambda line: line.split(','))
    .map(lambda fields: (fields[1], fields[2]))
    .distinct()
    .collect()
)

print(airlines)

[Stage 0:>                                                          (0 + 1) / 1]

[('19805', 'AA'), ('19930', 'AS'), ('20409', 'B6'), ('19790', 'DL'), ('20366', 'EV'), ('20436', 'F9'), ('20437', 'FL'), ('19690', 'HA'), ('20398', 'MQ'), ('20304', 'OO'), ('19977', 'UA'), ('20355', 'US'), ('21171', 'VX'), ('19393', 'WN')]


                                                                                

In [12]:
import datetime 

def parse(row):
    """Convert one comma-split line of the flights file into a typed tuple.

    Invalid lines are skipped rather than raising, so the function can be
    mapped over the whole file and the ``None`` results filtered out.

    Args:
        row: Sequence of string fields for one flight. Fields are read by
            position: 0 date (``YYYY-MM-DD``), 1 airline code, 2 airline
            symbol, 3 origin, 4 destination, 6 departure delta and
            9 arrival delta.

    Returns:
        ``(date, airline_code, airline_symbol, dept_delta, arr_delta, origin, dest)``
        with the date as ``datetime.datetime`` and the code/deltas as ``int``,
        or ``None`` when the row is too short or a field cannot be converted.
    """
    try:
        date = datetime.datetime.strptime(row[0], "%Y-%m-%d")
        airline_code = int(row[1])
        airline_symbol = row[2]
        origin = row[3]
        dest = row[4]
        # Column 6 is the departure delta and column 9 the arrival delta;
        # they stay at tuple positions 3 and 4, which callers index into.
        dept_delta = int(row[6])
        arr_delta = int(row[9])
    except (IndexError, TypeError, ValueError):
        # Only malformed data is swallowed; a bare `except:` would also hide
        # KeyboardInterrupt/SystemExit and unrelated programming errors.
        return None
    return (date, airline_code, airline_symbol, dept_delta, arr_delta, origin, dest)


In [13]:
# Ingest the flight data of interest: every line is split on commas, converted by
# parse() into (date, airline code, airline symbol, column 6, column 9, origin, dest),
# and lines that parse() rejected (returned None for) are dropped.
flights = (
    datafile
    .map(lambda full_line: full_line.split(','))
    .map(parse)
    .filter(lambda x: x is not None)
    # This RDD is reused by several later actions; without cache() each of them
    # would re-read and re-parse the whole CSV. Results are unchanged.
    .cache()
)

# Peeking into the resulting RDD
for el in flights.take(5):
    print(el)

(datetime.datetime(2014, 4, 1, 0, 0), 19805, 'AA', -6, 2, 'JFK', 'LAX')
(datetime.datetime(2014, 4, 1, 0, 0), 19805, 'AA', 14, -29, 'LAX', 'JFK')
(datetime.datetime(2014, 4, 1, 0, 0), 19805, 'AA', -6, 39, 'JFK', 'LAX')
(datetime.datetime(2014, 4, 1, 0, 0), 19805, 'AA', 25, -27, 'LAX', 'JFK')
(datetime.datetime(2014, 4, 1, 0, 0), 19805, 'AA', -5, 15, 'DFW', 'HNL')


In [14]:
# Question 1
#
# Steps:
#   1. reduce every flight to (airline symbol, sum of tuple positions 3 and 4),
#      i.e. the total delay of that particular flight
#   2. keep the largest total delay per airline
#   3. order the airlines by that maximum, largest first
#   4. take the first pair: the airline linked to the largest total delay observed
max_total_delay = (
    flights
    .map(lambda flight: (flight[2], flight[3] + flight[4]))
    .reduceByKey(max)
    .sortBy(lambda pair: pair[1], ascending=False)
    .take(1)  # a list holding at most one (airline, delay) pair
)

print(max_total_delay)

[Stage 13:>                                                         (0 + 1) / 1]

[('AA', 3489)]


                                                                                

In [15]:
# Question 2
# Keep only the departure delay of every flight (position 3 of the parsed tuple)
# and compute its mean over the whole dataset.
# NOTE(review): no origin filter is applied, so this is the average over ALL flights,
# not only those leaving JFK; the printed label now says so. If the JFK-only figure
# is what is wanted, add `.filter(lambda x: x[5] == "JFK")` before the map.
avg_overall_delay = (
    flights
    .map(lambda x: x[3])
    .mean()
)

print('Avg dept delay over all flights:', avg_overall_delay)

[Stage 15:>                                                         (0 + 1) / 1]

Avg dept delay out of JFK airport: 8.313877046894083


                                                                                

In [16]:
# Question 3
# Average total delay per day: every flight contributes (total delay, 1) under its
# date; the pairs are summed component-wise per date and the delay sum is divided
# by the flight count.
daily_avg_delays = (
    flights
    .map(lambda flight: (flight[0], (flight[3] + flight[4], 1)))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .mapValues(lambda total_and_count: total_and_count[0] / total_and_count[1])
)

print(daily_avg_delays.take(5))

[Stage 16:>                                                         (0 + 1) / 1]

[(datetime.datetime(2014, 4, 1, 0, 0), 8.307189948998632), (datetime.datetime(2014, 4, 2, 0, 0), 5.3777832150721805), (datetime.datetime(2014, 4, 3, 0, 0), 31.550911300121506), (datetime.datetime(2014, 4, 4, 0, 0), 24.078086549426803), (datetime.datetime(2014, 4, 5, 0, 0), 3.408638838475499)]


                                                                                

In [17]:
# Question 4
# Mean total delay (tuple positions 3 + 4) per flight, restricted to flights whose
# airline symbol is "AA".
avg_total_for_aa = (
    flights
    .filter(lambda flight: flight[2] == "AA")
    .map(lambda flight: flight[3] + flight[4])
    .mean()
)

print('Avg total delays for AA per flight:', avg_total_for_aa)

[Stage 18:>                                                         (0 + 1) / 1]

Avg total delays for AA per flight: 9.981389864989794


                                                                                

In [21]:
# Question 5
# Number of flights per day from JFK (origin, position 5) to LAX (destination,
# position 6): emit (date, 1) for each matching flight, sum the ones per date and
# order the result chronologically.
from_jfk_to_lax_daily = (
    flights
    .filter(lambda flight: flight[5] == "JFK" and flight[6] == "LAX")
    .map(lambda flight: (flight[0], 1))
    .reduceByKey(lambda count_a, count_b: count_a + count_b)
    .sortByKey()  # ascending is the default
)

print(from_jfk_to_lax_daily.take(5))

[Stage 24:>                                                         (0 + 1) / 1]

[(datetime.datetime(2014, 4, 1, 0, 0), 35), (datetime.datetime(2014, 4, 2, 0, 0), 34), (datetime.datetime(2014, 4, 3, 0, 0), 33), (datetime.datetime(2014, 4, 4, 0, 0), 34), (datetime.datetime(2014, 4, 5, 0, 0), 26)]


                                                                                