In [None]:
# test flight data from Mockaroo, limited to 1k rows
# Calculate number of flights by date
# Calculate most popular destination by number of flights to that airport
# What is the avg age of passenger
# What is the median ticket price
# give the top 5 nationalities of passengers who have traveled




In [28]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F


In [29]:
flights = "MOCK_DATA.json"

In [30]:
spark = SparkSession.builder.appName("sample_flight_data").getOrCreate()

In [32]:
flights_df = spark.read.format("json").option("inferSchema", "true").load(flights)
flights_df.count()           

1000

In [48]:
flights_df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- aircraft_type: string (nullable = true)
 |-- airline_name: string (nullable = true)
 |-- arrival_airport: string (nullable = true)
 |-- arrival_date: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- baggage_weight: double (nullable = true)
 |-- departure_airport: string (nullable = true)
 |-- departure_date: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- flight_duration: double (nullable = true)
 |-- flight_number: string (nullable = true)
 |-- flight_status: string (nullable = true)
 |-- passenger_age: long (nullable = true)
 |-- passenger_gender: string (nullable = true)
 |-- passenger_name: string (nullable = true)
 |-- passenger_nationality: string (nullable = true)
 |-- pilot_name: string (nullable = true)
 |-- seat_number: string (nullable = true)
 |-- ticket_price: double (nullable = true)
 |-- arrival_date_formatted: date (nullable = true)



In [47]:
# drop corrupt_record column
flights_df.drop("_corrupt_record")

DataFrame[aircraft_type: string, airline_name: string, arrival_airport: string, arrival_date: string, arrival_time: string, baggage_weight: double, departure_airport: string, departure_date: string, departure_time: string, flight_duration: double, flight_number: string, flight_status: string, passenger_age: bigint, passenger_gender: string, passenger_name: string, passenger_nationality: string, pilot_name: string, seat_number: string, ticket_price: double, arrival_date_formatted: date]

In [35]:
# flights_df = flights_df.withColumn("arrival_date_formatted", to_date(flights_df['arrival_date'], 'yyyy-MM-dd'))

In [50]:
check_date = flights_df.select("arrival_date")
check_date.show()

+------------+
|arrival_date|
+------------+
|   9/12/2022|
|   7/22/2022|
|   7/30/2022|
|   4/25/2023|
|   3/19/2023|
|    9/8/2022|
|   5/11/2023|
|    9/3/2022|
|   9/29/2022|
|   6/14/2022|
|   7/19/2022|
|    2/9/2023|
|   7/25/2022|
|   9/21/2022|
|  10/18/2022|
|   1/19/2023|
|   4/17/2023|
|   9/26/2022|
|   1/26/2023|
|   2/13/2023|
+------------+
only showing top 20 rows



In [36]:
def removeDuplicates(df):
    dup_count = df.count()
    no_dup_count = df.distinct().count()
    diff = dup_count - no_dup_count
    if diff:
        print('Duplicates present, dropping them. Count of duplicates is {diff}')
        return df.dropDuplicates()
    else:
        print('no duplicates')
        return df

In [37]:
removeDuplicates(flights_df)

no duplicates


DataFrame[_corrupt_record: string, aircraft_type: string, airline_name: string, arrival_airport: string, arrival_date: string, arrival_time: string, baggage_weight: double, departure_airport: string, departure_date: string, departure_time: string, flight_duration: double, flight_number: string, flight_status: string, passenger_age: bigint, passenger_gender: string, passenger_name: string, passenger_nationality: string, pilot_name: string, seat_number: string, ticket_price: double, arrival_date_formatted: date]

In [38]:
# drop rows where arrival_airport and arrival_date are NULL
columns = ['arrival_airport', 'departure_airport']
flights_df = flights_df.na.drop(subset=columns)

In [39]:
flights_df.count()

999

In [43]:
# create temp table
flights_df.createOrReplaceTempView("flights")

In [57]:
# Calculate number of flights by date
flights_by_date = spark.sql(
                 f"""
                SELECT arrival_date,
                COUNT(*) AS cnt_flights
                FROM flights
                WHERE arrival_airport != '0'
                GROUP BY 1
                ORDER BY 2 DESC
                """)
flights_by_date.show()

+------------+-----------+
|arrival_date|cnt_flights|
+------------+-----------+
|   7/25/2022|          8|
|   2/20/2023|          8|
|   2/26/2023|          8|
|   1/26/2023|          8|
|   2/28/2023|          8|
|    5/3/2023|          8|
|  11/24/2022|          7|
|   9/22/2022|          7|
|   7/28/2022|          7|
|    1/5/2023|          7|
|   4/23/2023|          6|
|  11/22/2022|          6|
|   3/23/2023|          6|
|   9/16/2022|          6|
|    9/4/2022|          6|
|  12/19/2022|          6|
|    4/9/2023|          6|
|   8/10/2022|          6|
|   1/21/2023|          6|
|    3/4/2023|          6|
+------------+-----------+
only showing top 20 rows



In [56]:
# Calculate most popular destination by number of flights to that airport
most_popular_destinations = spark.sql(
                 f"""
                SELECT arrival_airport,
                COUNT(*) AS cnt_flights
                FROM flights
                WHERE arrival_airport != '0'
                GROUP BY 1
                ORDER BY 2 DESC
                LIMIT 1
                """)
most_popular_destinations.show()

+---------------+-----------+
|arrival_airport|cnt_flights|
+---------------+-----------+
|            EWR|          3|
+---------------+-----------+



In [60]:
# # What is the avg age of passenger
avg_age = spark.sql(
                 f"""
                SELECT ROUND(AVG(passenger_age)) as avg_age
                FROM flights
                WHERE arrival_airport != '0'
                AND passenger_age IS NOT NULL
                """)
avg_age.show()

+-------+
|avg_age|
+-------+
|   60.0|
+-------+



In [61]:
# What is the median ticket price
median_price = spark.sql(
                 f"""
                SELECT ROUND(PERCENTILE_APPROX(ticket_price, 0.5)) as median_price
                FROM flights
                WHERE arrival_airport != '0'
                AND ticket_price IS NOT NULL
                """)
median_price.show()

+------------+
|median_price|
+------------+
|       984.0|
+------------+



In [62]:
# give the top 5 nationalities of passengers who have traveled
top_5_nationalities = spark.sql(
                 f"""
                SELECT 
                passenger_nationality,
                COUNT(*) AS flights_cnt
                FROM flights
                WHERE arrival_airport != '0'
                AND passenger_nationality IS NOT NULL
                GROUP BY 1
                ORDER BY 2 DESC
                LIMIT 5
                """)
top_5_nationalities.show()

+---------------------+-----------+
|passenger_nationality|flights_cnt|
+---------------------+-----------+
|                China|        177|
|            Indonesia|        105|
|          Philippines|         54|
|               Russia|         54|
|               Brazil|         41|
+---------------------+-----------+

