In [41]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count
from pyspark.sql import functions as F

In [65]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, FloatType

In [5]:
# Reuse the active SparkSession if one exists; otherwise create a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [66]:
# Explicit schema used when reading the flights CSV.
# Each (column name, Spark type) pair becomes a nullable StructField,
# in the order listed here.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", StringType()),
        ("year", IntegerType()),
        ("month", StringType()),
        ("day", StringType()),
        ("dep_time", FloatType()),
        ("sched_dep_time", FloatType()),
        ("dep_delay", FloatType()),
        ("arr_time", FloatType()),
        ("sched_arr_time", FloatType()),
        ("arr_delay", FloatType()),
        ("carrier", StringType()),
        ("flight", StringType()),
        ("tailnum", StringType()),
        ("origin", StringType()),
        ("dest", StringType()),
        ("air_time", FloatType()),
        ("distance", FloatType()),
        ("hour", IntegerType()),
        ("minute", IntegerType()),
        ("time_hour", StringType()),
        ("name", StringType()),
    )
])

In [67]:
# Load the flights CSV: the first line is a header row, and column types come
# from the explicit `schema` instead of being inferred.
df = (
    spark.read
    .option("header", True)
    .schema(schema)
    .csv("/content/flights.csv")
)

In [68]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- dep_time: float (nullable = true)
 |-- sched_dep_time: float (nullable = true)
 |-- dep_delay: float (nullable = true)
 |-- arr_time: float (nullable = true)
 |-- sched_arr_time: float (nullable = true)
 |-- arr_delay: float (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: float (nullable = true)
 |-- distance: float (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- time_hour: string (nullable = true)
 |-- name: string (nullable = true)



In [69]:
# Preview the first 20 rows; long string values are truncated in the output.
df.show(n=20, truncate=True)

+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+--------------------+
| id|year|month|day|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|carrier|flight|tailnum|origin|dest|air_time|distance|hour|minute|          time_hour|                name|
+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+--------------------+
|  0|2013|    1|  1|   517.0|         515.0|      2.0|   830.0|         819.0|     11.0|     UA|  1545| N14228|   EWR| IAH|   227.0|  1400.0|   5|    15|2013-01-01 05:00:00|United Air Lines ...|
|  1|2013|    1|  1|   533.0|         529.0|      4.0|   850.0|         830.0|     20.0|     UA|  1714| N24211|   LGA| IAH|   227.0|  1416.0|   5|    29|2013-01-01 05:00:00|United Air Lines ...|
|  2|2013|    1|  1|   54

In [70]:
# Keep only the columns used in the rest of the analysis.
# NOTE: this rebinds `df`, so the full-width frame is no longer reachable
# under that name after this cell runs.
df = df.select(
    "id",
    "name",
    "year",
    "month",
    "day",
    "origin",
    "distance",
    "hour",
    "minute",
)

In [71]:
# Print the schema again to confirm which columns remain after the selection.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- distance: float (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)



In [20]:
# Preview the first 20 rows without truncating long string values.
df.show(n=20, truncate=False)

+---+------------------------+----+-----+---+------+--------+----+------+
|id |name                    |year|month|day|origin|distance|hour|minute|
+---+------------------------+----+-----+---+------+--------+----+------+
|0  |United Air Lines Inc.   |2013|1    |1  |EWR   |1400    |5   |15    |
|1  |United Air Lines Inc.   |2013|1    |1  |LGA   |1416    |5   |29    |
|2  |American Airlines Inc.  |2013|1    |1  |JFK   |1089    |5   |40    |
|3  |JetBlue Airways         |2013|1    |1  |JFK   |1576    |5   |45    |
|4  |Delta Air Lines Inc.    |2013|1    |1  |LGA   |762     |6   |0     |
|5  |United Air Lines Inc.   |2013|1    |1  |EWR   |719     |5   |58    |
|6  |JetBlue Airways         |2013|1    |1  |EWR   |1065    |6   |0     |
|7  |ExpressJet Airlines Inc.|2013|1    |1  |LGA   |229     |6   |0     |
|8  |JetBlue Airways         |2013|1    |1  |JFK   |944     |6   |0     |
|9  |American Airlines Inc.  |2013|1    |1  |LGA   |733     |6   |0     |
|10 |JetBlue Airways         |2013|1  

# Let's find the longest distance flown by each airline.

In [74]:
# Number of flights per airline, sorted so the airline with the most flights
# comes first. This only builds the DataFrame; nothing is displayed here.
airline_with_the_most_flights = (
    df
    .groupBy(F.col("name").alias("airline"))
    .agg(F.count("*").alias("count_of_flights_by_airline"))
    .orderBy(F.col("count_of_flights_by_airline").desc())
)

In [73]:
# Maximum flight distance per airline name, largest first.
# Use F.max (Spark's column aggregate): only `col` and `count` are imported by
# name, so a bare `max` resolves to Python's builtin, which cannot aggregate a
# Column and fails on a fresh kernel.
(
    df
    .groupBy("name")
    .agg(F.max(col("distance")).alias("max_distance"))
    .orderBy(col("max_distance").desc())
    .show(truncate=False)
)


+---------------------------+------------+
|name                       |max_distance|
+---------------------------+------------+
|Hawaiian Airlines Inc.     |4983.0      |
|United Air Lines Inc.      |4963.0      |
|Virgin America             |2586.0      |
|JetBlue Airways            |2586.0      |
|Delta Air Lines Inc.       |2586.0      |
|American Airlines Inc.     |2586.0      |
|Alaska Airlines Inc.       |2402.0      |
|US Airways Inc.            |2153.0      |
|Southwest Airlines Co.     |2133.0      |
|Frontier Airlines Inc.     |1620.0      |
|Endeavor Air Inc.          |1587.0      |
|ExpressJet Airlines Inc.   |1389.0      |
|Envoy Air                  |1147.0      |
|SkyWest Airlines Inc.      |1008.0      |
|AirTran Airways Corporation|762.0       |
|Mesa Airlines Inc.         |544.0       |
+---------------------------+------------+



In [79]:
# Maximum flight distance per airline (grouping column shown as "AirLine"),
# largest first, displayed with the default 20-row, truncated output.
# F.max is Spark's column aggregate; a bare `max` would be Python's builtin,
# since only `col` and `count` are imported by name, and it cannot aggregate
# a Column.
(
    df
    .groupBy(col("name").alias("AirLine"))
    .agg(F.max(col("distance")).alias("biggest_distance"))
    .orderBy(col("biggest_distance").desc())
    .show()
)

+--------------------+----------------+
|             AirLine|biggest_distance|
+--------------------+----------------+
|Hawaiian Airlines...|          4983.0|
|United Air Lines ...|          4963.0|
|      Virgin America|          2586.0|
|     JetBlue Airways|          2586.0|
|Delta Air Lines Inc.|          2586.0|
|American Airlines...|          2586.0|
|Alaska Airlines Inc.|          2402.0|
|     US Airways Inc.|          2153.0|
|Southwest Airline...|          2133.0|
|Frontier Airlines...|          1620.0|
|   Endeavor Air Inc.|          1587.0|
|ExpressJet Airlin...|          1389.0|
|           Envoy Air|          1147.0|
|SkyWest Airlines ...|          1008.0|
|AirTran Airways C...|           762.0|
|  Mesa Airlines Inc.|           544.0|
+--------------------+----------------+

