In [113]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, coalesce, concat, lit
from pyspark.sql import functions as F

In [114]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, FloatType

In [115]:
# Obtain the notebook's SparkSession: getOrCreate() reuses an active session
# when one exists and otherwise starts a new one with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [116]:
# Explicit schema for flights.csv, listed in file column order.
# Every column is nullable. Note that month and day are read as strings,
# while the time/delay/distance measures are read as floats.
schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in [
        ("id", StringType),
        ("year", IntegerType),
        ("month", StringType),
        ("day", StringType),
        ("dep_time", FloatType),
        ("sched_dep_time", FloatType),
        ("dep_delay", FloatType),
        ("arr_time", FloatType),
        ("sched_arr_time", FloatType),
        ("arr_delay", FloatType),
        ("carrier", StringType),
        ("flight", StringType),
        ("tailnum", StringType),
        ("origin", StringType),
        ("dest", StringType),
        ("air_time", FloatType),
        ("distance", FloatType),
        ("hour", IntegerType),
        ("minute", IntegerType),
        ("time_hour", StringType),
        ("name", StringType),
    ]
])

In [117]:
# Load the flights CSV using the explicit schema; the file's first line is a
# header row and is not treated as data.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/content/flights.csv")
)

In [118]:
# Confirm that the loaded frame carries the explicit schema defined above.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- dep_time: float (nullable = true)
 |-- sched_dep_time: float (nullable = true)
 |-- dep_delay: float (nullable = true)
 |-- arr_time: float (nullable = true)
 |-- sched_arr_time: float (nullable = true)
 |-- arr_delay: float (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: float (nullable = true)
 |-- distance: float (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- time_hour: string (nullable = true)
 |-- name: string (nullable = true)



In [119]:
# Preview the first rows of the raw flights data (long strings are elided with "...").
df.show()

+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+--------------------+
| id|year|month|day|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|carrier|flight|tailnum|origin|dest|air_time|distance|hour|minute|          time_hour|                name|
+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+--------------------+
|  0|2013|    1|  1|   517.0|         515.0|      2.0|   830.0|         819.0|     11.0|     UA|  1545| N14228|   EWR| IAH|   227.0|  1400.0|   5|    15|2013-01-01 05:00:00|United Air Lines ...|
|  1|2013|    1|  1|   533.0|         529.0|      4.0|   850.0|         830.0|     20.0|     UA|  1714| N24211|   LGA| IAH|   227.0|  1416.0|   5|    29|2013-01-01 05:00:00|United Air Lines ...|
|  2|2013|    1|  1|   54

In [120]:
# Narrow the frame to the columns used by the analysis below.
df = df.select(
    "id",
    "name",
    "year",
    "month",
    "day",
    "origin",
    "distance",
    "hour",
    "minute",
)

In [121]:
# Verify that only the selected columns remain, in the requested order.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- distance: float (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)



In [122]:
# truncate=False prints full cell values, so airline names are not elided with "...".
df.show(truncate=False)

+---+------------------------+----+-----+---+------+--------+----+------+
|id |name                    |year|month|day|origin|distance|hour|minute|
+---+------------------------+----+-----+---+------+--------+----+------+
|0  |United Air Lines Inc.   |2013|1    |1  |EWR   |1400.0  |5   |15    |
|1  |United Air Lines Inc.   |2013|1    |1  |LGA   |1416.0  |5   |29    |
|2  |American Airlines Inc.  |2013|1    |1  |JFK   |1089.0  |5   |40    |
|3  |JetBlue Airways         |2013|1    |1  |JFK   |1576.0  |5   |45    |
|4  |Delta Air Lines Inc.    |2013|1    |1  |LGA   |762.0   |6   |0     |
|5  |United Air Lines Inc.   |2013|1    |1  |EWR   |719.0   |5   |58    |
|6  |JetBlue Airways         |2013|1    |1  |EWR   |1065.0  |6   |0     |
|7  |ExpressJet Airlines Inc.|2013|1    |1  |LGA   |229.0   |6   |0     |
|8  |JetBlue Airways         |2013|1    |1  |JFK   |944.0   |6   |0     |
|9  |American Airlines Inc.  |2013|1    |1  |LGA   |733.0   |6   |0     |
|10 |JetBlue Airways         |2013|1  

# Let's find the airline with the most flights.

In [124]:
# Number of flights per airline, ordered from most to fewest flights.
# The result is a lazy DataFrame holding every airline (the busiest one is the
# first row); nothing is computed until an action such as .show() is called.
airline_with_the_most_flights = (
    df.groupBy(F.col("name").alias("airline"))
    .agg(F.count("*").alias("count_of_flights_by_airline"))
    .orderBy(F.desc("count_of_flights_by_airline"))
)

# Let's find the longest distance each airline's planes travel.

In [125]:
# Longest single-flight distance per airline, ordered from longest to shortest.
# F.max is the Spark SQL aggregate. A bare max() here would be Python's builtin,
# which tries to iterate over the Column and raises
# "TypeError: Column is not iterable" on a fresh kernel.
longest_distance = df.groupBy(
    col("name").alias("AirLine")
).agg(
    F.max(col("distance")).alias("longest_distance")
).orderBy(col("longest_distance").desc())

In [126]:
# Add minutes_total = hour * 60 + minute.
# NOTE(review): in the data preview, hour/minute mirror sched_dep_time
# (515.0 -> hour 5, minute 15), so this looks like a time of day expressed in
# minutes after midnight rather than a flight duration. Confirm before using
# it to rank the "longest" flights; air_time may be the intended column.
df = df.withColumn(
    "minutes_total",
    F.col("hour").cast("int") * 60 + F.col("minute").cast("int"),
)

# Let's find the flight that takes the most time.

In [127]:
# Preview the frame with the derived minutes_total column appended.
df.show()

+---+--------------------+----+-----+---+------+--------+----+------+-------------+
| id|                name|year|month|day|origin|distance|hour|minute|minutes_total|
+---+--------------------+----+-----+---+------+--------+----+------+-------------+
|  0|United Air Lines ...|2013|    1|  1|   EWR|  1400.0|   5|    15|          315|
|  1|United Air Lines ...|2013|    1|  1|   LGA|  1416.0|   5|    29|          329|
|  2|American Airlines...|2013|    1|  1|   JFK|  1089.0|   5|    40|          340|
|  3|     JetBlue Airways|2013|    1|  1|   JFK|  1576.0|   5|    45|          345|
|  4|Delta Air Lines Inc.|2013|    1|  1|   LGA|   762.0|   6|     0|          360|
|  5|United Air Lines ...|2013|    1|  1|   EWR|   719.0|   5|    58|          358|
|  6|     JetBlue Airways|2013|    1|  1|   EWR|  1065.0|   6|     0|          360|
|  7|ExpressJet Airlin...|2013|    1|  1|   LGA|   229.0|   6|     0|          360|
|  8|     JetBlue Airways|2013|    1|  1|   JFK|   944.0|   6|     0|       

# To find the flight route that takes the most time we need to add a new column (dest).

In [128]:
# Read the same flights CSV a second time (same schema and header handling)
# so that the dest column, dropped from df earlier, can be recovered.
df2 = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/content/flights.csv")
)

In [129]:
# Keep only the join key and the destination airport code.
df2 = df2.select("id", "dest")

In [130]:
# Preview the id -> dest lookup (show() prints the top 20 rows).
df2.show()

+---+----+
| id|dest|
+---+----+
|  0| IAH|
|  1| IAH|
|  2| MIA|
|  3| BQN|
|  4| ATL|
|  5| ORD|
|  6| FLL|
|  7| IAD|
|  8| MCO|
|  9| ORD|
| 10| PBI|
| 11| TPA|
| 12| LAX|
| 13| SFO|
| 14| DFW|
| 15| BOS|
| 16| LAS|
| 17| FLL|
| 18| ATL|
| 19| PBI|
+---+----+
only showing top 20 rows



In [131]:
# Attach dest to each flight by matching on id. With an inner join, rows
# whose id has no match on the other side are dropped.
# Re-running this cell joins again and adds a second dest column.
df = df.join(
    other=df2,
    on=["id"],
    how="inner",
)

In [132]:
# Preview the joined frame, which now carries the dest column.
df.show()

+---+--------------------+----+-----+---+------+--------+----+------+-------------+----+
| id|                name|year|month|day|origin|distance|hour|minute|minutes_total|dest|
+---+--------------------+----+-----+---+------+--------+----+------+-------------+----+
|  0|United Air Lines ...|2013|    1|  1|   EWR|  1400.0|   5|    15|          315| IAH|
|  1|United Air Lines ...|2013|    1|  1|   LGA|  1416.0|   5|    29|          329| IAH|
|  2|American Airlines...|2013|    1|  1|   JFK|  1089.0|   5|    40|          340| MIA|
|  3|     JetBlue Airways|2013|    1|  1|   JFK|  1576.0|   5|    45|          345| BQN|
|  4|Delta Air Lines Inc.|2013|    1|  1|   LGA|   762.0|   6|     0|          360| ATL|
|  5|United Air Lines ...|2013|    1|  1|   EWR|   719.0|   5|    58|          358| ORD|
|  6|     JetBlue Airways|2013|    1|  1|   EWR|  1065.0|   6|     0|          360| FLL|
|  7|ExpressJet Airlin...|2013|    1|  1|   LGA|   229.0|   6|     0|          360| IAD|
|  8|     JetBlue Air