In [1]:
!pip install pyspark
!pip install findspark



In [1]:
import findspark
findspark.init()
import re
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import Window
from pyspark.sql.functions import rank
from datetime import datetime

In [2]:
# Expressoes regulares comuns
REGEX_ALPHA    = r'[a-zA-Z]+'
REGEX_INTEGER  = r'[0-9]+'
REGEX_FLOAT    = r'[0-9]+\.[0-9]+'
REGEX_ALPHANUM = r'[0-9a-zA-Z]+'
REGEX_EMPTY_STR= r'[\t ]+$'
REGEX_SPECIAL  = r'[!@#$%&*\(\)_]+'
REGEX_NNUMBER  = r'^N[1-9][0-9]{2,3}([ABCDEFGHJKLMNPRSTUVXWYZ]{1,2})'
REGEX_NNUMBER_INVALID = r'(N0.*$)|(.*[IO].*)'
REGEX_TIME_FMT = r'^(([0-1]?[0-9])|(2[0-3]))([0-5][0-9])$'

In [3]:
# Variaveis de data
current_year = datetime.today().strftime('%Y')

In [4]:
# Criar o contexto do spark
sc = SparkContext()

# Instancia o criador de sessao do spark
spark = (SparkSession.builder
                     .master("local[7]")
                     .appName("Aceleração PySpark - Capgemini"))

In [5]:
schema_airports = StructType([
    StructField("faa",  StringType(),  True),
    StructField("name", StringType(),  True),
    StructField("lat",  FloatType(),   True),
    StructField("lon",  FloatType(),   True),
    StructField("alt",  IntegerType(), True),
    StructField("tz",   IntegerType(), True),
    StructField("dst",  StringType(),  True)
])

schema_planes = StructType([
    StructField("tailnum",      StringType(),  True),
    StructField("year",         IntegerType(), True),
    StructField("type",         StringType(),  True),
    StructField("manufacturer", StringType(),  True),
    StructField("model",        StringType(),  True),
    StructField("engines",      IntegerType(), True),
    StructField("seats",        IntegerType(), True),
    StructField("speed",        IntegerType(), True),
    StructField("engine",       StringType(),  True)
])

schema_flights = StructType([
    StructField("year",      IntegerType(), True),
    StructField("month",     IntegerType(), True),
    StructField("day",       IntegerType(), True),
    StructField("dep_time",  StringType(),  True),
    StructField("dep_delay", IntegerType(), True),
    StructField("arr_time",  StringType(),  True),
    StructField("arr_delay", IntegerType(), True),
    StructField("carrier",   StringType(),  True),
    StructField("tailnum",   StringType(),  True),
    StructField("flight",    StringType(),  True),
    StructField("origin",    StringType(),  True),
    StructField("dest",      StringType(),  True),
    StructField("air_time",  IntegerType(), True),
    StructField("distance",  IntegerType(), True),
    StructField("hour",      IntegerType(), True),
    StructField("minute",    IntegerType(), True),
])

In [6]:
df_airports = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_airports)
                  .load("Datasets/airports.csv"))

df_planes = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_planes)
                  .load("Datasets/planes.csv"))

df_flights = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_flights)
                  .load("Datasets/flights.csv"))

In [7]:
df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4|  A|
+---+--------------------+---------+---------+----+---+---+
only showing top 5 rows



## Airports Dataset

### Pergunta 1

In [21]:
df_airports = df_airports.select(
                            'faa', 
                            'name', 
                            'lat', 
                            'lon', 
                            (F.when(
                                F.col('alt') < 0, 0)
                                 .otherwise(F.col('alt'))
                                 .alias('alt')), 
                            'tz', 
                            'dst')

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4| -4|
+---+--------------------+---------+---------+----+---+---+
only showing top 5 rows



### Pergunta 2

In [9]:

df_airports = df_airports.select(
                                'faa', 
                                'name', 
                                'lat', 
                                'lon', 
                                'alt',
                                'tz',
                                (F.when(
                                    F.col('tz')
                                     .between(-7, -5), 'A')
                                     .otherwise(F.col('tz')).alias('dst'))
                                )

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4| -4|
+---+--------------------+---------+---------+----+---+---+
only showing top 5 rows



### Pergunta 3

In [19]:
df_airports = df_airports.select(
                            'faa', 
                            'name', 
                            'lat', 
                            'lon', 
                            'alt', 
                            'tz', 
                            (F.when(
                                F.col('dst') == 'U', 'A')
                                 .otherwise(F.col('dst'))
                                 .alias('dst')))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4| -4|
+---+--------------------+---------+---------+----+---+---+
only showing top 5 rows



### Pergunta 4

In [22]:
df_airports = df_airports.withColumn('region', (
                                        F.when((F.col('lon') < -124), 'ALASKA')
                                         .when((F.col('lon') > -50) | (F.col('lon') < 24), 'OFFSHORE')
                                         .when((F.col('lon') <= -95) & (F.col('lon').between(-124, -50)), 'MAINLAND-WEST')
                                         .when((F.col('lon') > -95) & (F.col('lon').between(-124, -50)), 'MAINLAND-EAST')
                                        ).otherwise('NaN'))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+--------+
|faa|                name|      lat|      lon| alt| tz|dst|  region|
+---+--------------------+---------+---------+----+---+---+--------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|OFFSHORE|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|OFFSHORE|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|OFFSHORE|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|OFFSHORE|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4| -4|OFFSHORE|
+---+--------------------+---------+---------+----+---+---+--------+
only showing top 5 rows



### Pergunta 5

In [23]:
REGEX_AP_CATEGORY = '.*Airport*|.*Tradeport*|.*Heliport*|.*Airpor*|.*Arpt*'
REGEX_AD_CATEGORY = '.*Aerodrome*'
REGEX_AK_CATEGORY = '.*Airpark*|.*Aero Park*'
REGEX_AS_CATEGORY = '.*Station*|.*Air Station*'
REGEX_FL_CATEGORY = '.*Field*|.*Fld*'

df_airports = df_airports.withColumn('type', (
                                        F.when(F.col('name').rlike(REGEX_AP_CATEGORY), 'AP')
                                         .when(F.col('name').rlike(REGEX_AD_CATEGORY), 'AD')
                                         .when(F.col('name').rlike(REGEX_AK_CATEGORY), 'AK')
                                         .when(F.col('name').rlike(REGEX_AS_CATEGORY), 'AS')
                                         .when(F.col('name').rlike(REGEX_FL_CATEGORY), 'FL')
                                        ).otherwise('NaN'))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+--------+----+
|faa|                name|      lat|      lon| alt| tz|dst|  region|type|
+---+--------------------+---------+---------+----+---+---+--------+----+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|OFFSHORE|  AP|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|OFFSHORE|  AP|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|OFFSHORE| NaN|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|OFFSHORE|  AP|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4| -4|OFFSHORE|  AP|
+---+--------------------+---------+---------+----+---+---+--------+----+
only showing top 5 rows



### Pergunta 6

In [24]:
MILITARY_SUBSTRINGS = [
    "Base", 
    "Aaf", 
    "AFs", 
    "Ahp", 
    "Afb", 
    "LRRS", 
    "Lrrs", 
    "Arb", 
    "Naf", 
    "NAS", 
    "Nas", 
    "Jrb", 
    "Ns", 
    "As", 
    "Cgas", 
    "Angb"
]

REGEX_MILITARY = r'|'.join(map(lambda x : f".*(^| ){x}($| ).*", MILITARY_SUBSTRINGS))

df_airports = df_airports.withColumn('military', (
                                        F.when(F.col('name').rlike(REGEX_MILITARY), True)
                                        ).otherwise(False))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+--------+----+--------+
|faa|                name|      lat|      lon| alt| tz|dst|  region|type|military|
+---+--------------------+---------+---------+----+---+---+--------+----+--------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|OFFSHORE|  AP|   false|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|OFFSHORE|  AP|   false|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|OFFSHORE| NaN|   false|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|OFFSHORE|  AP|   false|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4| -4|OFFSHORE|  AP|   false|
+---+--------------------+---------+---------+----+---+---+--------+----+--------+
only showing top 5 rows



### Pergunta 7

In [25]:
REGEX_I_SUBS = '.*International*|.*Intl*|.*Intercontinental*'
REGEX_N_SUBS = '.*National*|.*Natl*'
REGEX_R_SUBS = '.*Regional*|.*Reigonal*|.*Rgnl*|.*County*|.*Metro*|.*Metropolitan*'
REGEX_M_SUBS = '.*Municipal*|.*Muni*|.*City*'

df_airports = df_airports.withColumn('administration', (
                                        F.when(F.col('name').rlike(REGEX_I_SUBS), 'I')
                                         .when(F.col('name').rlike(REGEX_N_SUBS), 'N')
                                         .when(F.col('name').rlike(REGEX_R_SUBS), 'R')
                                         .when(F.col('name').rlike(REGEX_M_SUBS), 'M')
                                        ).otherwise('NaN'))

df_airports.show(5)

+---+--------------------+---------+---------+----+---+---+--------+----+--------+--------------+
|faa|                name|      lat|      lon| alt| tz|dst|  region|type|military|administration|
+---+--------------------+---------+---------+----+---+---+--------+----+--------+--------------+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|OFFSHORE|  AP|   false|           NaN|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|OFFSHORE|  AP|   false|             M|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|OFFSHORE| NaN|   false|             R|
|06N|     Randall Airport| 41.43191|-74.39156| 523| -5|  A|OFFSHORE|  AP|   false|           NaN|
|09J|Jekyll Island Air...|31.074472|-81.42778|  11| -4| -4|OFFSHORE|  AP|   false|           NaN|
+---+--------------------+---------+---------+----+---+---+--------+----+--------+--------------+
only showing top 5 rows



# Planes Dataset

### Pergunta 1

In [26]:
df_planes = df_planes.select(
                            'tailnum', 
                            'year', 
                            'type', 
                            'manufacturer', 
                            'model', 
                            'engines', 
                            'seats', 
                            'speed', 
                            'engine'
                            )

df_planes = df_planes.withColumn('tailchar', F.when(F.col('tailnum').rlike('[ABCDEFGHJKLMPRSTUVXWYZ]'), F.regexp_extract(F.col('tailnum'), '[ABCDEFGHJKLMPRSTUVXWYZ]+', 0)))

df_planes.show(5)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|tailchar|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      US|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      US|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
only showing top 5 rows



### Pergunta 2

In [27]:
df_planes = df_planes.select('tailnum', 
                 (F.when(
                     F.col('year') == 0, 1996)
                      .otherwise(F.col('year'))
                      .alias('year')),
                'type', 
                'manufacturer', 
                'model', 
                'engines', 
                'seats', 
                'speed', 
                'engine',
                 'tailchar'
                )

df_planes.show(5)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|tailchar|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      US|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      US|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
only showing top 5 rows



### Pergunta 3

In [31]:
df_aux1 = df_planes.select('manufacturer', 'model', 'year').groupBy('manufacturer', 'model').agg(F.min('year').alias('aux_year_1')).orderBy(F.col('manufacturer'), F.col('model'))

df_aux2 = df_planes.select('manufacturer', 'year').groupBy('manufacturer').agg(F.min('year').alias('aux_year_2')).orderBy(F.col('manufacturer'))

df_aux1 = df_aux1.withColumnRenamed('manufacturer', 'manufacturer_aux1').withColumnRenamed('model', 'model_aux1')

df_aux2 = df_aux2.withColumnRenamed('manufacturer', 'manufacturer_aux2')

condition = [df_planes.model == df_aux1.model_aux1, df_planes.manufacturer == df_aux1.manufacturer_aux1]

df_aux3 = df_planes.join(df_aux1, condition, 'left').withColumnRenamed('aux_year_1', 'aux_year_first_condition')

df_aux3 = df_aux3.drop('manufacturer_aux1', 'model_aux1')
  
condition1 = [df_aux3.manufacturer == df_aux2.manufacturer_aux2]

df_aux4 = df_aux3.join(df_aux2, condition1, 'left').withColumnRenamed('aux_year_2', 'aux_year_second_condition')

df_aux4 = df_aux4.drop('manufacturer_aux2')

In [32]:
df = df_aux4.withColumn('year', F.when(F.col('year').isNull(), F.col('aux_year_first_condition'))
                                 .otherwise(F.col('aux_year_first_condition'))
                       )

df = df_aux4.withColumn('year', F.when(F.col('aux_year_first_condition').isNull(), F.col('aux_year_second_condition'))
                                 .otherwise(F.col('aux_year_first_condition'))
                       )
                               
df = df.select('year', 'manufacturer', 'model', 'aux_year_first_condition', 'aux_year_second_condition')

df.createOrReplaceTempView('planes')

spark.getOrCreate().sql("select * from planes where year is Null").show()


+----+---------------+-------------+------------------------+-------------------------+
|year|   manufacturer|        model|aux_year_first_condition|aux_year_second_condition|
+----+---------------+-------------+------------------------+-------------------------+
|null|LAMBERT RICHARD|    FALCON XP|                    null|                     null|
|null|  BARKER JACK L|ZODIAC 601HDS|                    null|                     null|
+----+---------------+-------------+------------------------+-------------------------+



### Pergunta 4

In [33]:
df_planes = df_planes.withColumn('age', int(current_year) - F.col('year'))
df_planes.show(5)

+-------+----+---------+----------------+--------+-------+-----+-----+---------+--------+---+
|tailnum|year|     type|    manufacturer|   model|engines|seats|speed|   engine|tailchar|age|
+-------+----+---------+----------------+--------+-------+-----+-----+---------+--------+---+
| N102UW|1998|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW| 24|
| N103US|1999|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      US| 23|
| N104UW|1999|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW| 23|
| N105UW|1999|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW| 23|
| N107US|1999|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      US| 23|
+-------+----+---------+----------------+--------+-------+-----+-----+---------+--------+---+
only showing top 5 rows



### Pergunta 5

In [29]:
df_planes = df_planes.select(
                'tailnum',
                'year',
                (F.when(F.col('type').startswith('Fixed wing multi engine'), 'MULTI_ENG')
                  .when(F.col('type').startswith('Fixed wing single engine'), 'SINGLE_ENG')
                  .when(F.col('type').startswith('Rotorcraft'), 'ROTORCRAFT')
                  .alias('type')), 
                'manufacturer', 
                'model', 
                'engines', 
                'seats', 
                'speed', 
                'engine',
                'tailchar',
                'age'
                )

df_planes.show(5)

+-------+----+---------+----------------+--------+-------+-----+-----+---------+--------+---+
|tailnum|year|     type|    manufacturer|   model|engines|seats|speed|   engine|tailchar|age|
+-------+----+---------+----------------+--------+-------+-----+-----+---------+--------+---+
| N102UW|1998|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW| 24|
| N103US|1999|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      US| 23|
| N104UW|1999|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW| 23|
| N105UW|1999|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      UW| 23|
| N107US|1999|MULTI_ENG|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      US| 23|
+-------+----+---------+----------------+--------+-------+-----+-----+---------+--------+---+
only showing top 5 rows



### Pergunta 6

In [34]:
df_planes = df_planes.select(
                'tailnum',
                'year',
                'type', 
                (F.when(F.col('manufacturer').contains('AIRBUS'), 'AIRBUS')
                  .when(F.col('manufacturer').contains('BOEING'), 'BOEING')
                  .when(F.col('manufacturer').contains('BOMBARDIER'), 'BOMBARDIER')
                  .when(F.col('manufacturer').contains('CESSNA'), 'CESSNA')
                  .when(F.col('manufacturer').contains('EMBRAER'), 'EMBRAER')
                  .when(F.col('manufacturer').contains('SIKORSKY'), 'SIKORSKY')
                  .when(F.col('manufacturer').contains('CANADAIR'), 'CANADAIR')
                  .when(F.col('manufacturer').contains('PIPER'), 'PIPER')
                  .when(F.col('manufacturer').contains('MCDONNELL DOUGLAS'), 'MCDONNELL DOUGLAS')
                  .when(F.col('manufacturer').contains('BELL'), 'BELL')
                  .when(F.col('manufacturer').contains('KILDALL GARY'), 'KILDALL GARY')
                  .when(F.col('manufacturer').contains('LAMBERT RICHARD'), 'LAMBERT RICHARD')
                  .when(F.col('manufacturer').contains('BARKER JACK'), 'BARKER JACK')
                  .when(F.col('manufacturer').contains('ROBINSON HELICOPTER'), 'ROBINSON HELICOPTER')
                  .when(F.col('manufacturer').contains('GULFSTREAM'), 'GULFSTREAM')
                  .when(F.col('manufacturer').contains('MARZ BARRY'), 'MARZ BARRY').alias('manufacturer')),
                'model', 
                'engines', 
                'seats', 
                'speed', 
                'engine',
                'tailchar',
                'age'
                )

df_planes.show(5)

+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
|tailnum|year|     type|manufacturer|   model|engines|seats|speed|   engine|tailchar|age|
+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
| N102UW|1998|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      UW| 24|
| N103US|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      US| 23|
| N104UW|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      UW| 23|
| N105UW|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      UW| 23|
| N107US|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      US| 23|
+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
only showing top 5 rows



### Pergunta 7

In [35]:
df_planes = df_planes.select(
                'tailnum',
                'year',
                'type', 
                'manufacturer',
                (F.when(F.col('model').contains('('), F.regexp_replace(F.trim(F.col('model')), '(', ''))
                 .when(F.col('model').contains(')'), F.regexp_replace(F.trim(F.col('model')), ')', ''))
                 .otherwise(F.trim(F.col('model')))
                 .alias('model')),
                'engines', 
                'seats', 
                'speed', 
                'engine',
                'tailchar',
                'age'
                )

df_planes.show(5)

+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
|tailnum|year|     type|manufacturer|   model|engines|seats|speed|   engine|tailchar|age|
+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
| N102UW|1998|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      UW| 24|
| N103US|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      US| 23|
| N104UW|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      UW| 23|
| N105UW|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      UW| 23|
| N107US|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182| null|Turbo-fan|      US| 23|
+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
only showing top 5 rows



### Pergunta 8

In [36]:
df_planes = df_planes.select(
                'tailnum',
                'year',
                'type', 
                'manufacturer',
                'model',
                'engines', 
                'seats', 
                F.when(F.col('speed').isNull(), F.col('seats') / 0.36).alias('speed').cast(IntegerType()), 
                'engine',
                'tailchar',
                'age'
                )

df_planes.show(5)

+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
|tailnum|year|     type|manufacturer|   model|engines|seats|speed|   engine|tailchar|age|
+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
| N102UW|1998|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      UW| 24|
| N103US|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      US| 23|
| N104UW|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      UW| 23|
| N105UW|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      UW| 23|
| N107US|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      US| 23|
+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+
only showing top 5 rows



### Pergunta 9

In [23]:
df_planes = df_planes.select(
                'tailnum',
                'year',
                'type', 
                'manufacturer',
                'model',
                'engines', 
                'seats', 
                'speed', 
                'engine',
                'tailchar',
                'age'
                )

df_planes = df_planes.withColumn('engine_type', 
                    F.when(F.col('engine').contains('Turbo-fan'), 'FAN')
                     .when(F.col('engine').contains('Turbo-jet'), 'JET')
                     .when(F.col('engine').contains('Turbo-prop'), 'PROP')
                     .when(F.col('engine').contains('Turbo-shaft'), 'SHAFT')
                     .when(F.col('engine').contains('4 Cycle'), 'CYCLE'))

df_planes.show(5)

+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+-----------+
|tailnum|year|     type|manufacturer|   model|engines|seats|speed|   engine|tailchar|age|engine_type|
+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+-----------+
| N102UW|1998|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      UW| 24|        FAN|
| N103US|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      US| 23|        FAN|
| N104UW|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      UW| 23|        FAN|
| N105UW|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      UW| 23|        FAN|
| N107US|1999|MULTI_ENG|      AIRBUS|A320-214|      2|  182|  505|Turbo-fan|      US| 23|        FAN|
+-------+----+---------+------------+--------+-------+-----+-----+---------+--------+---+-----------+
only showing top 5 rows



# Flights Dataset

### Pergunta 1

In [41]:
df_flights = df_flights.select(
                       'year',
                       'month',
                       'day', 
                       'dep_time',
                       'dep_delay',
                       'arr_time', 
                       'arr_delay', 
                       'carrier', 
                       'tailnum',
                       'flight',
                       'origin',
                       'dest', 
                       'air_time',
                       'distance',
                      F.when(F.col('hour').isNull() | F.col('hour').like('NA'), 0).otherwise(F.col('hour')).alias('hour'),
                      F.when(F.col('minute').isNull() | F.col('minute').like('NA'), 0).otherwise(F.col('minute')).alias('minute'))

df_flights.show(5)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
+----+-----+---+--------+---------+-----

### Pergunta 2

In [39]:
df_flights = df_flights.select(
                'year',
                'month',
                'day', 
                'dep_time',
                'dep_delay',
                'arr_time', 
                'arr_delay', 
                'carrier', 
                'tailnum',
                'flight',
                'origin',
                'dest', 
                'air_time',
                'distance',
                F.when(F.col('hour') == 24, 0).otherwise(F.col('hour')).alias('hour'),
                'minute'
                )

df_flights.show(5)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
+----+-----+---+--------+---------+-----

### Pergunta 3

In [43]:
# # Fazer o cast pro timestamp depois pois está ficando nulo
# df_flights = df_flights.withColumn('dep_year', F.concat_ws('-' ,F.col('year'), F.lpad(F.col('month'), 2, '0'), F.lpad(F.col('day'), 2, '0')))
# df_flights = df_flights.withColumn('dep_hour', F.concat_ws(':', F.lpad(F.col('hour'), 2, '0'), F.lpad(F.col('minute'), 2, '0'), F.lit('00')))
# df_flights = df_flights.withColumn('dep_datetime', F.concat_ws(' ', 'dep_year', 'dep_hour'))

# df_flights.drop('dep_year', 'dep_hour').show(5)

df_flights = df_flights.withColumn('dep_datetime', 
                    F.to_timestamp( F.concat_ws(' ', F.concat_ws('-', df_flights.year, df_flights.month, df_flights.day),
                                                     F.concat_ws(':', df_flights.hour, df_flights.minute, F.lit(00))       
                                               )
                                  )
)

df_flights.createOrReplaceTempView('flights')

spark.getOrCreate().sql("select dep_datetime, year, month, day, hour, minute from flights").show(5)

+-------------------+----+-----+---+----+------+
|       dep_datetime|year|month|day|hour|minute|
+-------------------+----+-----+---+----+------+
|2014-12-08 06:58:00|2014|   12|  8|   6|    58|
|2014-01-22 10:40:00|2014|    1| 22|  10|    40|
|2014-03-09 14:43:00|2014|    3|  9|  14|    43|
|2014-04-09 17:05:00|2014|    4|  9|  17|     5|
|2014-03-09 07:54:00|2014|    3|  9|   7|    54|
+-------------------+----+-----+---+----+------+
only showing top 5 rows



### Pergunta 4

In [47]:
df_flights = df_flights.select(
                'year',
                'month',
                'day',
                'dep_delay',
                'arr_time',
                'arr_delay',
                'carrier',
                'tailnum',
                'flight',
                'origin',
                'dest',
                'air_time',
                'distance',
                'hour',
                'minute',
                'dep_datetime',
                F.when(
                    F.col('dep_time').isNull(),
                    F.col('dep_time'))
                     .otherwise(F.col('dep_time'))
                     .alias('dep_time'))
df_flights.show(5)

+----+-----+---+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-------------------+--------+
|year|month|day|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|       dep_datetime|dep_time|
+----+-----+---+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-------------------+--------+
|2014|   12|  8|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|2014-12-08 06:58:00|     658|
|2014|    1| 22|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|2014-01-22 10:40:00|    1040|
|2014|    3|  9|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|2014-03-09 14:43:00|    1443|
|2014|    4|  9|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|2014-04-09 17:05:00|    1705|
|2014|    3|  9|    

### Pergunta 5

In [48]:
df_flights = df_flights.select(
                'year',
                'month',
                'day',
                'dep_time',
                (F.when(
                    F.col('dep_delay').isNull() | F.col('dep_delay').like('NA'), 0)
                     .otherwise(F.col('dep_delay'))
                     .alias('dep_delay')),
                'arr_time',
                'arr_delay',
                'carrier',
                'tailnum',
                'flight',
                'origin',
                'dest',
                'air_time',
                'distance',
                'hour',
                'minute',
                'dep_datetime')

df_flights.show(5)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-------------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|       dep_datetime|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-------------------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|2014-12-08 06:58:00|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|2014-01-22 10:40:00|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|2014-03-09 14:43:00|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|2014-04-09 17:05:00|
|2014|    3|  9|    

### Pergunta 6

In [49]:
df_flights = df_flights.select(
                'year',
                'month',
                'day',
                'dep_time',
                'dep_delay',
                'arr_time',
                (F.when(
                    F.col('arr_delay').isNull() | F.col('dep_delay').like('NA'), 0)
                     .otherwise(F.col('arr_delay'))
                     .alias('arr_delay')),
                'carrier',
                'tailnum',
                'flight',
                'origin',
                'dest',
                'air_time',
                'distance',
                'hour',
                'minute',
                'dep_datetime'
                )

df_flights.show(5)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-------------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|       dep_datetime|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-------------------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|2014-12-08 06:58:00|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|2014-01-22 10:40:00|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|2014-03-09 14:43:00|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|2014-04-09 17:05:00|
|2014|    3|  9|    

### Pergunta 7

In [50]:
df_flights = df_flights.drop('year', 'month', 'day', 'hour', 'minute')
df_flights.show(5)

+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+-------------------+
|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|       dep_datetime|
+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+-------------------+
|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|2014-12-08 06:58:00|
|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|2014-01-22 10:40:00|
|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|2014-03-09 14:43:00|
|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|2014-04-09 17:05:00|
|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|2014-03-09 07:54:00|
+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----

### Pergunta 8

In [None]:
df_flights = df_flights.withColumn('air_time_projected', ((F.col('distance')* 0.1) + 20).cast('integer'))

df_flights.show(5)

### Pergunta 9

In [51]:
df_flights_aux = df_flights.groupBy(['origin', 'dest']).agg({"air_time":"avg"})

df_flights_aux = df_flights_aux.select(
                    F.concat_ws('_', 'origin', 'dest').alias('origin_dest_aux'),
                    F.round(F.col('avg(air_time)'), 2)
                     .alias('air_time_expected'))

df_flights = df_flights.select('*', F.concat_ws('_', 'origin', 'dest').alias('origin_dest_main'))

df_flights = df_flights.join(df_flights_aux, df_flights.origin_dest_main == df_flights_aux.origin_dest_aux, 'left')

df_flights = df_flights.drop('origin_dest_aux', 'origin_dest_main')

df_flights.show(5)

+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+-------------------+-----------------+
|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|       dep_datetime|air_time_expected|
+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+-------------------+-----------------+
|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|2014-12-08 06:58:00|           126.86|
|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|2014-01-22 10:40:00|           343.51|
|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|2014-03-09 14:43:00|           101.39|
|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|2014-04-09 17:05:00|            85.81|
|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|

### Pergunta 10

In [41]:
df_flights = df_flights.withColumn('air_time', F.coalesce(F.col('air_time'), F.greatest('air_time_projected', 'air_time_expected')).cast('integer'))

df_flights.show(5)

+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+-------------------+------------------+-----------------+
|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|       dep_datetime|air_time_projected|air_time_expected|
+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+-------------------+------------------+-----------------+
|     658|       -7|   935.0|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|2014-12-08 06:58:00|               115|           126.86|
|    1040|        5|  1505.0|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|2014-01-22 10:40:00|               287|           343.51|
|    1443|       -2|  1652.0|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|2014-03-09 14:43:00|                87|           101.39|
|    1705|       45|  1839.0|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|2014-04-

### Pergunta 11

In [52]:
df_flights = df_flights.withColumn('arr_time', F.coalesce(F.col('arr_time'), (F.col('dep_time') + F.col('air_time'))))

df_flights.show(5)

+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+-------------------+-----------------+
|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|       dep_datetime|air_time_expected|
+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+-------------------+-----------------+
|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|2014-12-08 06:58:00|           126.86|
|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|2014-01-22 10:40:00|           343.51|
|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|2014-03-09 14:43:00|           101.39|
|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|2014-04-09 17:05:00|            85.81|
|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|

### Pergunta 12

In [54]:
df_flights = df_flights.withColumn('haul_duration', F.when(F.col('air_time').between(20, 180), 'SHORT-HAUL')
                                                     .when(F.col('air_time').between(180, 300), 'MEDIUM-HAUL')
                                                     .when(F.col('air_time')>300 , 'LONG-HAUL') 
                                  )

df_flights.createOrReplaceTempView('flights')

spark.getOrCreate().sql("select haul_duration, air_time from flights").show(5)

+-------------+--------+
|haul_duration|air_time|
+-------------+--------+
|   SHORT-HAUL|     132|
|    LONG-HAUL|     360|
|   SHORT-HAUL|     111|
|   SHORT-HAUL|      83|
|   SHORT-HAUL|     127|
+-------------+--------+
only showing top 5 rows



### Pergunta 13

In [55]:
df_flights = df_flights.withColumn('dep_season',
                                        F.when(F.col('dep_datetime').between(F.concat(F.year('dep_datetime'),F.lit('-03-20 15:33:00')),F.concat(F.year('dep_datetime'),F.lit('-06-21 10:14:00'))), 'SPRING')
                                         .when(F.col('dep_datetime').between(F.concat(F.year('dep_datetime'),F.lit('-06-21 10:14:00')),F.concat(F.year('dep_datetime'),F.lit('-09-23 02:04:00'))), 'SUMMER')
                                         .when(F.col('dep_datetime').between(F.concat(F.year('dep_datetime'),F.lit('-09-23 02:04:00')),F.concat(F.year('dep_datetime'),F.lit('-12-21 21:48:00'))), 'FALL')
                                         .otherwise('WINTER')
                                  )


df_flights.select('dep_season').groupBy('dep_season').count().show()
df_flights.select('dep_datetime','dep_season').filter(df_flights.dep_season == 'SPRING').show()

df_flights.createOrReplaceTempView('flights')

spark.getOrCreate().sql("select dep_datetime, dep_season from flights where dep_season = 'WINTER'").show()

+----------+-----+
|dep_season|count|
+----------+-----+
|    WINTER| 2149|
|    SPRING| 2560|
|      FALL| 2373|
|    SUMMER| 2918|
+----------+-----+

+-------------------+----------+
|       dep_datetime|dep_season|
+-------------------+----------+
|2014-04-09 17:05:00|    SPRING|
|2014-05-12 16:55:00|    SPRING|
|2014-04-19 12:36:00|    SPRING|
|2014-06-05 17:33:00|    SPRING|
|2014-06-05 11:33:00|    SPRING|
|2014-06-04 11:15:00|    SPRING|
|2014-06-11 19:57:00|    SPRING|
|2014-06-07 18:23:00|    SPRING|
|2014-04-30 08:01:00|    SPRING|
|2014-06-02 22:22:00|    SPRING|
|2014-05-21 05:15:00|    SPRING|
|2014-06-11 07:50:00|    SPRING|
|2014-06-13 22:33:00|    SPRING|
|2014-05-02 12:53:00|    SPRING|
|2014-05-22 10:18:00|    SPRING|
|2014-05-16 07:46:00|    SPRING|
|2014-05-12 14:24:00|    SPRING|
|2014-04-06 18:44:00|    SPRING|
|2014-04-01 10:10:00|    SPRING|
|2014-04-25 10:49:00|    SPRING|
+-------------------+----------+
only showing top 20 rows

+-------------------+--------

### Pergunta 14

In [56]:
df_flights = df_flights.withColumn('dep_delay_category', F.when(F.col('dep_delay')<0, 'ANTECIPATED')
                                                          .when(F.col('dep_delay')==0, 'INTIME')
                                                          .when(F.col('dep_delay').between(0,60), 'MINOR')
                                                          .when(F.col('dep_delay')>60, 'MAJOR')
                                  )


df_flights.select('dep_delay', 'dep_delay_category').show()

+---------+------------------+
|dep_delay|dep_delay_category|
+---------+------------------+
|       -7|       ANTECIPATED|
|        5|             MINOR|
|       -2|       ANTECIPATED|
|       45|             MINOR|
|       -1|       ANTECIPATED|
|        7|             MINOR|
|       42|             MINOR|
|       -5|       ANTECIPATED|
|       -4|       ANTECIPATED|
|       -3|       ANTECIPATED|
|       -2|       ANTECIPATED|
|        0|            INTIME|
|       21|             MINOR|
|       -4|       ANTECIPATED|
|       89|             MAJOR|
|        3|             MINOR|
|       50|             MINOR|
|       -3|       ANTECIPATED|
|       -9|       ANTECIPATED|
|      -12|       ANTECIPATED|
+---------+------------------+
only showing top 20 rows

