In [None]:
# Installing required packages
!pip install pyspark
!pip install findspark

In [None]:
# starting spark
# findspark.init() runs before the pyspark imports of the next cell;
# presumably it makes the local Spark installation importable -- confirm
# against the findspark documentation.
import findspark
findspark.init()

In [None]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, lit, udf, length, substring, expr, regexp_replace, sum_distinct
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType
from pyspark.sql import functions as F

In [None]:
# Creating a spark context class.
# The context is created with the same master / application name that the
# session builder declares below: a session built on top of an existing
# context reuses that context, so settings declared only on the builder would
# be silently ignored. getOrCreate() also keeps this cell re-runnable (a bare
# SparkContext() raises when a context is already active).
spark_conf = (SparkConf()
              .setMaster("local[7]")
              .setAppName("Aceleração PySpark - Capgemini"))
sc = SparkContext.getOrCreate(spark_conf)

# Creating a spark session builder.
# NOTE: `spark` is a SparkSession *builder*, not a session. Every use must go
# through `spark.getOrCreate()`, as the CSV-loading cell does.
spark = (SparkSession.builder
                     .master("local[7]")
                     .appName("Aceleração PySpark - Capgemini"))

In [None]:
# Explicit schemas for the three CSV inputs. Each schema is declared as a list
# of (column name, Spark type) pairs; every column is nullable.
schema_airports = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("faa",  StringType()),
        ("name", StringType()),
        ("lat",  FloatType()),
        ("lon",  FloatType()),
        ("alt",  IntegerType()),
        ("tz",   IntegerType()),
        ("dst",  StringType()),
    ]
])

schema_planes = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("tailnum",      StringType()),
        ("year",         IntegerType()),
        ("type",         StringType()),
        ("manufacturer", StringType()),
        ("model",        StringType()),
        ("engines",      IntegerType()),
        ("seats",        IntegerType()),
        ("speed",        IntegerType()),
        ("engine",       StringType()),
    ]
])

schema_flights = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("year",      IntegerType()),
        ("month",     IntegerType()),
        ("day",       IntegerType()),
        ("dep_time",  StringType()),
        ("dep_delay", IntegerType()),
        ("arr_time",  StringType()),
        ("arr_delay", IntegerType()),
        ("carrier",   StringType()),
        ("tailnum",   StringType()),
        ("flight",    StringType()),
        ("origin",    StringType()),
        ("dest",      StringType()),
        ("air_time",  IntegerType()),
        ("distance",  IntegerType()),
        ("hour",      IntegerType()),
        ("minute",    IntegerType()),
    ]
])

In [None]:
# Load the three CSV inputs with their explicit schemas; the first line of
# each file is a header. `spark` is a session builder, so the session is
# obtained through getOrCreate() (which returns the same session each time).
_session = spark.getOrCreate()

df_airports = _session.read.csv("../Datasets/airports.csv",
                                schema=schema_airports, header="true")

df_planes = _session.read.csv("../Datasets/planes.csv",
                              schema=schema_planes, header="true")

df_flights = _session.read.csv("../Datasets/flights.csv",
                               schema=schema_flights, header="true")

In [None]:
# Common regular expressions
REGEX_ALPHA    = r'[a-zA-Z]+'
REGEX_INTEGER  = r'[0-9]+'
REGEX_FLOAT    = r'[0-9]+\.[0-9]+'
REGEX_ALPHANUM = r'[0-9a-zA-Z]+'
# Anchored at both ends: only strings made up entirely of blanks/tabs match.
# Without the leading ^ any value that merely ends in whitespace ("abc ")
# would be reported as empty by a partial-match search such as rlike.
REGEX_EMPTY_STR= r'^[\t ]+$'
REGEX_SPECIAL  = r'[!@#$%&*\(\)_]+'
REGEX_NNUMBER  = r'^N[1-9][0-9]{2,3}([ABCDEFGHJKLMNOPQRSTUVXWYZ]{1,2})'
REGEX_NNUMBER_INVALID = r'(N0.*$)|(.*[IO].*)'
# HMM or HHMM clock value: hour 0-23 (leading zero optional), minute 00-59
REGEX_TIME_FMT = r'^(([0-1]?[0-9])|(2[0-3]))([0-5][0-9])$'

In [None]:
# Helper functions
def split_csv(line):
    """Split a CSV line on commas and strip every double quote from each field.

    Returns the fields as a tuple of strings. The split is purely textual:
    commas inside quoted fields are treated as separators too.
    """
    return tuple(field.replace('"', '') for field in line.split(","))

def check_empty_column(col):
    """Build a Column condition that is true when column `col` is null, equal
    to the empty string, or matches REGEX_EMPTY_STR.

    `col` is the column name (a string).
    """
    target = F.col(col)
    is_null = target.isNull()
    is_blank = target == ''
    matches_empty_regex = target.rlike(REGEX_EMPTY_STR)
    return (is_null | is_blank | matches_empty_regex)

# Airports

# Pergunta 1

In [None]:
df1 = df_airports.withColumn('alt', 
    (when(col('alt') < 0 ,0)).otherwise(col('alt')))

# Teste do resultado

In [None]:
df1.groupBy("alt").count().distinct().orderBy("alt", ascending=True).show()

# Pergunta 2

In [None]:
df2 = df1.withColumn('dst', 
    (when(col('tz').between(-7, -5) ,"A")).otherwise(col('dst')))

# Teste do resultado

In [None]:
df2.groupBy('dst').count().show()

# Pergunta 3

In [None]:
df3 = df2.withColumn('dst', 
    (when(col('dst') == "U" ,"A")).otherwise(col('dst')))

# Teste do resultado

In [None]:
df3.groupBy('dst').count().show()

# Pergunta 4

In [None]:
df4 = df3.withColumn(
    "region",
    (
        when(col('lon') < -124, "Alaska")
        .when(
                (
                    (col('lon') > -50) | 
                    (col('lat') < 24)
                ), "OFFSHORE"
            )
        .when(
                (
                    (col('lon') <= -95) &
                    (col('lon').between(-124, -50))
                ), 'MAINLAND-WEST'

            )
        .when(
                (
                    (col('lon') > -95) &
                    (col('lon').between(-124, -50))
                ), 'MAINLAND-EAST'
            )
    ).otherwise('NaN')
    
    )

# Teste do resultado

In [None]:
df4.createOrReplaceTempView("REGION")
spark.sql("select region, Count(*) from REGION Group by REGION Order by REGION").show()

# Pergunta 5

In [None]:
df5 = df4.withColumn(
    "type",
    (
        when(
                (
                    (col('name').rlike("Airport"))|
                    (col('name').rlike("Tradeport"))|
                    (col('name').rlike("Heliport"))|
                    (col('name').rlike("Airpor"))|
                    (col('name').rlike("Arpt"))
                )
            , "AP"
            )
        
        .when(
            col('name').rlike("Aerodrome")
            , "AD"
            )
        
        .when(
                (
                    (col('name').rlike("Airpark")) |
                    (col('name').rlike("Aero Park"))
                ), "AK"
            )
        
        .when(
            col('name').rlike("Station")
            , "AS"
            )
        
        .when(
                (
                    (col('name').rlike("Field")) |
                    (col('name').rlike("Fld"))
                )
            , "FL"
            )
        
    ).otherwise("NaN")

    )

# Teste do resultado

In [None]:
df5.groupBy('type').count().show()

# Pergunta 6

In [None]:
# Keywords that mark a military facility. Each keyword contributes three
# alternatives to the regex: at the start, in the middle, or at the end of the
# name, always delimited by a space.
LIST_MILITARY  = ['Base', 'Aaf', 'Afs', 'Ahp', 'Afb', 'LRRS', 'Lrrs', 'Arb', 'Naf', 'NAS', 'Nas', 'Jrb', 'Ns', 'As', 'Cgas', 'Angb']
_military_alternatives = [f'^{keyword} | {keyword} | {keyword}$' for keyword in LIST_MILITARY]
REGEX_MILITARY = r'|'.join(_military_alternatives)

df6 = df5.withColumn('military',
              when(col('name').rlike(REGEX_MILITARY), True)
               .otherwise(False))

# Teste do resultado

In [None]:
df6.groupBy('military').count().show()

# Pergunta 7

In [None]:
def _contains_any(keywords):
    """Regex alternation matching any text that contains one of `keywords`."""
    return r'|'.join([f'.*({keyword}).*' for keyword in keywords])


# International
LIST_I  = ['International', 'Intl', 'Intercontinental']
REGEX_I = _contains_any(LIST_I)

# National
LIST_N  = ['National', 'Natl']
REGEX_N = _contains_any(LIST_N)

# Regional
LIST_R  = ['Regional', "Reigonal", 'Rgnl', 'County', 'Metro', 'Metropolitan']
REGEX_R = _contains_any(LIST_R)

# Municipal
LIST_M  = ['Municipal', 'Muni', 'City']
REGEX_M = _contains_any(LIST_M)

In [None]:
df7 = df6.withColumn('administration',
              F.when(F.col('name').rlike(REGEX_I), 'I')
               .when(F.col('name').rlike(REGEX_N), 'N')
               .when(F.col('name').rlike(REGEX_R), 'R')
               .when(F.col('name').rlike(REGEX_M), 'M')
               .otherwise('NaN'))

# Teste do resultado

In [None]:
df7.groupBy('administration').count().show()

# Salvando o arquivo em parquet

In [None]:
df7.write.parquet(
path = 'C:/Users/coskata/Downloads/Datasets/parquet/airports3.parquet',
mode = 'overwrite'
)

# Teste do resultado

In [None]:
path = 'C:/Users/coskata/Downloads/Datasets/parquet/airports3.parquet'
airports_parquet = spark.read.parquet(path)

In [None]:
airports_parquet.toPandas()

# Planes

# Pergunta 1

In [None]:
df1 = df_planes.withColumn(
    "tailchar", 
    regexp_replace(col('tailnum'), "[0-9]|^N", "")
    )

# Teste do resultado

In [None]:
df1.groupBy('tailchar').count().show()

# Pergunta 2

In [None]:
df2 = df1.withColumn(
    'year',
    (when(col('year') == 0, 1996)).otherwise(col('year'))
    )

# Teste do resultado

In [None]:
df2.groupBy("year").count().distinct().orderBy("year", ascending=True).show()

# Pergunta 3

In [None]:
df3 = df.withColumn(
    'manufacturer',
    when(col("manufacturer").rlike("AIRBUS"), regexp_replace(col('manufacturer'), 'AIRBUS INDUSTRIE', 'AIRBUS'))
    .when(col("manufacturer").rlike("BOEING"), regexp_replace(col('manufacturer'), 'BOEING', 'BOEING'))
    .when(col("manufacturer").rlike("BOMBARDIER"), regexp_replace(col('manufacturer'), 'BOMBARDIER INC', 'BOMBARDIER'))
    .when(col("manufacturer").rlike("CESSNA"), regexp_replace(col('manufacturer'), 'CESSNA', 'CESSNA'))
    .when(col("manufacturer").rlike("EMBRAER"), regexp_replace(col('manufacturer'), 'EMBRAER', 'EMBRAER'))
    .when(col("manufacturer").rlike("SIKORSKY"), regexp_replace(col('manufacturer'), 'SIKORSKY', 'SIKORSKY'))
    .when(col("manufacturer").rlike("CANADAIR"), regexp_replace(col('manufacturer'), 'CANADAIR', 'CANADAIR'))
    .when(col("manufacturer").rlike("PIPER"), regexp_replace(col('manufacturer'), 'PIPER', 'PIPER'))
    .when(col("manufacturer").rlike("MCDONNELL DOUGLAS"), regexp_replace(col('manufacturer'), 'MCDONNELL DOUGLAS AIRCRAFT CO', 'MCDONNELL DOUGLAS'))
    .when(col("manufacturer").rlike("CIRRUS"), regexp_replace(col('manufacturer'), 'CIRRUS DESIGN CORP', 'CIRRUS'))
    .when(col("manufacturer").rlike("BELL"), regexp_replace(col('manufacturer'), 'BELL', 'BELL'))
    .when(col("manufacturer").rlike("KILDALL GARY"), regexp_replace(col('manufacturer'), 'KILDALL GARY', 'KILDALL GARY'))
    .when(col("manufacturer").rlike("LAMBERT RICHARD"), regexp_replace(col('manufacturer'), 'LAMBERT RICHARD', 'LAMBERT RICHARD'))
    .when(col("manufacturer").rlike("BARKER JACK"), regexp_replace(col('manufacturer'), 'BARKER JACK L', 'BARKER JACK'))
    .when(col("manufacturer").rlike("ROBINSON HELICOPTER"), regexp_replace(col('manufacturer'), 'ROBINSON HELICOPTER CO', 'ROBINSON HELICOPTER'))
    .when(col("manufacturer").rlike("GULFSTREAM"), regexp_replace(col('manufacturer'), 'GULFSTREAM AEROSPACE', 'GULFSTREAM'))
    .when(col("manufacturer").rlike("MARZ BARRY"), regexp_replace(col('manufacturer'), 'MARZ BARRY', 'MARZ BARRY'))
    )

# Teste do resultado

In [None]:
df6.groupBy("manufacturer").count().distinct().orderBy("count", ascending=True).show()

# Pergunta 4

In [None]:
df4_aux = df3.groupBy('manufacturer','model').min('year').orderBy(col('manufacturer'), col('model'))

In [None]:
df4_aux = df4_aux.withColumnRenamed(
  "manufacturer", "manufacturer_aux").withColumnRenamed("model", "model_aux")

In [None]:
cond = [df3.manufacturer == df4_aux.manufacturer_aux, df3.model == df4_aux.model_aux]
df4 = df3.join(df4_aux, cond, 'left')

In [None]:
df4_aux2 = df3.groupBy('manufacturer').min('year').orderBy(col('manufacturer'), col('model'))

In [None]:
df4_aux2 = df4_aux2.withColumnRenamed(
  "manufacturer", "manufacturer_aux2")

In [None]:
cond = [df3.manufacturer == df4_aux.manufacturer_aux]
df42 = df4.join(df4_aux, cond, 'left')

In [None]:
df4_final = df42.withColumn(
    'year',
    (when(col('year').isNull(), col('min(year)')).otherwise(col('year')))
              )

# Teste do resultado

In [None]:
df4_final.groupBy("year").count().distinct().orderBy("count", ascending=True).show()

# Pergunta 5

In [None]:
df5 = df4_final.withColumn(
    "age",
    expr("2022 - year")
    )

# Teste do resultado

In [None]:
df5.groupBy("age").count().distinct().orderBy("age", ascending=True).show()

# Pergunta 6

In [None]:
df6 = df5.withColumn(
    "type",
    when(col('type').contains("Fixed wing multi engine"),regexp_replace(col('type'), 'Fixed wing multi engine', 'MULTI_ENG'))
    .when(col('type').contains("Fixed wing single engine"),regexp_replace(col('type'), 'Fixed wing single engine', 'SINGLE_ENG'))
    .when(col('type').contains("Rotorcraft"),regexp_replace(col('type'), 'Rotorcraft', 'ROTORCRAFT'))
    
)

# Teste do resultado

In [None]:
df.groupBy('type').count().show()

# Pergunta 7 

In [None]:
df7 = df6.withColumn("model",
              regexp_replace(col('model'), '\s*\([^()]*\)\s*', ""))

# Teste do resultado

In [None]:
df7.groupBy(col('model')).count().show(999)

# Pergunta 8

In [None]:
df8 = df7.withColumn(
    'speed',
    (when(
            (
                (col('speed').isNull()) |
                (col('speed') == '') |
                (col('speed') == "NA") |
                (col('speed') == " ") |
                (col('speed') == "null") &
                (col('seats').isNotNull())
            ),F.ceil( col('seats')/ 0.36))
    ).otherwise(col('speed'))
)

In [None]:
df8 = df8.withColumn(
    "speed",
    (when(~col('speed').between(50, 150), 0)).otherwise(col('speed'))
)

# Teste do resultado

In [None]:
df8.groupBy(col('speed')).count().show()

# Pergunta 9

In [None]:
df9 = df8.withColumn(
    'engine_type',
    when(col('engine').rlike('Turbo-fan'), "FAN")
    .when(col('engine').rlike('Turbo-jet'), "JET")
    .when(col('engine').rlike('Turbo-prop'), "PROP")
    .when(col('engine').rlike('Turbo-shaft'), "SHAFT")
    .when(col('engine').rlike('4 Cycle'), "CYCLE")
    )

# Teste do resultado

In [None]:
df9.groupBy(col('engine_type')).count().show()

In [None]:
df9.select(col('engine_type'), col('engine')).where(col('engine_type').isNull()).show()

# Salvando o arquivo em parquet

In [None]:
df9.write.parquet(
path = 'C:/Users/coskata/Downloads/Datasets/parquet/planes3.parquet',
mode = 'overwrite'
)

# Teste do resultado

In [None]:
path = 'C:/Users/coskata/Downloads/Datasets/parquet/planes3.parquet'
planes_parquet = spark.read.parquet(path)

In [None]:
planes_parquet.toPandas()

# Flights

# Pergunta 1

In [None]:
df1 = df_flights.withColumn(
    'hour', 
    (when(col('hour').isNull(), 0)).otherwise(col('hour'))
    
    )
df1 = df1.withColumn(
    'minute', 
    (when(col('minute').isNull(), 0)).otherwise(col('minute'))
    
    )

# Teste do resultado

In [None]:
df1.groupBy("hour").count().distinct().orderBy("hour", ascending=True).show(25)

In [None]:
df1.groupBy("minute").count().distinct().orderBy("minute", ascending=True).show(25)

# Pergunta 2

In [None]:
df2 = df1.withColumn(
    'hour', 
    (when(col('hour') == 24, 0)).otherwise(col('hour'))
    )

# Teste do resultado

In [None]:
df2.groupBy("hour").count().distinct().orderBy("hour", ascending=True).show(25)

# Pergunta 3

In [None]:
df3 = df2.withColumn('dep_datetime',
                    expr("make_timestamp(year, month, day, hour, minute, 00)"))

# Teste do resultado

In [None]:
df3.groupBy("dep_datetime").count().distinct().orderBy("dep_datetime", ascending=True).show(25)

# Pergunta 4

In [None]:
df4 = df3.withColumn(
    'dep_time',
    (when(
            (
                (col('dep_time').isNull())|
                (col('dep_time') == "") |
                (col('dep_time') == "NA")
            ), concat(col('hour'), lpad(col('minute'), 2, '0'))
        )).otherwise(col('dep_time'))
                    )

# Teste do resultado

In [None]:
df4.groupBy("dep_time").count().distinct().orderBy("dep_time", ascending=True).show(25)

# Pergunta 5

In [None]:
df5 = df4.withColumn(
    'dep_delay',
    (when(col('dep_delay').isNull(), 0)).otherwise(col('dep_delay'))
    )

# Teste do resultado

In [None]:
df5.groupBy("dep_delay").count().distinct().orderBy("dep_delay", ascending=True).show(25)

# Pergunta 6

In [None]:
df6 = df5.withColumn(
    'arr_delay',
    (when(col('arr_delay').isNull(), 0)).otherwise(col('arr_delay'))
    )

# Teste do resultado

In [None]:
df6.groupBy(col('arr_delay') == 0).count().show(50)

# Pergunta 7

In [None]:
df7 = df6.drop("year")\
        .drop("month")\
        .drop("day")\
        .drop("hour")\
        .drop('minute')

# Teste do resultado

In [None]:
df7.show()

# Pergunta 8

In [None]:
df8 = df7.withColumn(
    "air_time_projected",
    F.ceil(expr('distance *0.1 + 20'))
    )

# Teste do resultado

In [None]:
df8.groupBy("air_time_projected").count().distinct().orderBy("air_time_projected", ascending=True).show(25)

# Pergunta 9

In [None]:
df9_aux = df8.groupBy('origin', 'dest').avg('air_time')

df9_aux = df9_aux.withColumnRenamed(
  "origin", "origin_aux").withColumnRenamed("dest", "dest_aux")

cond = [df2.origin == df9_aux.origin_aux, df2.dest == df9_aux.dest_aux]
df9 = df8.join(df9_aux, cond, 'left')

df9 = df9.withColumnRenamed("avg(air_time)", "air_time_expected")

df9 = df9.drop("origin_aux")\
        .drop("dest_aux")

df9 = df9.withColumn('air_time_expected', F.ceil(col('air_time_expected')))

# Teste do resultado

In [None]:
df9.select(col('origin'), col('dest'), col('air_time_expected')).show()

# Pergunta 10

In [None]:
df10 = df9.withColumn(
    'air_time',
    (when(
            col('air_time').isNull(), F.greatest(col('air_time_expected'), col('air_time_projected'))
        )).otherwise(col('air_time'))
    )

# Teste do resultado

In [None]:
df10.groupBy("air_time").count().distinct().orderBy("air_time", ascending=True).show(25)

# Pergunta 11

In [None]:
df10aux1 = df10.withColumn('dep_time', col('dep_time').cast(IntegerType()))

In [None]:
df10_aux = df10aux1.groupBy('origin','dest', 'arr_time', 'air_time').min('dep_time')

In [None]:
df11aux = df10_aux.withColumn('min(dep_time)', col('min(dep_time)').cast(StringType()))

In [None]:
# Four-character time HHMM: hour 00-24, minute 00-59.
# Three-character time HMM: hour 0-9, minute 00-59.
# The length is enforced by the ^...$ anchors around a fixed number of
# character classes. The previous `^{4}` / `^{3}` applied a quantifier to the
# anchor itself, which is not a length check and is rejected by some engines.
REGEX_HOUR_MIN_LEN4 = r"^([0-1][0-9]|2[0-4])[0-5][0-9]$"
REGEX_HOUR_MIN_LEN3 = r"^[0-9][0-5][0-9]$"

In [None]:
# Split the group's minimum departure time into hour and minute parts.
# Only groups whose arr_time is "NA" are processed; all other rows get 0.

# Three-character departure time: the hour is the first character.
df11_aux = df11aux.withColumn(
    'dep_hour3',
    (when(
            (
                (col('arr_time') == "NA") &
                (col('min(dep_time)').rlike(REGEX_HOUR_MIN_LEN3))
            ), substring(col('min(dep_time)'), 1, 1)
        )).otherwise(0)
    )
# Three-character departure time: the minute is characters 2-3.
df11_aux = df11_aux.withColumn(
    'dep_minute3',
    (when(
            (
                (col('arr_time') == "NA") &
                (col('min(dep_time)').rlike(REGEX_HOUR_MIN_LEN3))
            ), substring(col('min(dep_time)'), 2, 2)
        )).otherwise(0)
    )

In [None]:
# Four-character departure time: the hour is characters 1-2.
df11_aux = df11_aux.withColumn(
    'dep_hour4',
    (when(
            (
                (col('arr_time') == "NA") &
                (col('min(dep_time)').rlike(REGEX_HOUR_MIN_LEN4))
            ), substring(col('min(dep_time)'), 1, 2)
        )).otherwise(0)
    )
# Four-character departure time: the minute is characters 3-4.
df11_aux = df11_aux.withColumn(
    'dep_minute4',
    (when(
            (
                (col('arr_time') == "NA") &
                (col('min(dep_time)').rlike(REGEX_HOUR_MIN_LEN4))
            ), substring(col('min(dep_time)'), 3, 2)
        )).otherwise(0)
    )

In [None]:
# Split air_time (minutes) into whole hours and remaining minutes.
# The hour part is floor(air_time / 60) for every air_time. It used to be
# computed only for three-digit values, so flights of 60-99 minutes got
# air_hour = 0 while air_minute had already dropped the full hour.
# A null air_time still yields 0, as before.
df11_aux1 = df11_aux.withColumn(
    'air_hour',
    F.coalesce(F.floor(expr('air_time/60')), lit(0))
    )
df11_aux1 = df11_aux1.withColumn(
    'air_minute',
    F.ceil(expr('air_time%60'))
    )

In [None]:
# Merge the three-character results into the dep_hour4 / dep_minute4 columns:
# rows with arr_time "NA" and a three-character departure time take
# dep_hour3 / dep_minute3; every other row keeps its current value.
df11_aux1 = df11_aux1.withColumn(
    'dep_hour4',
    (when(
            (
                (col('arr_time') == "NA") &
                (col('min(dep_time)').rlike(REGEX_HOUR_MIN_LEN3))
            ), col('dep_hour3')
        )).otherwise(col('dep_hour4'))
)
df11_aux1 = df11_aux1.withColumn(
    'dep_minute4',
    (when(
            (
                (col('arr_time') == "NA") &
                (col('min(dep_time)').rlike(REGEX_HOUR_MIN_LEN3))
            ), col('dep_minute3')
        )).otherwise(col('dep_minute4'))
)

In [None]:
df11_aux1 = df11_aux1.drop("dep_hour3")\
    .drop('dep_minute3')

In [None]:
df11_aux1 = df11_aux1.withColumnRenamed("origin", "origin_aux")\
.withColumnRenamed("dest", "dest_aux")\
.withColumnRenamed("arr_time", "arr_time_aux")\
.withColumnRenamed("air_time", "air_time_aux")\
.withColumnRenamed("dep_hour4", "dep_hour")\
.withColumnRenamed("dep_minute4", "dep_minute")

In [None]:
# Raw arrival hour / minute = departure part + air-time part, computed only
# for groups whose arrival time is "NA" (other rows get null).
arrival_unknown = (col('arr_time_aux') == "NA")

df11_aux1 = df11_aux1.withColumn(
    'arr_hour',
    when(arrival_unknown, F.floor(expr('dep_hour + air_hour'))),
)
df11_aux1 = df11_aux1.withColumn(
    'arr_minute',
    when(arrival_unknown, F.floor(expr('dep_minute + air_minute'))),
)

In [None]:
# Normalise the raw arrival parts into a valid clock time.
# 60 or more minutes carry one hour (the test used to be `> 60`, which left an
# arrival minute of exactly 60 in place).
minute_overflow = col('arr_minute') >= 60

df11_aux1 = df11_aux1.withColumn(
    "arr_minute_f",
    (when(minute_overflow, F.ceil(expr('arr_minute - 60')))).otherwise(col('arr_minute'))
)
# The hour wraps around midnight so it stays within 0-23.
df11_aux1 = df11_aux1.withColumn(
    "arr_hour_f",
    ((when(minute_overflow, F.ceil(expr('arr_hour + 1')))).otherwise(col('arr_hour'))) % 24
)

In [None]:
df11_aux1 = df11_aux1.drop("min(dep_time)")\
    .drop('dep_hour')\
    .drop('dep_minute')\
    .drop('air_hour')\
    .drop('air_minute')\
    .drop('arr_hour')\
    .drop('arr_minute')

In [None]:
df11_aux1.select('*').where(col('arr_time_aux')=="NA").show()

In [None]:
# Match every flight with its (origin, dest, arr_time, air_time) group.
cond = [
    df10[key_name] == df11_aux1[key_name + '_aux']
    for key_name in ('origin', 'dest', 'arr_time', 'air_time')
]

df11_f = df10.join(df11_aux1, cond, 'left')

In [None]:
df11_f = df11_f.withColumn(
    'arr_time',
    (when((col('arr_time')=="NA"), F.concat(col('arr_hour_f'), col('arr_minute_f')))).otherwise(col('arr_time'))
)

# Teste do resultado

In [None]:
df11_f.select(
    col('origin'), 
    col('dest'),
    col('arr_time'),
    col('dep_time'),
    col('air_time'),
    col('arr_hour_f'),
    col('arr_minute_f')
    ).where(col('arr_time')=="NA").show()

# Pergunta 12

In [None]:
df12 = df10.withColumn(
    "haul_duration",
    when(col('air_time').between(20, 180), 'SHORT-HAUL')
    .when(col('air_time').between(181, 360), 'MEDIUM-HAUL')
    .when(col('air_time') > 360, 'LONG-HAUL')
    )

# Teste do resultado

In [None]:
df12.groupBy('haul_duration').count().show()

# Pergunta 13

In [None]:
df13 = df12.withColumn(
    "dep_season",
    when(
            (
                (col('dep_datetime').between('2013-12-21 21:49:00','2014-03-20 15:33:00')) |
                (col('dep_datetime').between('2014-12-21 21:49:00','2015-03-20 15:33:00'))
            )
        , 'WINTER')
    .when(col('dep_datetime').between('2014-03-20 15:34:00','2014-06-21 10:14:00'), 'SPRING')
    .when(col('dep_datetime').between('2014-06-21 10:15:00','2014-09-23 02:04:00'), 'SUMMER')
    .when(col('dep_datetime').between('2014-09-23 02:05:00','2014-12-21 21:48:00'), 'FALL')
    )

# Teste do resultado

In [None]:
df13.groupBy('dep_season').count().show()

# Pergunta 14

In [None]:
df14 = df13.withColumn(
    'dep_delay_category',
    when(col('dep_delay') < 0, "ANTECIPATED")
    .when(col('dep_delay') == 0, "INTIME")
    .when(col('dep_delay').between(1, 59), "MINOR")
    .when(col('dep_delay') >= 60, "MAJOR")
    )

# Teste do resultado

In [None]:
df14.groupBy('dep_delay_category').count().show()

# Salvando o arquivo em parquet

In [None]:
df14.write.parquet(
path = 'C:/Users/coskata/Downloads/Datasets/parquet/flights3.parquet',
mode = 'overwrite'
)

# Teste do resultado

In [None]:
path = 'C:/Users/coskata/Downloads/Datasets/parquet/flights3.parquet'
flights_parquet = spark.read.parquet(path)

In [None]:
flights_parquet.toPandas()