In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark



In [2]:
import findspark
findspark.init()

In [3]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [4]:
# Criar o contexto do spark
sc = SparkContext()

# Instancia o criador de sessao do spark
spark = (SparkSession.builder
                     .master("local[*]")
                     .appName("Aceleração PySpark - Capgemini"))

In [5]:
schema_airports = StructType([
    StructField("faa",  StringType(),  True),
    StructField("name", StringType(),  True),
    StructField("lat",  FloatType(),   True),
    StructField("lon",  FloatType(),   True),
    StructField("alt",  IntegerType(), True),
    StructField("tz",   IntegerType(), True),
    StructField("dst",  StringType(),  True)
])

schema_planes = StructType([
    StructField("tailnum",      StringType(),  True),
    StructField("year",         IntegerType(), True),
    StructField("type",         StringType(),  True),
    StructField("manufacturer", StringType(),  True),
    StructField("model",        StringType(),  True),
    StructField("engines",      IntegerType(), True),
    StructField("seats",        IntegerType(), True),
    StructField("speed",        IntegerType(), True),
    StructField("engine",       StringType(),  True)
])

schema_flights = StructType([
    StructField("year",      IntegerType(), True),
    StructField("month",     IntegerType(), True),
    StructField("day",       IntegerType(), True),
    StructField("dep_time",  StringType(),  True),
    StructField("dep_delay", IntegerType(), True),
    StructField("arr_time",  StringType(),  True),
    StructField("arr_delay", IntegerType(), True),
    StructField("carrier",   StringType(),  True),
    StructField("tailnum",   StringType(),  True),
    StructField("flight",    StringType(),  True),
    StructField("origin",    StringType(),  True),
    StructField("dest",      StringType(),  True),
    StructField("air_time",  IntegerType(), True),
    StructField("distance",  IntegerType(), True),
    StructField("hour",      IntegerType(), True),
    StructField("minute",    IntegerType(), True),
])

In [6]:
# Carrega os dados com as configurações necessárias

path_airports = ("../data/airports.csv")

df_airports = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_airports)
                  .load(path_airports))

path_planes = ("../data/planes.csv")

df_planes = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_planes)
                  .load(path_planes))

path_flights = ("../data/flights.csv")

df_flights = (spark.getOrCreate().read
                  .format("csv")
                  .option("header", "true")
                  .schema(schema_flights)
                  .load(path_flights))

In [7]:
# Criacao das visões temporarias

df_airports.createOrReplaceTempView('airports')
df_planes.createOrReplaceTempView('planes')
df_flights.createOrReplaceTempView('flights')

## Airport - Perguntas


#### Pergunta 1

In [8]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+
|faa|                name|      lat|        lon| alt| tz|dst|
+---+--------------------+---------+-----------+----+---+---+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|
|0G7|Finger Lakes Regi...|42.883564| -76.781235| 492| -5|  A|
|0P2|Shoestring Aviati...|39.794823| -76.647194|1000| -5|  U|
|0S9|Jefferson County ...| 48.05381|-122.810646| 108| -8|  A|
|0W3|Harford County Ai...|39.566837|   -76.2024| 409| -5|  A|
|10C|  Galt Field Airport| 42.40289| -88.375114| 875| -6|  U|
|17G|Port Bucyrus-Craw...|40.781555|  -82.97481|1003| -5|  A|
|19A|Jac

In [9]:
from pyspark.sql.functions import when,col, length

df_airports = df_airports.withColumn("qa_faa",
                        when(df_airports["faa"].rlike("^(\w{3,5})"), "F")\
                       .when(df_airports["faa"].isNull(), "M"))

df_airports.groupBy("qa_faa").count().distinct().orderBy("qa_faa", ascending=True).show()

+------+-----+
|qa_faa|count|
+------+-----+
|     F| 1397|
+------+-----+



#### Pergunta 2

In [10]:
#Adiciona a coluna qa_name verifica as condições da coluna name não ser nula ou vazia e seta o valor "M"
df_airports = df_airports.withColumn("qa_name", 
                                     when(df_airports["name"].isNull(), "M")\
                                     .when(df_airports["name"]=="", "M"))

df_airports.groupBy("qa_name").count().distinct().orderBy("qa_name", ascending=True).show()

+-------+-----+
|qa_name|count|
+-------+-----+
|   null| 1397|
+-------+-----+



In [11]:
#Filtra a coluna qa_name se o valor "M"
df_airports.filter(df_airports["qa_name"]=="M").show()

+---+----+---+---+---+---+---+------+-------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|
+---+----+---+---+---+---+---+------+-------+
+---+----+---+---+---+---+---+------+-------+



#### Pergunta 3

In [12]:
df_airports = df_airports.withColumn("qa_lat", 
                                     when(df_airports["lat"].isNull(), "M")
                                     .when(~df_airports["lat"].between(-180,180), "I")
                                     .when(~df_airports["lat"].rlike("\d"), "A"))

df_airports.groupBy("qa_lat").count().distinct().orderBy("qa_lat", ascending=True).show()

+------+-----+
|qa_lat|count|
+------+-----+
|  null| 1397|
+------+-----+



#### Pergunta 4

In [13]:
df_airports = df_airports.withColumn("qa_lon", when(df_airports.lat.isNull(), "M"))

In [14]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|     F|   null|  null|  null|
|0G7|Finger Lakes Regi...|42.883564| -76.781235| 492| -5|  A|     F|   null|  null|  null|

In [15]:
df_airports = df_airports.withColumn("qa_lon", when(~df_airports["lon"].between(-180, 180), "I"))

In [16]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|     F|   null|  null|  null|
|0G7|Finger Lakes Regi...|42.883564| -76.781235| 492| -5|  A|     F|   null|  null|  null|

In [17]:
df_airports = df_airports.withColumn("qa_lon", when(~df_airports["lon"].rlike("\d"), "A"))

In [18]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|     F|   null|  null|  null|
|0G7|Finger Lakes Regi...|42.883564| -76.781235| 492| -5|  A|     F|   null|  null|  null|

In [19]:
df_airports.groupBy("qa_lon").count().distinct().orderBy("qa_lon", ascending=True).show()

+------+-----+
|qa_lon|count|
+------+-----+
|  null| 1397|
+------+-----+



#### Pergunta 5

In [20]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|     F|   null|  null|  null|
|0G7|Finger Lakes Regi...|42.883564| -76.781235| 492| -5|  A|     F|   null|  null|  null|

In [21]:
df_airports = df_airports.withColumn("qa_alt", 
                                     when(df_airports["alt"].isNull(), "M")
                                     .when(df_airports["alt"] < 0, "I")
                                     .when(~df_airports["alt"].rlike("\d"), "A"))

In [22]:
df_airports.groupBy("qa_alt").count().distinct().orderBy("qa_alt", ascending=True).show()

+------+-----+
|qa_alt|count|
+------+-----+
|  null| 1395|
|     I|    2|
+------+-----+



#### Pergunta 6

In [23]:
df_airports = df_airports.withColumn("qa_tz", when(df_airports.tz.isNull(), "M"))

In [24]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|  null| null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|  null| null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|  null| null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|  null| null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|  null| null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|  null| null|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|   

In [25]:
df_airports = df_airports.withColumn("qa_tz", when(df_airports["tz"].between(-11, 14), "I"))

In [26]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|  null|    I|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|  null|    I|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|  null|    I|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|  null|    I|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|  null|    I|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|  null|    I|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|   

In [27]:
df_airports = df_airports.withColumn("qa_tz", when(~df_airports["tz"].rlike("\d"), "A"))

In [28]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|  null| null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|  null| null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|  null| null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|  null| null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|  null| null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|  null| null|
|0G6|Williams County A...|41.467304| -84.506775| 730| -5|  A|   

In [29]:
df_airports.groupBy("qa_tz").count().distinct().orderBy("qa_tz", ascending=True).show()

+-----+-----+
|qa_tz|count|
+-----+-----+
| null| 1397|
+-----+-----+



#### Pergunta 7

In [30]:
df_airports = df_airports.withColumn("qa_dst", when(df_airports.dst.isNull(), "M"))

In [31]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+------+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|  null| null|  null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|  null| null|  null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|  null| null|  null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|  null| null|  null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|  null| null|  null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|  null| null|  null|
|

In [32]:
df_airports = df_airports.withColumn("qa_dst", when(df_airports["dst"].rlike("E, A, S, O, Z, N, U"), "C"))

In [33]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+------+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|  null| null|  null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|  null| null|  null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|  null| null|  null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|  null| null|  null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|  null| null|  null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|  null| null|  null|
|

In [34]:
df_airports = df_airports.withColumn("qa_dst", when(df_airports["dst"].rlike("\d"), "N"))

In [35]:
df_airports.show()

+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+------+
|faa|                name|      lat|        lon| alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+--------------------+---------+-----------+----+---+---+------+-------+------+------+------+-----+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044| -5|  A|     F|   null|  null|  null|  null| null|  null|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264| -5|  A|     F|   null|  null|  null|  null| null|  null|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801| -6|  A|     F|   null|  null|  null|  null| null|  null|
|06N|     Randall Airport| 41.43191|  -74.39156| 523| -5|  A|     F|   null|  null|  null|  null| null|  null|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11| -4|  A|     F|   null|  null|  null|  null| null|  null|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593| -4|  A|     F|   null|  null|  null|  null| null|  null|
|

In [36]:
df_airports.groupBy("qa_dst").count().distinct().orderBy("qa_dst", ascending=True).show()

+------+-----+
|qa_dst|count|
+------+-----+
|  null| 1397|
+------+-----+



## Salvando arquivo

In [37]:
df_airports_final = df_airports.drop('name', 'lat', 'lon', 'alt', 'tz', 'dst')

In [38]:
df_airports_final.show()

+---+------+-------+------+------+------+-----+------+
|faa|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+------+-------+------+------+------+-----+------+
|04G|     F|   null|  null|  null|  null| null|  null|
|06A|     F|   null|  null|  null|  null| null|  null|
|06C|     F|   null|  null|  null|  null| null|  null|
|06N|     F|   null|  null|  null|  null| null|  null|
|09J|     F|   null|  null|  null|  null| null|  null|
|0A9|     F|   null|  null|  null|  null| null|  null|
|0G6|     F|   null|  null|  null|  null| null|  null|
|0G7|     F|   null|  null|  null|  null| null|  null|
|0P2|     F|   null|  null|  null|  null| null|  null|
|0S9|     F|   null|  null|  null|  null| null|  null|
|0W3|     F|   null|  null|  null|  null| null|  null|
|10C|     F|   null|  null|  null|  null| null|  null|
|17G|     F|   null|  null|  null|  null| null|  null|
|19A|     F|   null|  null|  null|  null| null|  null|
|1A3|     F|   null|  null|  null|  null| null|  null|
|1B9|     

In [39]:
(df_airports_final
            .repartition(1) # coalesce
            .write.format("parquet")
            .mode('overwrite')
            .option("header", "true")
            .save("../output/airports_qa.parquet"))

## Planes Dataset - Perguntas

#### Pergunta 1

In [40]:
df_planes = (df_planes.withColumn('qa_tailnum',
                       when((col('tailnum').isNull()), "M")
                      .when((length(col('tailnum')) != 5) & (length(col('tailnum')) != 6), "S")
                      .when((col('tailnum').rlike("^N([0-9]{1,4})([A-Z]{1,2}$)") == False), "F")
                      .when((col('tailnum').rlike("^N") == False), "FN")
                      .when((col('tailnum').rlike("^[IO0]") == True), "FE"))
            )

df_planes.groupBy("qa_tailnum").count().distinct().orderBy("qa_tailnum", ascending=True).show()

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|      null| 2330|
|         F|  298|
+----------+-----+



In [41]:
df_planes.filter(col('qa_tailnum') == 'F').toPandas()

Unnamed: 0,tailnum,year,type,manufacturer,model,engines,seats,speed,engine,qa_tailnum
0,N11206,2000.0,Fixed wing multi engine,BOEING,737-824,2,149,,Turbo-fan,F
1,N12114,1995.0,Fixed wing multi engine,BOEING,757-224,2,178,,Turbo-jet,F
2,N12216,1998.0,Fixed wing multi engine,BOEING,737-824,2,149,,Turbo-fan,F
3,N12218,1998.0,Fixed wing multi engine,BOEING,737-824,2,149,,Turbo-fan,F
4,N12221,1998.0,Fixed wing multi engine,BOEING,737-824,2,149,,Turbo-fan,F
...,...,...,...,...,...,...,...,...,...,...
293,N87507,2008.0,Fixed wing multi engine,BOEING,737-824,2,149,,Turbo-fan,F
294,N87512,2008.0,Fixed wing multi engine,BOEING,737-824,2,149,,Turbo-fan,F
295,N87513,2008.0,Fixed wing multi engine,BOEING,737-824,2,149,,Turbo-fan,F
296,N87527,2010.0,Fixed wing multi engine,BOEING,737-824,2,149,,Turbo-fan,F


#### Pergunta 2

In [42]:
df_planes = df_planes.withColumn('qa_year', 
                      when((col('year').isNull()), 'M')
                     .when(col('year') < 1950, 'I'))

df_planes.groupBy("qa_year").count().distinct().orderBy("qa_year", ascending=True).show()

+-------+-----+
|qa_year|count|
+-------+-----+
|   null| 2567|
|      I|    1|
|      M|   60|
+-------+-----+



#### Pergunta 3

In [43]:
categorys_list = ["Fixed wing multi engine", "Fixed wing single engine", "Rotorcraft"]

df_planes = df_planes.withColumn('qa_type',
                      when((col('type').isNull()), 'M')
                     .when(~col('type').isin(categorys_list), 'C'))

df_planes.groupBy("qa_type").count().distinct().orderBy("qa_type", ascending=True).show()

+-------+-----+
|qa_type|count|
+-------+-----+
|   null| 2628|
+-------+-----+



#### Pergunta 4

In [44]:
manufacturer_list = ["AIRBUS", "BOEING","BOMBARDIER", "CESSNA","EMBRAER","SIKORSKY", "CANADAIR",
                    "PIPER", "MCDONNELL DOUGLAS", "CIRRUS", "BELL", "KILDALL GARY","LAMBERT RICHARD", "BARKER JACK",
                    "ROBINSON HELICOPTER", "GULFSTREAM", "MARZ BARRY"]

df_planes = df_planes.withColumn("qa_manufacturer",
                      when(col('manufacturer').isNull(), "M")
                     .when(~col('manufacturer').isin(manufacturer_list), "C"))

df_planes.groupBy("qa_manufacturer").count().distinct().orderBy("qa_manufacturer", ascending=True).show()

+---------------+-----+
|qa_manufacturer|count|
+---------------+-----+
|           null| 2007|
|              C|  621|
+---------------+-----+



#### Pergunta 5

In [45]:
df_planes = df_planes.withColumn('qa_model',
                    when(col('model').isNull(), "M")
                   .when((col('manufacturer') == "AIRBUS") & (~col('model').rlike("^A")), "F")
                   .when((col('manufacturer') == "BOEING") & (~col('model').rlike("^7")), "F")
                   .when((col('manufacturer') == "BOMBARDIER") | (col('manufacturer') == "CANADAIR")
                         & (~col('model').rlike("^CL")), "F")
                   .when((col('manufacturer') == "MCDONNELL DOUGLAS") & (~col('model').rlike('^(MD|DC)')), "F"))

df_planes.groupBy("qa_model").count().distinct().orderBy("qa_model", ascending=True).show()

+--------+-----+
|qa_model|count|
+--------+-----+
|    null| 2613|
|       F|   15|
+--------+-----+



#### Pergunta 6

In [46]:
df_planes = df_planes.withColumn("qa_engines",
                  when(col("engines").isNull(), "M")
                  .when(~(col("engines").between(1,4)), "I")
                  .when(col('engines').rlike("^[a-zA-Z_]*$") == True, "A")  
                  )

df_planes.groupBy("qa_engines").count().distinct().orderBy("qa_engines", ascending=True).show()

+----------+-----+
|qa_engines|count|
+----------+-----+
|      null| 2628|
+----------+-----+



#### Pergunta 7

In [47]:
df_planes = df_planes.withColumn("qa_seats",
                  when(col("seats").isNull(), "M")
                  .when((col("seats") > 500) | (col('seats') < 2), "I")
                  .when(col("seats").rlike("/w") == True, "A")  
                  )

df_planes.groupBy("qa_seats").count().distinct().orderBy("qa_seats", ascending=True).show()

+--------+-----+
|qa_seats|count|
+--------+-----+
|    null| 2628|
+--------+-----+



#### Pergunta 8

In [48]:
df_planes = df_planes.withColumn("qa_speed", 
                    when(col("speed").isNull(), "M")
                   .when((col("speed") > 150) | (col("speed") < 50), "I")
                   .when(~col("speed").rlike("\w"), "A"))

df_planes.groupBy("qa_speed").count().distinct().orderBy("qa_speed", ascending=True).show()

+--------+-----+
|qa_speed|count|
+--------+-----+
|    null|    6|
|       M| 2622|
+--------+-----+



In [49]:
df_planes.filter(col('qa_speed') == 'A').toPandas()

Unnamed: 0,tailnum,year,type,manufacturer,model,engines,seats,speed,engine,qa_tailnum,qa_year,qa_type,qa_manufacturer,qa_model,qa_engines,qa_seats,qa_speed


#### Pergunta 9

In [50]:
category_list = ["Turbo-fan", "Turbo-jet", "Turbo-prop", "Turbo-shaft", "4 Cycle"]

df_planes = df_planes.withColumn("qa_engine", 
                     when(col("engine").isNull(), "M")
                    .when(~col('engine').isin(category_list), "C"))

df_planes.groupBy("qa_engine").count().distinct().orderBy("qa_engine", ascending=True).show()

+---------+-----+
|qa_engine|count|
+---------+-----+
|     null| 2618|
|        C|   10|
+---------+-----+



## Salvando arquivo

In [51]:
df_planes.toPandas()

Unnamed: 0,tailnum,year,type,manufacturer,model,engines,seats,speed,engine,qa_tailnum,qa_year,qa_type,qa_manufacturer,qa_model,qa_engines,qa_seats,qa_speed,qa_engine
0,N102UW,1998.0,Fixed wing multi engine,AIRBUS INDUSTRIE,A320-214,2,182,,Turbo-fan,,,,C,,,,M,
1,N103US,1999.0,Fixed wing multi engine,AIRBUS INDUSTRIE,A320-214,2,182,,Turbo-fan,,,,C,,,,M,
2,N104UW,1999.0,Fixed wing multi engine,AIRBUS INDUSTRIE,A320-214,2,182,,Turbo-fan,,,,C,,,,M,
3,N105UW,1999.0,Fixed wing multi engine,AIRBUS INDUSTRIE,A320-214,2,182,,Turbo-fan,,,,C,,,,M,
4,N107US,1999.0,Fixed wing multi engine,AIRBUS INDUSTRIE,A320-214,2,182,,Turbo-fan,,,,C,,,,M,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,N983SW,2004.0,Fixed wing multi engine,BOMBARDIER INC,CL-600-2B19,2,55,,Turbo-fan,,,,C,,,,M,
2624,N984CA,1997.0,Fixed wing multi engine,CANADAIR,CL-600-2B19,2,55,,Turbo-fan,,,,,,,,M,
2625,N986CA,,Fixed wing multi engine,CANADAIR,CL-600-2B19,2,55,,Turbo-fan,,M,,,,,,M,
2626,N986SW,2004.0,Fixed wing multi engine,BOMBARDIER INC,CL-600-2B19,2,55,,Turbo-fan,,,,C,,,,M,


In [52]:
df_planes_final = df_planes.drop('year', 'type', 'manufacturer', 'model', 'engines', 'seats', 'speed', 'engine')

In [53]:
df_planes_final.show()

+-------+----------+-------+-------+---------------+--------+----------+--------+--------+---------+
|tailnum|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|qa_speed|qa_engine|
+-------+----------+-------+-------+---------------+--------+----------+--------+--------+---------+
| N102UW|      null|   null|   null|              C|    null|      null|    null|       M|     null|
| N103US|      null|   null|   null|              C|    null|      null|    null|       M|     null|
| N104UW|      null|   null|   null|              C|    null|      null|    null|       M|     null|
| N105UW|      null|   null|   null|              C|    null|      null|    null|       M|     null|
| N107US|      null|   null|   null|              C|    null|      null|    null|       M|     null|
| N108UW|      null|   null|   null|              C|    null|      null|    null|       M|     null|
| N109UW|      null|   null|   null|              C|    null|      null|    null|       M| 

In [54]:
(df_planes_final
            .repartition(1) # coalesce
            .write.format("parquet")
            .mode('overwrite')
            .option("header", "true")
            .save("../output/planes_qa.parquet"))

## Flights Dataset - Perguntas

In [55]:
df_flights = df_flights[['year', 'month', 'day','hour', 'minute','dep_time','arr_time', 'dep_delay', 'arr_delay', 
                         'carrier', 'tailnum', 'flight','origin','dest', 'air_time', 'distance']]

df_flights.printSchema()
df_flights.show()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- dep_time: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)

+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+
|year|month|day|hour|minute|dep_time|arr_time|dep_delay|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|
+----+-----+---+----+------+--------+--------+---------+---------+-------+-------+------+------+----+--------+--------+
|2014|   12

#### Pergunta 1

In [56]:
df_flights = (df_flights.withColumn("qa_year_month_day", 
                    when(col("year").isNull(), "MY")
                   .when(col("month").isNull(), "MM")
                   .when(col("day").isNull(), "MD")
                   .when(col("year") < 1950, "IY")
                   .when(~(col("month").between(1, 12)) , "IM")
                   .when(~(col("day").between(1, 31)) , "ID")
                   .when((col("month") == 2) & ((col("day") >29)) , "ID"))            
                   
                   )

df_flights.groupBy("qa_year_month_day").count().distinct().orderBy("qa_year_month_day", ascending=True).show()

+-----------------+-----+
|qa_year_month_day|count|
+-----------------+-----+
|             null|10000|
+-----------------+-----+



In [57]:
df_flights.filter(col('qa_year_month_day') == 'ID').toPandas()

Unnamed: 0,year,month,day,hour,minute,dep_time,arr_time,dep_delay,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,qa_year_month_day


#### Pergunta 2

In [58]:
df_flights = (df_flights.withColumn("qa_hour_minute",
                   when(col("hour").isNull(), "MH")
                  .when(col("minute").isNull(), "MM")
                  .when(~col("hour").between(0, 24), "IH")
                  .when(~col("minute").between(0, 59), "IM"))
       
                  )

df_flights.groupBy("qa_hour_minute").count().distinct().orderBy("qa_hour_minute", ascending=True).show()

+--------------+-----+
|qa_hour_minute|count|
+--------------+-----+
|          null| 9952|
|            MH|   48|
+--------------+-----+



#### Pergunta 3

In [59]:
df_flights = df_flights.withColumn("qa_dep_arr_time",
         when((col("dep_time").isNull()) | (col("dep_time") == "NA") , "MD")
        .when(col("arr_time").isNull() | (col("arr_time") == "NA"),"MA")
        .when(~col("dep_time").rlike("^([0-9]|1[0-9]|2[0-3])[0-5][0-9]$"), "FD")
        .when(~col("arr_time").rlike("^([0-9]|1[0-9]|2[0-3])[0-5][0-9]$"), "FA")
)

df_flights.groupBy("qa_dep_arr_time").count().distinct().orderBy("qa_dep_arr_time", ascending=True).show()

+---------------+-----+
|qa_dep_arr_time|count|
+---------------+-----+
|           null| 9704|
|             FA|  151|
|             FD|   90|
|             MA|    7|
|             MD|   48|
+---------------+-----+



#### Pergunta 4

In [60]:
df_flights = (df_flights.withColumn("qa_dep_arr_delay",
                   when(col("dep_delay").isNull(), "MD")
                  .when(col("arr_delay").isNull(), "MA"))
             )

df_flights.groupBy("qa_dep_arr_delay").count().distinct().orderBy("qa_dep_arr_delay", ascending=True).show()

+----------------+-----+
|qa_dep_arr_delay|count|
+----------------+-----+
|            null| 9925|
|              MA|   27|
|              MD|   48|
+----------------+-----+



#### Pergunta 5

In [61]:
df_flights = (df_flights.withColumn("qa_carrier",
                   when(col("carrier").isNull(), "M")
                  .when(~col("carrier").rlike("([0-9a-zA-Z]{2})"), "F"))
             )

df_flights.groupBy("qa_carrier").count().distinct().orderBy("qa_carrier", ascending=True).show()

+----------+-----+
|qa_carrier|count|
+----------+-----+
|      null|10000|
+----------+-----+



#### Pergunta 6

In [62]:
df_flights = (df_flights.withColumn('qa_tailnum',
                         when((col('tailnum').isNull()) | (col('tailnum') == 'NA') , "M")
                        .when((length(col('tailnum')) != 5) & (length(col('tailnum')) != 6), "S")
                        .when((col('tailnum').rlike("^N") == False), "FN")
                        .when((col('tailnum').rlike("^[IO0]") == True), "FE")
                        .when((col('tailnum').rlike("^N([0-9]{1,4})([A-Z]{1,2}$)") == False), "F"))
             )

df_flights.groupBy("qa_tailnum").count().distinct().orderBy("qa_tailnum", ascending=True).show()

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|      null| 8997|
|         F|  987|
|        FN|    2|
|         M|   14|
+----------+-----+



#### Pergunta 7

In [63]:
df_flights = (df_flights.withColumn("qa_flight",
                         when(col("flight").isNull(), "M")
                        .when(~col("flight").rlike("([0-9]{4})"), "F"))
             )

df_flights.groupBy("qa_flight").count().distinct().orderBy("qa_flight", ascending=True).show()

+---------+-----+
|qa_flight|count|
+---------+-----+
|     null| 3842|
|        F| 6158|
+---------+-----+



#### Pergunta 8

In [64]:
df_flights = (df_flights.withColumn("qa_origin_dest",
                         when(col("origin").isNull(), "MO")
                        .when(col("dest").isNull(), "MD")   
                        .when(~col("origin").rlike("\w{3}"), "FO")
                        .when(~col("dest").rlike("\w{3}"), "FD"))
             )

df_flights.groupBy("qa_origin_dest").count().distinct().orderBy("qa_origin_dest", ascending=True).show()

+--------------+-----+
|qa_origin_dest|count|
+--------------+-----+
|          null|10000|
+--------------+-----+



#### Pergunta 9

In [65]:
df_flights = (df_flights.withColumn("qa_air_time",
                         when(col("air_time").isNull(), "M")
                        .when(~col("air_time").between(20, 500), "I"))
             )

df_flights.groupBy("qa_air_time").count().distinct().orderBy("qa_air_time", ascending=True).show()

+-----------+-----+
|qa_air_time|count|
+-----------+-----+
|       null| 9925|
|          M|   75|
+-----------+-----+



#### Pergunta 10

In [66]:
df_flights = (df_flights.withColumn("qa_distance",
                         when(col("distance").isNull(), "M")
                        .when(~col("distance").between(50, 3000), "I"))
             )

df_flights.groupBy("qa_distance").count().distinct().orderBy("qa_distance", ascending=True).show()

+-----------+-----+
|qa_distance|count|
+-----------+-----+
|       null|10000|
+-----------+-----+



#### Pergunta 11

In [67]:
df_flights = (df_flights.withColumn("qa_distance_airtime",
                        when((col("distance").isNull())|(col("air_time").isNull()), "M")
                       .when((col("air_time")) >= ((col("distance")*0.1) + 30), "TL")
                       .when((col("air_time")) <= ((col("distance")*0.1) +10), "TS")
                       .otherwise("TR"))
             )

df_flights.groupBy("qa_distance_airtime").count().distinct().orderBy("qa_distance_airtime", ascending=True).show()

+-------------------+-----+
|qa_distance_airtime|count|
+-------------------+-----+
|                  M|   75|
|                 TL| 5027|
|                 TR| 4831|
|                 TS|   67|
+-------------------+-----+



## Salvando arquivo

In [68]:
df_flights.toPandas()

Unnamed: 0,year,month,day,hour,minute,dep_time,arr_time,dep_delay,arr_delay,carrier,...,qa_hour_minute,qa_dep_arr_time,qa_dep_arr_delay,qa_carrier,qa_tailnum,qa_flight,qa_origin_dest,qa_air_time,qa_distance,qa_distance_airtime
0,2014,12,8,6.0,58.0,658,935,-7.0,-5.0,VX,...,,,,,,,,,,TL
1,2014,1,22,10.0,40.0,1040,1505,5.0,5.0,AS,...,,,,,,F,,,,TL
2,2014,3,9,14.0,43.0,1443,1652,-2.0,2.0,VX,...,,,,,,F,,,,TL
3,2014,4,9,17.0,5.0,1705,1839,45.0,34.0,WN,...,,,,,,F,,,,TR
4,2014,3,9,7.0,54.0,754,1015,-1.0,1.0,AS,...,,,,,,F,,,,TL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2014,6,23,18.0,6.0,1806,2104,-4.0,-6.0,OO,...,,,,,,,,,,TR
9996,2014,8,31,23.0,36.0,2336,452,11.0,-13.0,AA,...,,,,,F,,,,,TR
9997,2014,8,8,9.0,4.0,904,1042,-1.0,-5.0,AS,...,,,,,,F,,,,TR
9998,2014,8,29,14.0,41.0,1441,1820,26.0,10.0,WN,...,,,,,,,,,,TR


In [69]:
df_flights_final = df_flights.drop('year', 'month', 'day', 'hour', 'minute', 'dep_time', 'arr_time', 'dep_delay',
                                  'arr_delay', 'carrier', 'flight', 'air_time', 'distance')

In [70]:
df_flights_final.toPandas()

Unnamed: 0,tailnum,origin,dest,qa_year_month_day,qa_hour_minute,qa_dep_arr_time,qa_dep_arr_delay,qa_carrier,qa_tailnum,qa_flight,qa_origin_dest,qa_air_time,qa_distance,qa_distance_airtime
0,N846VA,SEA,LAX,,,,,,,,,,,TL
1,N559AS,SEA,HNL,,,,,,,F,,,,TL
2,N847VA,SEA,SFO,,,,,,,F,,,,TL
3,N360SW,PDX,SJC,,,,,,,F,,,,TR
4,N612AS,SEA,BUR,,,,,,,F,,,,TL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,N225AG,SEA,SLC,,,,,,,,,,,TR
9996,N3LEAA,SEA,DFW,,,,,,F,,,,,TR
9997,N523AS,SEA,SMF,,,,,,,F,,,,TR
9998,N8647A,SEA,ABQ,,,,,,,,,,,TR


In [71]:
(df_flights_final
            .repartition(1) # coalesce
            .write.format("parquet")
            .mode('overwrite')
            .option("header", "true")
            .save("../output/flights_qa.parquet"))