In [1]:
!pip install pyspark
!pip install findspark


You should consider upgrading via the '/opt/python/envs/default/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/opt/python/envs/default/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, FloatType
from datetime import datetime as dt

In [3]:
sc = SparkContext()

spark = (SparkSession.builder.master('local[7]').appName("Relatórios - carlos"))

In [4]:
# Expressoes regulares comuns
REGEX_ALPHA    = r'[a-zA-Z]+'
REGEX_INTEGER  = r'[0-9]+'
REGEX_FLOAT    = r'[0-9]+\.[0-9]+'
REGEX_ALPHANUM = r'[0-9a-zA-Z]+'
REGEX_EMPTY_STR= r'[\t ]+$'
REGEX_SPECIAL  = r'[!@#$%&*\(\)_]+'
REGEX_NNUMBER  = r'^N[1-9][0-9]{2,3}([ABCDEFGHJKLMNPRSTUVXWYZ]{1,2})'
REGEX_NNUMBER_INVALID = r'(N0.*$)|(.*[IO].*)'
REGEX_TIME_FMT = r'^(([0-1]?[0-9])|(2[0-3]))([0-5][0-9])$'

In [5]:
import re

def check_empty_column(col):
    return (F.col(col).isNull() | (F.col(col) == '') | (F.col(col).rlike(REGEX_EMPTY_STR) ))

In [6]:
# Lendo arquivo parquet

flights_qa = (spark.getOrCreate().read
                   .format("parquet")
                   .option("header","True")
                   .load("./parquet_files/qa_outputs/qa_flights.parquet"))


# Lendo arquivo parquet
planes_qa = (spark.getOrCreate().read
                   .format("parquet")
                   .option("header","True")
                   .load("./parquet_files/qa_outputs/planes_qa.parquet"))

# Lendo arquivo parquet
airports_qa = (spark.getOrCreate().read
                   .format("parquet")
                   .option("header","True")
                   .load("./parquet_files/qa_outputs/airport_qa.parquet"))

In [7]:
# Criacao das visões temporarias
flights_qa.createOrReplaceTempView('flights')
planes_qa.createOrReplaceTempView('planes')
airports_qa.createOrReplaceTempView('airports')

In [8]:
planes_qa.show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+----------------+--------+----------+--------+--------+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|qa_year|qa_type| qa_manufacturer|qa_model|qa_engines|qa_seats|qa_speed|qa_engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+----------------+--------+----------+--------+--------+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|         S|   1998|      M|AIRBUS INDUSTRIE|A320-214|         2|       F|       M|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|         S|   1999|      M|AIRBUS INDUSTRIE|A320-214|         2|       F|       M|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null

In [9]:
flights_qa.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+-------+--------+--------+----+------+-----------------+--------------+---------------+----------------+----------+---------+--------------+-----------+-----------+----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|destiny|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr_time|qa_dep_arr_delay|qa_carrier|qa_flight|qa_origin_dest|qa_air_time|qa_distance|qa_airtime|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+-------+--------+--------+----+------+-----------------+--------------+---------------+----------------+----------+---------+--------------+-----------+-----------+----------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA|    LAX|     132|     954|   6|    58|             null|          null|           null|            null|        VX|     1780|          n

In [10]:
airports_qa.show()

+---+--------------------+---------+-----------+----+----+---+------+--------------------+---------+-----------+------+-----+------+
|faa|                name|      lat|        lon| alt|  tz|dst|qa_faa|             qa_name|   qa_lat|     qa_lon|qa_alt|qa_tz|qa_dst|
+---+--------------------+---------+-----------+----+----+---+------+--------------------+---------+-----------+------+-----+------+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044|-5.0|  A|   04G|   Lansdowne Airport|41.130474|  -80.61958|  1044| -5.0|     A|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264|-5.0|  A|   06A|Moton Field Munic...| 32.46057|  -85.68003|   264| -5.0|     A|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801|-6.0|  A|   06C| Schaumburg Regional| 41.98934|  -88.10124|   801| -6.0|     A|
|06N|     Randall Airport| 41.43191|  -74.39156| 523|-5.0|  A|   06N|     Randall Airport| 41.43191|  -74.39156|   523| -5.0|     A|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11|-4.0|  A|   09J|

In [11]:
# Renomeando colunas
flights_qa = flights_qa.select(*(F.col(x).alias('fl_' + x ) for x in flights_qa.columns))
planes_qa = planes_qa.select(*(F.col(x).alias('pl_' + x ) for x in planes_qa.columns))
airports_qa2 = airports_qa.select(*(F.col(x).alias('air2_' + x ) for x in airports_qa.columns))
airports_qa = airports_qa.select(*(F.col(x).alias('air_' + x ) for x in airports_qa.columns))

In [12]:
flights_qa.show()

+-------+--------+------+-----------+------------+-----------+------------+----------+----------+---------+---------+----------+-----------+-----------+-------+---------+--------------------+-----------------+------------------+-------------------+-------------+------------+-----------------+--------------+--------------+-------------+
|fl_year|fl_month|fl_day|fl_dep_time|fl_dep_delay|fl_arr_time|fl_arr_delay|fl_carrier|fl_tailnum|fl_flight|fl_origin|fl_destiny|fl_air_time|fl_distance|fl_hour|fl_minute|fl_qa_year_month_day|fl_qa_hour_minute|fl_qa_dep_arr_time|fl_qa_dep_arr_delay|fl_qa_carrier|fl_qa_flight|fl_qa_origin_dest|fl_qa_air_time|fl_qa_distance|fl_qa_airtime|
+-------+--------+------+-----------+------------+-----------+------------+----------+----------+---------+---------+----------+-----------+-----------+-------+---------+--------------------+-----------------+------------------+-------------------+-------------+------------+-----------------+--------------+--------------+-

In [13]:
joined_dfs   = spark.getOrCreate().read.parquet("parquet_files/newjoin.parquet")

myjoined_dfs = spark.getOrCreate().read.parquet("parquet_files/joined_data_new.parquet")
transformation_c_dfs = spark.getOrCreate().read.parquet("parquet_files/transformed_join.parquet")

In [14]:
print(joined_dfs.count())
print(myjoined_dfs.count())

14025
11395




# Perguntas para qualidade

## 1.

In [15]:
#3 joins com planilha renomeadas em airports_qualidade2
qa_challenge_df = flights_qa.join(airports_qa, (flights_qa.fl_origin == airports_qa.air_faa) ,'left')\
                            .join(airports_qa2, (flights_qa.fl_destiny  == airports_qa2.air2_faa),'left')\
                            .join(planes_qa, (flights_qa.fl_tailnum == planes_qa.pl_tailnum),'left')

In [16]:
qa_challenge_df.show()

+-------+--------+------+-----------+------------+-----------+------------+----------+----------+---------+---------+----------+-----------+-----------+-------+---------+--------------------+-----------------+------------------+-------------------+-------------+------------+-----------------+--------------+--------------+-------------+-------+-------------------+---------+---------+-------+------+-------+----------+-------------------+----------+----------+----------+---------+----------+--------+--------------------+---------+-----------+--------+-------+--------+-----------+--------------------+-----------+-----------+-----------+----------+-----------+----------+-------+--------------------+---------------+-----------+----------+--------+--------+---------+-------------+----------+----------+------------------+-----------+-------------+-----------+-----------+------------+
|fl_year|fl_month|fl_day|fl_dep_time|fl_dep_delay|fl_arr_time|fl_arr_delay|fl_carrier|fl_tailnum|fl_flight|fl_o

In [17]:
#criando no um novo DF 
all_cols_join = spark.getOrCreate().read.parquet("parquet_files/all_join.parquet")

In [18]:
#joined_dfs   = spark.getOrCreate().read.parquet("parquet_files/newjoin.parquet") lá em cima
joined_dfs.show()
joined_dfs.createOrReplaceTempView('reports_view')

+----+----+----+------+---+----+---+------+-------+------+------+------+-----+------+
| faa|name| lat|   lon|alt|  tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+----+----+----+------+---+----+---+------+-------+------+------+------+-----+------+
|2014|  12| 8.0| 658.0| -7| 935| -5|     F|   null|     I|  null|     I|    A|     C|
|2014|   1|22.0|1040.0|  5|1505|  5|     F|   null|     I|  null|     A|    A|     C|
|2014|   3| 9.0|1443.0| -2|1652|  2|     F|   null|     I|  null|     I|    A|     C|
|2014|   4| 9.0|1705.0| 45|1839| 34|     F|   null|     I|  null|     A|    A|     C|
|2014|   3| 9.0| 754.0| -1|1015|  1|     F|   null|     I|  null|     I|    A|     C|
|2014|   1|15.0|1037.0|  7|1352|  2|     F|   null|     I|  null|     A|    A|     C|
|2014|   7| 2.0| 847.0| 42|1041| 51|     F|   null|     I|  null|     A|    A|     C|
|2014|   5|12.0|1655.0| -5|1842|-18|     F|   null|     I|  null|     I|    A|     C|
|2014|   4|19.0|1236.0| -4|1508| -7|     F|   null|   

## 2.

In [19]:
#from collections import Counter
#import json

joined_dfs.printSchema( )

root
 |-- faa: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- alt: integer (nullable = true)
 |-- tz: integer (nullable = true)
 |-- dst: string (nullable = true)
 |-- qa_faa: string (nullable = true)
 |-- qa_name: string (nullable = true)
 |-- qa_lat: string (nullable = true)
 |-- qa_lon: string (nullable = true)
 |-- qa_alt: string (nullable = true)
 |-- qa_tz: string (nullable = true)
 |-- qa_dst: string (nullable = true)



In [73]:
#transformation_c_dfs = spark.getOrCreate().read.parquet("parquet_files/transformed_join.parquet") lá em cima
for i in ['M', 'F', 'I', 'S', 'T']:
    print(i)
    qa_challenge_df.select([F.count(F.when(F.col(c) ==i, 1)).alias(c) 
    for c in 
    qa_challenge_df.columns]).show()

M
+-------+--------+------+-----------+------------+-----------+------------+----------+----------+---------+---------+----------+-----------+-----------+-------+---------+--------------------+-----------------+------------------+-------------------+-------------+------------+-----------------+--------------+--------------+-------------+-------+--------+-------+-------+-------+------+-------+----------+-----------+----------+----------+----------+---------+----------+--------+---------+--------+--------+--------+-------+--------+-----------+------------+-----------+-----------+-----------+----------+-----------+----------+-------+-------+---------------+--------+----------+--------+--------+---------+-------------+----------+----------+------------------+-----------+-------------+-----------+-----------+------------+
|fl_year|fl_month|fl_day|fl_dep_time|fl_dep_delay|fl_arr_time|fl_arr_delay|fl_carrier|fl_tailnum|fl_flight|fl_origin|fl_destiny|fl_air_time|fl_distance|fl_hour|fl_minute|f

## 3.

In [74]:
i, most_M = float('-inf'), '' #most_M = dados faltantes
results = []

for c in qa_challenge_df.columns:
    if 'qa' in c:
        results.append(qa_challenge_df.filter(~check_empty_column(c) & 
        (F.col(c) =='M')).groupBy(F.substring(c, 1, 1)
        .alias(c)).count().toPandas())

for _row in results:
    c = _row.columns[0]
    if _row.values.size:
        v = _row.values[0]
        if v[1] > i:
            i = v[1]
            most_F = c

print(most_M, ' tem maior aparições: ' ,i)

  tem maior aparições:  9444



## 4.

In [22]:
i, most_F = float('-inf'), ''
results = []
for c in qa_challenge_df.columns:
    if 'qa' in c:
        results.append(qa_challenge_df.filter((F.col(c)!='') & 
        (F.col(c) =='F')).groupBy(F.substring(c, 1, 1).alias(c)).count().toPandas())

for _row in results:
    c = _row.columns[0]
    if _row.values.size:
        v = _row.values[0]
        if v[1] > i:
            i = v[1]
            most_F = c

print(most_F, ' tem o maior valor: ' ,i)

air_qa_faa  tem o maior valor:  10000



## 5.

In [23]:
results = []
# joins com planilha renomeadas em airports_qualidade2
# 5. Qual variável apresenta maior número de formatos errados?
for c in qa_challenge_df.columns:
    if 'qa' in c:
        results.append(qa_challenge_df.filter((F.col(c)!='') & 
        (F.col(c) =='I')).groupBy(F.substring(c, 1, 1).alias(c)).count().toPandas())

In [24]:
i, most_I = float('-inf'), '' 
for _row in results:
    c = _row.columns[0]
    if _row.values.size:
        v = _row.values[0]
        if v[1] > i:
            i = v[1]
            most_F = c

print(most_I, ' tem ' ,i , 'aparições fora do intervalo')

  tem  8 aparições fora do intervalo



# Perguntas para negócio

## 1.

In [25]:
# Lendo arquivo parquet

flights_transform = (spark.getOrCreate().read
                          .format("parquet")
                          .option("header","True")
                          .load("./parquet_files/transformation_proc/flights_proc.parquet"))


# Lendo arquivo parquet
planes_transform = (spark.getOrCreate().read
                         .format("parquet")
                         .option("header","True")
                         .load("./parquet_files/transformation_proc/planes_proc.parquet"))

# Lendo arquivo parquet
airports_transform = (spark.getOrCreate().read
                           .format("parquet")
                           .option("header","True")
                           .load("./parquet_files/transformation_proc/airports_proc.parquet"))

In [26]:
# Renomeando colunas
flights_transform = flights_transform.select(*(F.col(x).alias('fl_' + x ) for x in flights_transform.columns))
planes_transform = planes_transform.select(*(F.col(x).alias('pl_' + x ) for x in planes_transform.columns))
airports_transform2 = airports_transform.select(*(F.col(x).alias('air2_' + x ) for x in airports_transform.columns))
airports_transform = airports_transform.select(*(F.col(x).alias('air_' + x ) for x in airports_transform.columns))

In [27]:
flights_transform.show()

+-----------+-----------+------------+------------+----------+----------+---------+---------+-------+-----------+-----------+-------------------+---------------------+--------------------+----------------+-------------+---------------------+
|fl_dep_time|fl_arr_time|fl_dep_delay|fl_arr_delay|fl_carrier|fl_tailnum|fl_flight|fl_origin|fl_dest|fl_air_time|fl_distance|    fl_dep_datetime|fl_air_time_projected|fl_air_time_expected|fl_haul_duration|fl_dep_season|fl_dep_delay_category|
+-----------+-----------+------------+------------+----------+----------+---------+---------+-------+-----------+-----------+-------------------+---------------------+--------------------+----------------+-------------+---------------------+
|        658|        935|          -7|          -5|        VX|    N846VA|     1780|      SEA|    LAX|        132|        954|2014-12-08 08:58:00|                  115|                 126|      SHORT-HAUL|         FALL|          ANTECIPATED|
|       1040|       1505|       

In [28]:
transform_proc_df = flights_transform.join(airports_transform,  (flights_transform.fl_origin == airports_transform.air_faa) ,'left')\
                                     .join(airports_transform2, (flights_transform.fl_dest  == airports_transform2.air2_faa),'left')\
                                     .join(planes_transform,    (flights_transform.fl_tailnum == planes_transform.pl_tailnum),'left')

In [29]:
transform_proc_df.show()

+-----------+-----------+------------+------------+----------+----------+---------+---------+-------+-----------+-----------+-------------------+---------------------+--------------------+----------------+-------------+---------------------+-------+-------------------+---------+---------+-------+------+-------+-------------+--------+------------+------------------+--------+--------------------+---------+-----------+--------+-------+--------+-------------+---------+-------------+-------------------+----------+-------+---------+---------------+-----------+----------+--------+--------+---------+-----------+------+--------------+
|fl_dep_time|fl_arr_time|fl_dep_delay|fl_arr_delay|fl_carrier|fl_tailnum|fl_flight|fl_origin|fl_dest|fl_air_time|fl_distance|    fl_dep_datetime|fl_air_time_projected|fl_air_time_expected|fl_haul_duration|fl_dep_season|fl_dep_delay_category|air_faa|           air_name|  air_lat|  air_lon|air_alt|air_tz|air_dst|   air_region|air_type|air_military|air_administration



## 2.

In [30]:
# transform_proc_df.select(F.countDistinct('air_name')).groupBy('air_re   gion').orderBy('region').show() | #.agg(F.expr('count(distinct air_name)').alias("Aeroportos por regiao")).show()
#groupby e agregate

transform_proc_df.groupBy(F.col('air2_region').alias('regiões')).agg(F.expr('count(distinct air2_name)').alias('aeroportos por região')).show()

+-------------+---------------------+
|      regiões|aeroportos por região|
+-------------+---------------------+
|       ALASKA|                    9|
|MAINLAND-EAST|                   24|
|MAINLAND-WEST|                   36|
+-------------+---------------------+




## 3.

In [31]:
transform_proc_df.agg(F.max(F.col('air_alt')-
F.col('air2_alt')).alias("Maior diferença de altiude")).show() 

+--------------------------+
|Maior diferença de altiude|
+--------------------------+
|                       429|
+--------------------------+



## 4.

In [32]:
transform_proc_df.filter(transform_proc_df['fl_arr_delay'] > 0).agg(F.avg('fl_arr_delay').alias('Atraso médio das chegadas (em minutos)')).show()
transform_proc_df.filter(transform_proc_df['fl_dep_delay'] > 0).agg(F.avg('fl_dep_delay').alias('Atraso médio de partida (em minutos)')).show()
#.groupBy('fl_carrier') + agregate em fl_arr_delay e median

+--------------------------------------+
|Atraso médio das chegadas (em minutos)|
+--------------------------------------+
|                      24.6496644295302|
+--------------------------------------+

+------------------------------------+
|Atraso médio de partida (em minutos)|
+------------------------------------+
|                    25.5242774566474|
+------------------------------------+





## 5.

In [33]:
transform_proc_df.filter(transform_proc_df['fl_arr_delay'] > 0).groupBy(F.col('air2_region').alias('regiões')).agg(F.ceil(F.avg('fl_arr_delay')).alias('Atraso médio de chegada (em minutos)')).show()
transform_proc_df.filter(transform_proc_df['fl_dep_delay'] > 0).groupBy(F.col('air2_region').alias('regiões')).agg(F.ceil(F.avg('fl_dep_delay')).alias('Atraso médio de partida (em minutos)')).show()

+-------------+------------------------------------+
|      regiões|Atraso médio de chegada (em minutos)|
+-------------+------------------------------------+
|       ALASKA|                                  23|
|MAINLAND-EAST|                                  29|
|MAINLAND-WEST|                                  24|
+-------------+------------------------------------+

+-------------+------------------------------------+
|      regiões|Atraso médio de partida (em minutos)|
+-------------+------------------------------------+
|       ALASKA|                                  21|
|MAINLAND-EAST|                                  27|
|MAINLAND-WEST|                                  26|
+-------------+------------------------------------+




# 6.

In [36]:
transform_proc_df.filter((transform_proc_df['fl_arr_delay'] > 0) | (transform_proc_df['fl_dep_delay'] > 0)).groupBy(F.year(F.col('fl_dep_datetime')).alias('Ano')).agg((F.sum('fl_dep_delay') + F.sum('fl_arr_delay')).alias('Atraso de partida acumulado (em minutos)')).show()

+----+----------------------------------------+
| Ano|Atraso de partida acumulado (em minutos)|
+----+----------------------------------------+
|2015|                                       6|
|2014|                                  166792|
+----+----------------------------------------+




## 7.

In [37]:
transform_proc_df.groupBy(F.year(F.col('fl_dep_datetime')).alias('Ano'), F.col('air_region').alias('Região')).agg((F.sum('fl_dep_delay') + 
F.sum('fl_arr_delay')).alias('Atraso origem acumulado (em minutos)')).show()
transform_proc_df.groupBy(F.year(F.col('fl_dep_datetime')).alias('Ano'), F.col('air2_region').alias('Região')).agg((F.sum('fl_dep_delay') + 
F.sum('fl_arr_delay')).alias('Atraso destino acumulado (em minutos)')).show()

+----+-------------+------------------------------------+
| Ano|       Região|Atraso origem acumulado (em minutos)|
+----+-------------+------------------------------------+
|2015|MAINLAND-WEST|                                   6|
|2014|MAINLAND-WEST|                               82751|
+----+-------------+------------------------------------+

+----+-------------+-------------------------------------+
| Ano|       Região|Atraso destino acumulado (em minutos)|
+----+-------------+-------------------------------------+
|2014|       ALASKA|                                 4435|
|2014|MAINLAND-EAST|                                21618|
|2015|MAINLAND-WEST|                                    6|
|2014|MAINLAND-WEST|                                56698|
+----+-------------+-------------------------------------+




## 8.

In [38]:
transform_proc_df.agg(F.ceil(F.avg('fl_air_time')).alias('Tempo médio de vôo (em minutos)')).show()

+-------------------------------+
|Tempo médio de vôo (em minutos)|
+-------------------------------+
|                            153|
+-------------------------------+




## 9.

In [39]:
transform_proc_df.groupBy(F.col('air2_region').alias('regiões')).agg(F.ceil(F.avg('fl_air_time')).alias('Tempo médio de vôo (em minutos)')).show()

+-------------+-------------------------------+
|      regiões|Tempo médio de vôo (em minutos)|
+-------------+-------------------------------+
|       ALASKA|                            228|
|MAINLAND-EAST|                            238|
|MAINLAND-WEST|                            116|
+-------------+-------------------------------+




## 10.

In [40]:
transform_proc_df.groupBy(F.col('fl_origin').alias('Origem'), F.col('fl_dest').alias('Destino')).agg(F.ceil(F.avg('fl_air_time')).alias('Tempo médio de vôo (em minutos)')).show(200)

+------+-------+-------------------------------+
|Origem|Destino|Tempo médio de vôo (em minutos)|
+------+-------+-------------------------------+
|   SEA|    RNO|                             75|
|   SEA|    DTW|                            220|
|   SEA|    CLE|                            234|
|   SEA|    LAX|                            127|
|   PDX|    SEA|                             35|
|   SEA|    BLI|                             23|
|   PDX|    IAH|                            214|
|   PDX|    PHX|                            131|
|   SEA|    SLC|                             89|
|   SEA|    SBA|                            119|
|   SEA|    BWI|                            270|
|   PDX|    IAD|                            268|
|   PDX|    SFO|                             86|
|   SEA|    KOA|                            348|
|   PDX|    MCI|                            175|
|   SEA|    SJC|                            104|
|   SEA|    ABQ|                            143|
|   SEA|    SAT|    


## 11.

In [41]:
transform_proc_df.groupBy(F.year(F.col('fl_dep_datetime')).alias('Ano')).agg(F.ceil(F.avg('fl_air_time')).alias('Tempo médio de vôo (em minutos)')).show()

+----+-------------------------------+
| Ano|Tempo médio de vôo (em minutos)|
+----+-------------------------------+
|2015|                             43|
|2014|                            153|
+----+-------------------------------+




## 12.

In [42]:
transform_proc_df.groupBy(F.col('air2_region').alias('regiões')).agg(F.sum('fl_air_time').alias('Tempo de vôo acumulado (em minutos)')).show()

+-------------+-----------------------------------+
|      regiões|Tempo de vôo acumulado (em minutos)|
+-------------+-----------------------------------+
|       ALASKA|                             230602|
|MAINLAND-EAST|                             508344|
|MAINLAND-WEST|                             789679|
+-------------+-----------------------------------+




## 13.

In [43]:
transform_proc_df.agg(F.round(F.avg('fl_distance')).alias('Distância de vôo médio (em milhas)')).show()

+----------------------------------+
|Distância de vôo médio (em milhas)|
+----------------------------------+
|                            1208.0|
+----------------------------------+




## 14.

*Considerando que a distância dos vôos é dada pela distância total entre os aeroportos*:

In [44]:
transform_proc_df.groupBy(F.col('air2_region').alias('regiões')).agg(F.round(F.avg('fl_distance')).alias('Distância de vôo médio (em milhas)')).show()

+-------------+----------------------------------+
|      regiões|Distância de vôo médio (em milhas)|
+-------------+----------------------------------+
|       ALASKA|                            1742.0|
|MAINLAND-EAST|                            2042.0|
|MAINLAND-WEST|                             868.0|
+-------------+----------------------------------+




## 15.

In [45]:
transform_proc_df.groupBy(F.col('fl_origin').alias('Origem'), F.col('fl_dest').alias('Destino')).agg(F.concat(F.round(F.avg('fl_distance')), F.lit(' mls')).alias('Distância de vôo médio (em milhas)')).show()

+------+-------+----------------------------------+
|Origem|Destino|Distância de vôo médio (em milhas)|
+------+-------+----------------------------------+
|   SEA|    RNO|                         564.0 mls|
|   SEA|    DTW|                        1927.0 mls|
|   SEA|    CLE|                        2021.0 mls|
|   SEA|    LAX|                         954.0 mls|
|   PDX|    SEA|                         129.0 mls|
|   SEA|    BLI|                          93.0 mls|
|   PDX|    IAH|                        1825.0 mls|
|   PDX|    PHX|                        1009.0 mls|
|   SEA|    SLC|                         689.0 mls|
|   SEA|    SBA|                         908.0 mls|
|   SEA|    BWI|                        2335.0 mls|
|   PDX|    IAD|                        2327.0 mls|
|   PDX|    SFO|                         550.0 mls|
|   SEA|    KOA|                        2688.0 mls|
|   PDX|    MCI|                        1482.0 mls|
|   SEA|    SJC|                         697.0 mls|
|   SEA|    


## 16.

In [46]:
transform_proc_df.groupBy(F.year(F.col('fl_dep_datetime')).alias('Ano')).agg(F.concat(F.format_number(F.sum('fl_distance'), 1), F.lit(' milhas')).alias('Distância de vôo acumulado (em milhas)')).show()
#.agg(F.format_number(F.sum('fl_distance'), 1).alias('Distância de vôo acumulado (em milhas)')).show()

+----+--------------------------------------+
| Ano|Distância de vôo acumulado (em milhas)|
+----+--------------------------------------+
|2015|                          224.0 milhas|
|2014|                   12,081,292.0 milhas|
+----+--------------------------------------+




## 17.

In [47]:
transform_proc_df.groupBy(F.col('air2_region').alias('regiões')).agg(F.concat(F.format_number(F.sum('fl_distance'), 1), F.lit('mls')).alias('Distância de vôo acumulado (em milhas)')).show()

+-------------+--------------------------------------+
|      regiões|Distância de vôo acumulado (em milhas)|
+-------------+--------------------------------------+
|       ALASKA|                        1,762,553.0mls|
|MAINLAND-EAST|                        4,378,902.0mls|
|MAINLAND-WEST|                        5,940,061.0mls|
+-------------+--------------------------------------+




## 18.

*Considerando que todos os aviões dos vôos estão em sua capacidade máxima de assentos*: 

In [48]:
transform_proc_df.groupBy(F.col('fl_origin').alias('Origem'), F.col('fl_dest').alias('Destino') ).agg(F.format_number(F.ceil(F.avg('pl_seats')), 0).alias('Número médio de passageiros por rota')).show()

+------+-------+------------------------------------+
|Origem|Destino|Número médio de passageiros por rota|
+------+-------+------------------------------------+
|   SEA|    RNO|                                 142|
|   SEA|    DTW|                                 213|
|   SEA|    CLE|                                 182|
|   SEA|    LAX|                                 155|
|   PDX|    SEA|                                  65|
|   SEA|    BLI|                                 164|
|   PDX|    IAH|                                 183|
|   PDX|    PHX|                                 196|
|   SEA|    SLC|                                 166|
|   SEA|    SBA|                                  80|
|   SEA|    BWI|                                 152|
|   PDX|    IAD|                                 188|
|   PDX|    SFO|                                 139|
|   SEA|    KOA|                                 171|
|   PDX|    MCI|                                 147|
|   SEA|    SJC|            


## 19.

In [49]:
transform_proc_df.groupBy(F.year(F.to_timestamp('fl_dep_datetime')).alias('Ano')).agg(F.format_number(F.ceil(F.sum('pl_seats')), 0).alias('Número acumulado de passageiros por ano')).show()

+----+---------------------------------------+
| Ano|Número acumulado de passageiros por ano|
+----+---------------------------------------+
|2015|                                    149|
|2014|                              1,509,395|
+----+---------------------------------------+




## 20.

In [50]:
transform_proc_df.groupBy(F.col('fl_dest').alias('Destino')).count().orderBy(F.col('count').desc()).withColumnRenamed('count', 'Quantidade de viagens').show(1)

+-------+---------------------+
|Destino|Quantidade de viagens|
+-------+---------------------+
|    SFO|                  787|
+-------+---------------------+
only showing top 1 row



# 21.

In [51]:
transform_proc_df.groupBy(F.col('fl_dest').alias('Destino')).agg(F.sum('pl_seats').alias('sum_seats')).orderBy(F.col('sum_seats').desc()).withColumnRenamed('sum_seats', 'Quantidade de passageiros').show(1)

+-------+-------------------------+
|Destino|Quantidade de passageiros|
+-------+-------------------------+
|    SFO|                   119635|
+-------+-------------------------+
only showing top 1 row




## 22.

In [52]:
transform_proc_df.select(F.col('fl_origin').alias('Origem'),F.col('fl_dest').alias('Destino'),F.concat(F.format_number(F.col('fl_distance'), 1), F.lit(' mls')).alias('Distância em milhas')).filter((F.col('fl_origin') == 'PDX')| 
(F.col('fl_dest') == 'PDX')).orderBy(F.col('fl_distance').desc()).show(1)

+------+-------+-------------------+
|Origem|Destino|Distância em milhas|
+------+-------+-------------------+
|   PDX|    LIH|        2,631.0 mls|
+------+-------+-------------------+
only showing top 1 row




## 23.

In [53]:
transform_proc_df.groupBy(F.col('fl_dest').alias('Destino')).count().orderBy(F.col('count').desc()).withColumnRenamed('count', 'Total de viagens').show(1)

+-------+----------------+
|Destino|Total de viagens|
+-------+----------------+
|    SFO|             787|
+-------+----------------+
only showing top 1 row




## 24.

In [54]:
transform_proc_df.select('*').groupBy(F.col('pl_model').alias('Modelo')).count().orderBy(F.col('count').desc()).withColumnRenamed('count', 'Quantidade').show(1)

+-------+----------+
| Modelo|Quantidade|
+-------+----------+
|737-890|      1463|
+-------+----------+
only showing top 1 row




## 25.

In [55]:
transform_proc_df.filter(F.col('pl_model').isNotNull()).groupBy(F.col('fl_dest').alias('Destino'), F.col('pl_model').alias('Modelo')).count().orderBy(F.col('count').desc()).withColumnRenamed('count', 'Quantidade').show(1)

+-------+-------+----------+
|Destino| Modelo|Quantidade|
+-------+-------+----------+
|    OAK|737-7H4|       141|
+-------+-------+----------+
only showing top 1 row




## 26.

In [56]:
transform_proc_df.groupBy(F.col('fl_haul_duration').alias('Haul Duration')).agg(F.ceil(F.avg('pl_engines')).alias('Número médio de motores')).show()

+-------------+-----------------------+
|Haul Duration|Número médio de motores|
+-------------+-----------------------+
|    LONG-HAUL|                      2|
|  MEDIUM-HAUL|                      2|
|   SHORT-HAUL|                      2|
+-------------+-----------------------+




## 27.

In [57]:
transform_proc_df.groupBy(F.col('fl_dep_season').alias('Estação do ano')).count().orderBy(F.col('count').desc()).withColumnRenamed('count', 'Quantidade de vôos').show(1) 

+--------------+------------------+
|Estação do ano|Quantidade de vôos|
+--------------+------------------+
|        SUMMER|              2918|
+--------------+------------------+
only showing top 1 row




## 28.

In [58]:
data = transform_proc_df.groupBy(F.col('fl_dep_season'), F.col('fl_dest')).count().orderBy(F.col('count').desc())

In [59]:
maxs =  data.groupBy(F.col('fl_dest').alias('max_fl_dest')).agg(F.max('count').alias('max')).alias('maxs')

result = data.join(maxs,
             F.col('count') == F.col('max') 
         ).select('*')

In [60]:
result.select(F.col('fl_dep_season').alias('Estação do Ano'), F.col('fl_dest').alias('Destino'), F.col('count').alias('Quantidade')).orderBy(F.col('max').desc()).show()

+--------------+-------+----------+
|Estação do Ano|Destino|Quantidade|
+--------------+-------+----------+
|        SUMMER|    SFO|       217|
|        SPRING|    LAX|       176|
|        SUMMER|    DEN|       172|
|        SPRING|    PHX|       148|
|        SPRING|    LAS|       145|
|          FALL|    DEN|       145|
|        SPRING|    LAS|       145|
|        SUMMER|    ANC|       145|
|          FALL|    DEN|       145|
|        SUMMER|    ANC|       145|
|        SUMMER|    ORD|       129|
|        SUMMER|    DFW|       122|
|        SUMMER|    SJC|       120|
|        SPRING|    SLC|       107|
|        SUMMER|    SAN|       100|
|          FALL|    OAK|        96|
|        SUMMER|    SMF|        91|
|        SUMMER|    ATL|        86|
|        SPRING|    DFW|        86|
|          FALL|    MSP|        68|
+--------------+-------+----------+
only showing top 20 rows




## 29.

In [68]:

transform_proc_df.groupBy('fl_dep_delay_category').count().orderBy(F.col('count').desc()).withColumnRenamed('count', 'Quantidade').show()

+---------------------+----------+
|fl_dep_delay_category|Quantidade|
+---------------------+----------+
|          ANTECIPATED|      5894|
|                MINOR|      3065|
|               INTIME|       646|
|                MAJOR|       395|
+---------------------+----------+



In [56]:

flights_qa.show()

+-------+--------+------+-----------+------------+-----------+------------+----------+----------+---------+---------+----------+-----------+-----------+-------+---------+--------------------+-----------------+------------------+-------------------+-------------+------------+-----------------+--------------+--------------+-------------+
|fl_year|fl_month|fl_day|fl_dep_time|fl_dep_delay|fl_arr_time|fl_arr_delay|fl_carrier|fl_tailnum|fl_flight|fl_origin|fl_destiny|fl_air_time|fl_distance|fl_hour|fl_minute|fl_qa_year_month_day|fl_qa_hour_minute|fl_qa_dep_arr_time|fl_qa_dep_arr_delay|fl_qa_carrier|fl_qa_flight|fl_qa_origin_dest|fl_qa_air_time|fl_qa_distance|fl_qa_airtime|
+-------+--------+------+-----------+------------+-----------+------------+----------+----------+---------+---------+----------+-----------+-----------+-------+---------+--------------------+-----------------+------------------+-------------------+-------------+------------+-----------------+--------------+--------------+-


## 30.

In [69]:
data = transform_proc_df.groupBy(F.col('fl_dep_delay_category'), F.col('fl_origin'), F.col('fl_dest'))\
                        .count()\
                        .orderBy(F.col('count').desc())

In [70]:
maxs =  data.groupBy(F.col('fl_origin').alias('max_fl_origin'), F.col('fl_dest').alias('max_fl_dest')).agg(F.max('count').alias('max')).alias('maxs')

results = data.join(maxs,data['count'] == maxs['max'], 'inner')

In [71]:

results.select(F.col('fl_dep_delay_category').alias('Categoria'), F.col('fl_origin').alias('Origem'), F.col('fl_dest').alias('Destino'), F.col('count').alias('Quantidade'))\
       .orderBy(F.col('count').desc())\
       .show()

+-----------+------+-------+----------+
|  Categoria|Origem|Destino|Quantidade|
+-----------+------+-------+----------+
|ANTECIPATED|   SEA|    LAX|       293|
|ANTECIPATED|   SEA|    SFO|       245|
|ANTECIPATED|   SEA|    LAS|       228|
|ANTECIPATED|   SEA|    PHX|       195|
|ANTECIPATED|   SEA|    ANC|       192|
|ANTECIPATED|   PDX|    SFO|       174|
|ANTECIPATED|   SEA|    DEN|       169|
|ANTECIPATED|   SEA|    ORD|       167|
|ANTECIPATED|   PDX|    PHX|       136|
|ANTECIPATED|   SEA|    SJC|       135|
|ANTECIPATED|   PDX|    DEN|       135|
|ANTECIPATED|   PDX|    DEN|       135|
|ANTECIPATED|   SEA|    SJC|       135|
|ANTECIPATED|   SEA|    DFW|       134|
|ANTECIPATED|   PDX|    LAX|       126|
|ANTECIPATED|   PDX|    LAX|       126|
|ANTECIPATED|   PDX|    LAX|       126|
|ANTECIPATED|   SEA|    SLC|       126|
|ANTECIPATED|   SEA|    SLC|       126|
|ANTECIPATED|   SEA|    PDX|       126|
+-----------+------+-------+----------+
only showing top 20 rows

