In [1]:
# Instalando os pacotes
!pip install pyspark
!pip install findspark



In [2]:
# Importando os módulos necessários
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

import pandas as pd

In [3]:
# Build the configuration first so the master URL and the application name are
# in place when the context is created. A context created without them is
# simply reused by the session builder, and the builder's own master/appName
# settings would then have no effect on it.
conf = (SparkConf()
        .setMaster('local[7]')
        .setAppName('Aceleração Pyspark - Capgemini'))

# Create the Spark context, or reuse the running one when this cell is
# executed again (a second bare `SparkContext()` call would raise).
sc = SparkContext.getOrCreate(conf=conf)

# Session builder carrying the same configuration; later cells call
# `spark.getOrCreate()` on it to obtain the actual session.
spark = SparkSession.builder.config(conf=conf)

In [5]:
# Explicit schemas for the three CSV sources. Every field is declared nullable.
schema_airports = StructType([
    StructField(nome, tipo, True)
    for nome, tipo in (
        ("faa",  StringType()),
        ("name", StringType()),
        ("lat",  FloatType()),
        ("lon",  FloatType()),
        ("alt",  IntegerType()),
        ("tz",   IntegerType()),
        ("dst",  StringType()),
    )
])

schema_planes = StructType([
    StructField(nome, tipo, True)
    for nome, tipo in (
        ("tailnum",      StringType()),
        ("year",         IntegerType()),
        ("type",         StringType()),
        ("manufacturer", StringType()),
        ("model",        StringType()),
        ("engines",      IntegerType()),
        ("seats",        IntegerType()),
        ("speed",        IntegerType()),
        ("engine",       StringType()),
    )
])

schema_flights = StructType([
    StructField(nome, tipo, True)
    for nome, tipo in (
        ("year",      IntegerType()),
        ("month",     IntegerType()),
        ("day",       IntegerType()),
        ("dep_time",  StringType()),
        ("dep_delay", IntegerType()),
        ("arr_time",  StringType()),
        ("arr_delay", IntegerType()),
        ("carrier",   StringType()),
        ("tailnum",   StringType()),
        ("flight",    StringType()),
        ("origin",    StringType()),
        ("dest",      StringType()),
        ("air_time",  IntegerType()),
        ("distance",  IntegerType()),
        ("hour",      IntegerType()),
        ("minute",    IntegerType()),
    )
])

In [6]:
# Load the three CSV files (each has a header row) with their explicit schemas.
df_airports = spark.getOrCreate().read.load(
    "../data/airports.csv",
    format="csv",
    schema=schema_airports,
    header="true",
)

df_planes = spark.getOrCreate().read.load(
    "../data/planes.csv",
    format="csv",
    schema=schema_planes,
    header="true",
)

df_flights = spark.getOrCreate().read.load(
    "../data/flights.csv",
    format="csv",
    schema=schema_flights,
    header="true",
)

In [7]:
# Print the first three rows of each source table as a quick sanity check.
df_airports.show(n=3)
df_planes.show(n=3)
df_flights.show(n=3)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
+---+--------------------+---------+---------+----+---+---+
only showing top 3 rows

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
+--

# -----------------------------------------------------------------------------------------------------------
## Report quality
# -----------------------------------------------------------------------------------------------------------
## 1.

In [None]:
# Discard the listed source columns from each table before the joins below.
df_airports = df_airports.drop(
    'name', 'lat', 'lon', 'alt', 'tz', 'dst',
)
df_planes = df_planes.drop(
    'year', 'type', 'manufacturer', 'model',
    'engines', 'seats', 'speed', 'engine',
)
df_flights = df_flights.drop(
    'year', 'month', 'day',
    'dep_time', 'dep_delay', 'arr_time', 'arr_delay',
    'carrier', 'flight', 'air_time', 'distance', 'hour', 'minute',
)

In [None]:
# Suffix the airport key and its quality (`qa_*`) columns with `_origin`, so the
# columns joined here can be told apart from the destination ones joined later.
# NOTE(review): the `qa_*` columns are not created anywhere in the visible
# cells; presumably an earlier step adds them — confirm.
df_airports = (df_airports.withColumnRenamed('faa',     'faa_origin')
                          .withColumnRenamed('qa_faa',  'qa_faa_origin')
                          .withColumnRenamed('qa_name', 'qa_name_origin')
                          .withColumnRenamed('qa_lat',  'qa_lat_origin')
                          .withColumnRenamed('qa_lon',  'qa_lon_origin')
                          .withColumnRenamed('qa_alt',  'qa_alt_origin')
                          .withColumnRenamed('qa_tz',   'qa_tz_origin')
                          .withColumnRenamed('qa_dst',  'qa_dst_origin'))

# Report dataframe: every flight, with its origin airport attached when the
# airport code matches (left join keeps flights without a match).
df_relatorio = df_flights.join(df_airports,
                               df_airports['faa_origin'] == df_flights['origin'],
                               'left')

# Preview only a few rows: converting the full join result to pandas would
# collect every row on the driver just to render a table.
df_relatorio.limit(5).toPandas()

In [None]:
# Re-label the airport columns from `_origin` to `_dest` so the same table can
# be joined a second time, now against the flight destination.
for antigo, novo in (
    ('faa_origin',     'faa_dest'),
    ('qa_faa_origin',  'qa_faa_dest'),
    ('qa_name_origin', 'qa_name_dest'),
    ('qa_lat_origin',  'qa_lat_dest'),
    ('qa_lon_origin',  'qa_lon_dest'),
    ('qa_alt_origin',  'qa_alt_dest'),
    ('qa_tz_origin',   'qa_tz_dest'),
    ('qa_dst_origin',  'qa_dst_dest'),
):
    df_airports = df_airports.withColumnRenamed(antigo, novo)

# Left join keeps every report row, even when the destination has no match.
df_relatorio = df_relatorio.join(
    df_airports,
    on=(df_airports['faa_dest'] == df_relatorio['dest']),
    how='left',
)

In [None]:
# Suffix the plane columns with `_tailnum` so they do not collide with the
# `tailnum` column already present in the report.
for antigo, novo in (
    ('tailnum',    'tailnum_tailnum'),
    ('qa_tailnum', 'qa_tailnum_tailnum'),
    ('qa_year',    'qa_year_tailnum'),
):
    df_planes = df_planes.withColumnRenamed(antigo, novo)

# Attach the plane data to each report row; rows without a match are kept.
df_relatorio = df_relatorio.join(
    df_planes,
    on=(df_relatorio['tailnum'] == df_planes['tailnum_tailnum']),
    how='left',
)

In [None]:
# The join keys are no longer needed once the three tables are combined.
df_relatorio = df_relatorio.drop(
    'tailnum', 'origin', 'dest',
    'faa_origin', 'faa_dest', 'tailnum_tailnum',
)

# -----------------------------------------------------------------------------------------------------------
## 2.

In [None]:
# For every report column, count the rows per first character of its value and
# print the resulting table.
for c in df_relatorio.columns:
    (df_relatorio
        .groupBy(F.substring(c, 1, 1).alias(c))
        .count()
        .show())

In [None]:
# One bucket per leading character; each entry is a (column name, row count) pair.
lista_m = []
lista_f = []
lista_i = []
lista_s = []
lista_t = []
lista_c = []
lista_none = []

# Dispatch table from the grouping key to its bucket. The key is a substring of
# length one, so it can never equal the four-character string 'None'; missing
# values arrive as the Python `None` object, which is therefore used as the key.
listas_por_codigo = {
    'M': lista_m,
    'F': lista_f,
    'I': lista_i,
    'S': lista_s,
    'T': lista_t,
    'C': lista_c,
    None: lista_none,
}

for c in df_relatorio.columns:
    # One (first character, count) row per distinct leading character of column `c`.
    linha = df_relatorio.groupBy(F.substring(c, 1, 1).alias(c)).count().collect()

    for clas, contagem in linha:
        destino = listas_por_codigo.get(clas)
        # Leading characters without a bucket are ignored.
        if destino is not None:
            destino.append((c, contagem))

In [None]:
def max_lista(lista):
    """Return the entry with the highest count from a list of pairs.

    Args:
        lista: sequence of items whose index 0 is a column name and whose
            index 1 is a numeric count.

    Returns:
        A ``(coluna, valor)`` tuple for the first item holding the largest
        count. When ``lista`` is empty, or no count is greater than zero,
        ``(None, 0)`` is returned instead of failing on an unbound variable.
    """
    coluna = None
    valor = 0

    for item in lista:
        # Strict comparison keeps the first item among equal counts.
        if item[1] > valor:
            valor = item[1]
            coluna = item[0]

    return coluna, valor

# -----------------------------------------------------------------------------------------------------------
## 3.

In [None]:
# Column with the most 'M' occurrences. The values are computed before they are
# printed so the notebook runs top to bottom on a fresh kernel.
m, valor = max_lista(lista_m)

print(f'M: {m} com {valor}')

# -----------------------------------------------------------------------------------------------------------
## 4.

In [None]:
# Column with the most 'F' occurrences and how many rows it has.
f, valor = max_lista(lista_f)

In [None]:
# Report the column that has the most 'F' occurrences and its count.
print(f'F: {f} com {valor}')

# -----------------------------------------------------------------------------------------------------------
## 5.

In [None]:
# Column with the most 'I' occurrences and how many rows it has.
i, valor = max_lista(lista_i)

In [None]:
# Report the column that has the most 'I' occurrences and its count.
print(f'I: {i} com {valor}')

# -----------------------------------------------------------------------------------------------------------
## Report quality
# -----------------------------------------------------------------------------------------------------------
## 1.

In [None]:
# Suffix every airport column with `_origin` ahead of the join on flight origin.
# NOTE(review): 'region', 'type', 'military' and 'administration' are not part
# of `schema_airports` as declared in this notebook, and earlier cells have
# already renamed/dropped columns of `df_airports`; presumably this section
# expects a differently prepared airports table — confirm.
for antigo, novo in (
    ('faa', 'faa_origin'),
    ('name', 'name_origin'),
    ('lat', 'lat_origin'),
    ('lon', 'lon_origin'),
    ('alt', 'alt_origin'),
    ('tz', 'tz_origin'),
    ('dst', 'dst_origin'),
    ('region', 'region_origin'),
    ('type', 'type_origin'),
    ('military', 'military_origin'),
    ('administration', 'administration_origin'),
):
    df_airports = df_airports.withColumnRenamed(antigo, novo)

# Every flight, with its origin airport attached when the code matches.
df_proc = df_flights.join(
    df_airports,
    on=(df_airports['faa_origin'] == df_flights['origin']),
    how='left',
)

# Show a single row, one field per line, to inspect the joined columns.
df_proc.show(n=1, vertical=True)

In [None]:
# Switch the airport columns from `_origin` to `_dest` so the same table can be
# joined again, this time on the flight destination.
for antigo, novo in (
    ('faa_origin',            'faa_dest'),
    ('name_origin',           'name_dest'),
    ('lat_origin',            'lat_dest'),
    ('lon_origin',            'lon_dest'),
    ('alt_origin',            'alt_dest'),
    ('tz_origin',             'tz_dest'),
    ('dst_origin',            'dst_dest'),
    ('region_origin',         'region_dest'),
    ('type_origin',           'type_dest'),
    ('military_origin',       'military_dest'),
    ('administration_origin', 'administration_dest'),
):
    df_airports = df_airports.withColumnRenamed(antigo, novo)

# Left join keeps flights whose destination has no matching airport.
df_proc = df_proc.join(
    df_airports,
    on=(df_airports['faa_dest'] == df_proc['dest']),
    how='left',
)

In [None]:
# Rename the plane key so it does not clash with the flights' `tailnum` column.
df_planes = df_planes.withColumnRenamed('tailnum', 'tailnum_planes')

# Attach the plane data to each flight; flights without a match are kept.
df_proc = df_proc.join(
    df_planes,
    on=(df_planes['tailnum_planes'] == df_proc['tailnum']),
    how='left',
)

In [None]:
# Show a single row, one field per line, to inspect the final set of columns.
df_proc.show(n=1, vertical=True)

# -----------------------------------------------------------------------------------------------------------
## 2.

In [None]:
# Number of distinct destination airport names per destination region.
(df_proc.groupBy('region_dest')
        .agg(F.countDistinct('name_dest').alias('count'))
        .show())

# -----------------------------------------------------------------------------------------------------------
## 3.

In [None]:
# Largest altitude difference between origin and destination airports.
# The absolute value is used so that routes whose destination is higher than
# the origin are considered too; a signed maximum would only ever report the
# largest descent.
(df_proc.agg(F.max(F.abs(F.col('alt_origin') - F.col('alt_dest'))).alias('maior diferença'))
        .show())

# -----------------------------------------------------------------------------------------------------------
## 4.

In [None]:
# Average combined (departure + arrival) delay, rounded up, over the flights
# that were late on at least one of the two.
(
    df_proc
    .filter((F.col('dep_delay') > 0) | (F.col('arr_delay') > 0))
    .agg(
        F.ceil(F.avg(F.col('dep_delay') + F.col('arr_delay')))
         .alias('Atraso médio (min)')
    )
    .show()
)

# -----------------------------------------------------------------------------------------------------------
## 5.

In [None]:
# Average combined (departure + arrival) delay per destination region, rounded up.
# Only flights that were actually late (delay strictly greater than zero) are
# considered, matching the filter used by the other delay queries in this
# notebook; `>= 0` would also count on-time flights and skew the averages.
(df_proc.where((F.col('dep_delay') > 0) | (F.col('arr_delay') > 0))
        .groupBy(F.col('region_dest'))
        .agg( F.ceil(F.avg(F.col('dep_delay') + F.col('arr_delay'))).alias('Atraso Médio (min)') )
        .show())

# -----------------------------------------------------------------------------------------------------------
## 6.

In [None]:
# Number of flights per departure year.
# NOTE(review): `dep_datetime` is not declared in `schema_flights` in this
# notebook; presumably it is derived in a step not shown here — confirm.
(df_proc.groupBy(F.year('dep_datetime'))
        .count()
        .show())

In [None]:
# Total combined (departure + arrival) delay per departure year, over the
# flights that were late on at least one of the two.
(
    df_proc
    .filter((F.col('dep_delay') > 0) | (F.col('arr_delay') > 0))
    .groupBy(F.year('dep_datetime').alias('year'))
    .agg(
        F.sum(F.col('dep_delay') + F.col('arr_delay'))
         .alias('atraso acumulado (min)')
    )
    .show()
)

# -----------------------------------------------------------------------------------------------------------
## 7.

In [None]:
# Number of flights per destination region and departure year.
(df_proc.groupBy('region_dest', F.year('dep_datetime'))
        .count()
        .show())

In [None]:
# Total combined (departure + arrival) delay per destination region and
# departure year, over the flights that were late on at least one of the two.
# NOTE(review): the output label mentions arrival ("chegada") although the sum
# also includes the departure delay — confirm which one is intended.
(
    df_proc
    .filter((F.col('dep_delay') > 0) | (F.col('arr_delay') > 0))
    .groupBy(
        F.col('region_dest').alias('região'),
        F.year('dep_datetime').alias('Ano'),
    )
    .agg(
        F.sum(F.col('dep_delay') + F.col('arr_delay'))
         .alias('Atraso Acumulado chegada (min)')
    )
    .show()
)

# -----------------------------------------------------------------------------------------------------------
## 8.

In [None]:
# Average time in the air over all flights, rounded up.
(df_proc.agg(F.ceil(F.avg('air_time')).alias('tempo de voo médio (min)'))
        .show())

# -----------------------------------------------------------------------------------------------------------
## 9.

In [None]:
# Average time in the air per destination region, rounded up.
(df_proc.groupBy('region_dest')
        .agg(F.ceil(F.avg('air_time')).alias('média de voo (min)'))
        .show())

# -----------------------------------------------------------------------------------------------------------
## 10.

In [None]:
# Average time in the air per route (origin/destination pair), rounded up.
(
    df_proc
    .groupBy('origin', 'dest')
    .agg(F.ceil(F.avg('air_time')).alias('média de voo (min)'))
    .show()
)