In [1]:
# instalando pacotes necessários
!pip install pyspark
!pip install findspark



In [2]:
# Initialise findspark before pyspark is imported in the next cell
import findspark
findspark.init()

In [3]:
# Importando os módulos necessários
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

import pandas as pd

In [4]:
# Configure the Spark session builder. `spark` stays a *builder* (not a
# session): later cells call `spark.getOrCreate()` on it.
spark = (SparkSession.builder
                     .master('local[7]')
                     .appName('Aceleração Pyspark - Capgemini'))

# Take the Spark context from the session produced by the builder instead of
# calling `SparkContext()` first: a context created up-front is simply reused
# by `getOrCreate()`, so the master / app name configured above would not be
# applied to it. This form is also safe to re-run, whereas a second
# `SparkContext()` call raises because only one context may be active.
sc = spark.getOrCreate().sparkContext

In [5]:
# Explicit schemas for the three CSV files (every column is nullable).

# airports.csv
schema_airports = (
    StructType()
    .add("faa", StringType(), True)
    .add("name", StringType(), True)
    .add("lat", FloatType(), True)
    .add("lon", FloatType(), True)
    .add("alt", IntegerType(), True)
    .add("tz", IntegerType(), True)
    .add("dst", StringType(), True)
)

# planes.csv
schema_planes = (
    StructType()
    .add("tailnum", StringType(), True)
    .add("year", IntegerType(), True)
    .add("type", StringType(), True)
    .add("manufacturer", StringType(), True)
    .add("model", StringType(), True)
    .add("engines", IntegerType(), True)
    .add("seats", IntegerType(), True)
    .add("speed", IntegerType(), True)
    .add("engine", StringType(), True)
)

# flights.csv (dep_time / arr_time / flight are kept as strings)
schema_flights = (
    StructType()
    .add("year", IntegerType(), True)
    .add("month", IntegerType(), True)
    .add("day", IntegerType(), True)
    .add("dep_time", StringType(), True)
    .add("dep_delay", IntegerType(), True)
    .add("arr_time", StringType(), True)
    .add("arr_delay", IntegerType(), True)
    .add("carrier", StringType(), True)
    .add("tailnum", StringType(), True)
    .add("flight", StringType(), True)
    .add("origin", StringType(), True)
    .add("dest", StringType(), True)
    .add("air_time", IntegerType(), True)
    .add("distance", IntegerType(), True)
    .add("hour", IntegerType(), True)
    .add("minute", IntegerType(), True)
)

In [7]:
# Load each CSV (header row present) into a DataFrame using its explicit schema.
df_airports = spark.getOrCreate().read.csv(
    "../data/airports.csv",
    schema=schema_airports,
    header=True,
)

df_planes = spark.getOrCreate().read.csv(
    "../data/planes.csv",
    schema=schema_planes,
    header=True,
)

df_flights = spark.getOrCreate().read.csv(
    "../data/flights.csv",
    schema=schema_flights,
    header=True,
)

In [9]:
# Print the first 3 rows of each DataFrame as a quick check of the load
df_airports.show(3)
df_planes.show(3)
df_flights.show(3)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
+---+--------------------+---------+---------+----+---+---+
only showing top 3 rows

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
+--

# -----------------------------------------------------------------------------------------------------------
# Quality airports
# -----------------------------------------------------------------------------------------------------------
## 1.

In [None]:
df_airports.filter(F.col('faa') == 'NA').show(3)

In [None]:
REGEX_ALPHANUM = r'[0-9a-zA-Z]+'

df_airports = df_airports.withColumn("qa_faa", 
                                     F.when(F.col("faa").isNull(), "M")
                                      .when(~F.col("faa").rlike(r'[0-9a-zA-Z]+'), "F"))

df_airports.show(5)
df_airports.filter(F.col('qa_faa').isNull()).show()

In [None]:
df_airports.groupBy(F.col('qa_faa')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 2.

In [None]:
df_airports.filter(F.col('name') == 'NA').show(5)

In [None]:
df_airports = df_airports.withColumn('qa_name',
                                     F.when(F.col('name') == 'NA', 'M'))

df_airports.show(5)

In [None]:
df_airports.groupBy(F.col('qa_name')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 3.

In [None]:
df_airports.filter(F.col('lat').isNull()).show()

In [None]:
df_airports = df_airports.withColumn('qa_lat',
                                     F.when(F.col('lat').isNull(), 'M')
                                      .when(~F.col('lat').between(-180, 180), 'I')
                                      .when(F.col('lat').rlike('[a-zA-Z]'), 'A'))

df_airports.show(5)

In [None]:
df_airports.groupBy(F.col('qa_lat')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 4.

In [None]:
df_airports.filter(F.col('lon').isNull()).show()

In [None]:
df_airports = df_airports.withColumn('qa_lon',
                                     F.when(F.col('lon').isNull(), 'M')
                                      .when(~F.col('lon').between(-180, 180), 'I')
                                      .when(F.col('lon').rlike('[a-zA-Z]'), 'A')
                                    )

df_airports.show(5)

In [None]:
df_airports.groupBy(F.col('qa_lon')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 5.

In [None]:
df_airports.filter(F.col('alt').isNull()).show()

In [None]:
df_airports = df_airports.withColumn('qa_alt',
                                     F.when(F.col('alt').isNull(), 'M')
                                      .when(F.col('alt') < 0, 'I')
                                      .when(F.col('alt').rlike('[a-zA-Z]'), 'A')
                                    )

df_airports.show(5)

In [None]:
df_airports.groupBy(F.col('qa_alt')).count().show()
df_airports.filter(F.col('qa_alt').isNotNull()).show()

# -----------------------------------------------------------------------------------------------------------
## 6.

In [None]:
df_airports.filter(F.col('tz').isNull()).show()

In [None]:
df_airports = df_airports.withColumn('qa_tz',
                                     F.when(F.col('tz').isNull(), 'M')
                                      .when(~F.col('tz').between(-11, 14), 'I')
                                      .when(F.col('tz').rlike('[a-zA-Z]'), 'A')
                                    )

df_airports.show(5)

In [None]:
df_airports.groupBy(F.col('qa_tz')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 7.

In [None]:
df_airports.filter(F.col('dst') == 'NA').show()

In [None]:
LIST_DST = ['E', 'A', 'S', 'O', 'Z', 'N', 'U']

df_airports = df_airports.withColumn('qa_dst',
                                     F.when(F.col('dst') == 'NA', 'M')
                                      .when(~F.col('dst').isin(LIST_DST), 'C')
                                      .when(F.col('dst').rlike('[0-9]'), 'N')
                                    )

df_airports.show(5)

In [None]:
df_airports.groupBy(F.col('qa_dst')).count().show()

# -----------------------------------------------------------------------------------------------------------
# Quality planes
# -----------------------------------------------------------------------------------------------------------
## 1.

In [None]:
df_planes.filter(F.col('tailnum') == 'NA').show()

In [None]:
REGEX_TAILNUM = r'(^[N][0-9]{4}[A-Z]$)|(^[N][0-9]{3}[A-Z]{2})|(^[N][0-9]{3}[A-Z]$)'

df_planes = df_planes.withColumn('qa_tailnum',
                                 F.when(F.col('tailnum') == 'NA', 'M')
                                  .when(~F.length(F.col('tailnum')).between(5, 6), 'S')
                                  .when(~F.col('tailnum').rlike('^N'), 'FN')
                                  .when(F.col('tailnum').rlike('(N0)|([IO])'), 'FE')
                                  .when(~F.col('tailnum').rlike(REGEX_TAILNUM), 'F')
                                )

df_planes.show(5)

In [None]:
df_planes.groupBy(F.col('qa_tailnum')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 2.

In [None]:
df_planes.filter(F.col('year').isNull()).show(5)

In [None]:
df_planes = df_planes.withColumn('qa_year',
                                 F.when(F.col('year').isNull(), 'M')
                                  .when(F.col('year') < 1950,   'I')
                                )

df_planes.show(5)

In [None]:
df_planes.groupBy(F.col('qa_year')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 3.

In [None]:
df_planes.filter(F.col('type') == 'NA').show()

In [None]:
LIST_TYPE = ['Fixed wing multi engine', 'Fixed wing single engine', 'Rotorcraft']
df_planes = df_planes.withColumn('qa_type',
                                 F.when(F.col('type') == 'NA',          'M')
                                  .when(~F.col('type').isin(LIST_TYPE), 'C')
                                )

df_planes.show(5)

In [None]:
df_planes.groupBy(F.col('qa_type')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 4.

In [None]:
df_planes.filter(F.col('type') == 'NA').show()

In [None]:
LIST_MAN = [ "AIRBUS", "BOEING", "BOMBARDIER", "CESSNA", "EMBRAER", "SIKORSKY", "CANADAIR", "PIPER", 
             "MCDONNELL DOUGLAS", "CIRRUS", "BELL", "KILDALL GARY", "LAMBERT RICHARD", "BARKER JACK", "ROBINSON HELICOPTER", 
             "GULFSTREAM", "MARZ BARRY"]

df_planes = df_planes.withColumn('qa_manufacturer',
                                 F.when(F.col('manufacturer') == 'NA',         'M')
                                  .when(~F.col('manufacturer').isin(LIST_MAN), 'C')
                                )

df_planes.show(5)

In [None]:
df_planes.groupBy(F.col('qa_manufacturer')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 5.

In [None]:
df_planes.filter(F.col('model') == 'NA').show()

In [None]:
df_planes = df_planes.withColumn('qa_model',
                                 F.when(F.col('model') == 'NA', 'M')
                                  .when(((F.col("manufacturer") == "AIRBUS") & (F.col("model").rlike('^[A]') == False)) |
                                        ((F.col("manufacturer") == "BOEING") & (F.col("model").rlike('^[7]') == False)) |
                                        ((F.col("manufacturer") == "BOMBARDIER") & (F.col("model").rlike('^[C][L]') == False)) |
                                        ((F.col("manufacturer") == "CANADAIR") & (F.col("model").rlike('^[C][L]') == False)) |
                                        ((F.col("manufacturer") == "MCDONNELL DOUGLAS") & (F.col("model").rlike('(^[M][D]|^[D][C])') == False)), 'F')
                                )

df_planes.show(5)

In [None]:
df_planes.groupBy(F.col('qa_model')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 6.

In [None]:
df_planes.filter(F.col('engines').isNull()).show()

In [None]:
df_planes = df_planes.withColumn('qa_engines',
                                 F.when(F.col('engines').isNull(),          'M')
                                  .when(~F.col('engines').between(1, 4),    'I')
                                  .when(F.col('engines').rlike('[a-zA-Z]'), 'A')
                                )

df_planes.show(3)

In [None]:
df_planes.groupBy(F.col('qa_engines')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 7.

In [None]:
df_planes.filter(F.col('seats').isNull()).show()

In [None]:
df_planes = df_planes.withColumn('qa_seats',
                                 F.when(F.col('seats').isNull(),          'M')
                                  .when(~F.col('seats').between(2, 500),  'I')
                                  .when(F.col('seats').rlike('[a-zA-Z]'), 'A')
                                )

df_planes.show(3)

In [None]:
df_planes.groupBy(F.col('qa_seats')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 8.

In [None]:
df_planes.filter(F.col('speed').isNull()).count()

In [None]:
df_planes = df_planes.withColumn('qa_speed',
                                 F.when(F.col('speed').isNull(),          'M')
                                  .when(~F.col('speed').between(50, 150), 'I')
                                  .when(F.col('speed').rlike('[a-zA-Z]'), 'A')
                                )

df_planes.show(3)

In [None]:
df_planes.groupBy(F.col('qa_speed')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 9.

In [None]:
df_planes.filter(F.col('engine').isNull()).show()

In [None]:
LIST_ENGINE = ["Turbo-fan", "Turbo-jet", "Turbo-prop", "Turbo-shaft", "4 Cycle"]

df_planes = df_planes.withColumn('qa_engine',
                                 F.when(F.col('engine') == 'NA',            'M')
                                  .when(~F.col('engine').isin(LIST_ENGINE), 'C')
                                )

df_planes.show(3)

In [None]:
df_planes.groupBy(F.col('qa_engine')).count().show()

# -----------------------------------------------------------------------------------------------------------
# Quality flights
# -----------------------------------------------------------------------------------------------------------
## 1.

In [None]:
df_flights.filter(F.col('year').isNull()).show()

In [None]:
df_flights = df_flights.withColumn('qa_year_month_day',
                                   F.when(F.col('year').isNull(),         'MY')
                                    .when(F.col('month').isNull(),        'MM')
                                    .when(F.col('day').isNull(),          'MD')
                                    .when(F.col('year') < 1950,           'IY')
                                    .when(~F.col('month').between(1, 12), 'IM')
                                    .when(~F.col('day').between(1,31),    'ID')
                                    .when((F.col('month') == 2) & (~F.col('day').between(1,29)), 'ID')
                                  )

df_flights.show(3)

In [None]:
df_flights.groupBy(F.col('qa_year_month_day')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 2.

In [None]:
df_flights.filter(F.col('hour').isNull()).show(2)

In [None]:
df_flights = df_flights.withColumn('qa_hour_minute',
                                   F.when(F.col('hour').isNull(),          'MH')
                                    .when(F.col('minute').isNull(),        'MM')
                                    .when(~F.col('hour').between(0, 24),   'IH')
                                    .when(~F.col('minute').between(0, 59), 'IM')
                                  )

df_flights.show(3)

In [None]:
df_flights.groupBy(F.col('qa_hour_minute')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 3.

In [None]:
df_flights.filter(F.col('dep_time') == 'NA').show(2)

In [None]:
REGEX_TIME = r'^(([0-1]?[0-9])|(2[0-3]))([0-5][0-9])$'

df_flights = df_flights.withColumn('qa_dep_arr_time',
                                   F.when(F.col('dep_time') == 'NA',            'MD')
                                    .when(F.col('arr_time') == 'NA',            'MA')
                                    .when(~F.col('dep_time').rlike(REGEX_TIME), 'FD')
                                    .when(~F.col('arr_time').rlike(REGEX_TIME), 'FA')
                                  )

df_flights.show(2)

In [None]:
df_flights.groupBy(F.col('qa_dep_arr_time')).count().show()

# -----------------------------------------------------------------------------------------------------------
## 4.

In [None]:
df_flights = df_flights.withColumn('qa_dep_arr_delay',
                                   F.when(F.col('dep_delay').isNull(), 'MD')
                                    .when(F.col('arr_delay').isNull(), 'MA')
                                  )

In [None]:
df_flights.groupBy(F.col('qa_dep_arr_delay')).count().show()