In [1]:
# instalando pacotes necessários
!pip install pyspark
!pip install findspark



In [2]:
# Initialise findspark; this runs before any pyspark module is imported below
import findspark

findspark.init()

In [3]:
# Importando os módulos necessários
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

import pandas as pd

In [4]:
# Single configuration shared by the Spark context and the session builder.
# Starting a bare SparkContext() first would leave the builder's master/appName
# without effect, because the builder's getOrCreate() reuses a running context.
conf = (SparkConf()
        .setMaster('local[7]')
        .setAppName('Aceleração Pyspark - Capgemini'))

# Create the Spark context (or reuse the active one, so that re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once").
sc = SparkContext.getOrCreate(conf)

# Spark session *builder*: the session itself is obtained later through
# spark.getOrCreate(), so `spark` intentionally stays a builder here.
spark = (SparkSession.builder
                     .master('local[7]')
                     .appName('Aceleração Pyspark - Capgemini'))

In [5]:
# Build the schemas
# Airports: every column is declared nullable.
schema_airports = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("faa",  StringType()),
        ("name", StringType()),
        ("lat",  FloatType()),
        ("lon",  FloatType()),
        ("alt",  IntegerType()),
        ("tz",   IntegerType()),
        ("dst",  StringType()),
    )
])

# Planes: every column is declared nullable.
schema_planes = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("tailnum",      StringType()),
        ("year",         IntegerType()),
        ("type",         StringType()),
        ("manufacturer", StringType()),
        ("model",        StringType()),
        ("engines",      IntegerType()),
        ("seats",        IntegerType()),
        ("speed",        IntegerType()),
        ("engine",       StringType()),
    )
])

# Flights: every column is declared nullable; dep_time, arr_time and flight
# are kept as strings, the remaining numeric-looking columns as integers.
schema_flights = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("year",      IntegerType()),
        ("month",     IntegerType()),
        ("day",       IntegerType()),
        ("dep_time",  StringType()),
        ("dep_delay", IntegerType()),
        ("arr_time",  StringType()),
        ("arr_delay", IntegerType()),
        ("carrier",   StringType()),
        ("tailnum",   StringType()),
        ("flight",    StringType()),
        ("origin",    StringType()),
        ("dest",      StringType()),
        ("air_time",  IntegerType()),
        ("distance",  IntegerType()),
        ("hour",      IntegerType()),
        ("minute",    IntegerType()),
    )
])

In [7]:
# Create all dataframes
def _read_csv(session, path, schema):
    """Read the headered CSV file at `path` into a DataFrame.

    Args:
        session: SparkSession used to read the file.
        path: location of the CSV file.
        schema: StructType applied to the file instead of inferring types.

    Returns:
        The DataFrame produced by the CSV reader.
    """
    return (session.read
                   .format("csv")
                   .option("header", "true")
                   .schema(schema)
                   .load(path))


# `spark` is the session builder, so resolve the session once and reuse it
# for the three reads instead of calling getOrCreate() per dataframe.
_session = spark.getOrCreate()

df_airports = _read_csv(_session, "../data/airports.csv", schema_airports)
df_planes = _read_csv(_session, "../data/planes.csv", schema_planes)
df_flights = _read_csv(_session, "../data/flights.csv", schema_flights)

In [9]:
# Preview the first three rows of each dataframe, in loading order
for preview_df in (df_airports, df_planes, df_flights):
    preview_df.show(3)

+---+--------------------+---------+---------+----+---+---+
|faa|                name|      lat|      lon| alt| tz|dst|
+---+--------------------+---------+---------+----+---+---+
|04G|   Lansdowne Airport|41.130474|-80.61958|1044| -5|  A|
|06A|Moton Field Munic...| 32.46057|-85.68003| 264| -5|  A|
|06C| Schaumburg Regional| 41.98934|-88.10124| 801| -6|  A|
+---+--------------------+---------+---------+----+---+---+
only showing top 3 rows

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|
+--

# -----------------------------------------------------------------------------------------------------------
# Quality airports
# -----------------------------------------------------------------------------------------------------------
## 1.

In [None]:
# Inspect rows whose faa code is the literal string 'NA'
faa_is_na = F.col('faa') == 'NA'
df_airports.where(faa_is_na).show(3)

In [None]:
# Whole-string pattern: one or more ASCII letters/digits and nothing else.
# Without the ^...$ anchors rlike() succeeds on any value that merely
# *contains* an alphanumeric character (e.g. "A-1"), so malformed codes
# were never flagged.
REGEX_ALPHANUM = r'^[0-9a-zA-Z]+$'

# qa_faa: "M" when faa is null, "F" when faa is not purely alphanumeric,
# and null (there is no otherwise branch) when neither condition applies.
df_airports = df_airports.withColumn("qa_faa",
                                     F.when(F.col("faa").isNull(), "M")
                                      .when(~F.col("faa").rlike(REGEX_ALPHANUM), "F"))

df_airports.show(5)
# Rows for which no faa issue was raised
df_airports.filter(F.col('qa_faa').isNull()).show()

In [None]:
# Number of rows per qa_faa flag (null means no issue was raised)
qa_faa_counts = df_airports.groupBy('qa_faa').count()
qa_faa_counts.show()

# -----------------------------------------------------------------------------------------------------------
## 2.

In [None]:
# Inspect rows whose name is the literal string 'NA'
name_is_na = F.col('name') == 'NA'
df_airports.where(name_is_na).show(5)

In [None]:
# qa_name: "M" when the name is missing, i.e. either a real null or the
# literal 'NA' placeholder; null otherwise.
# A bare `name == 'NA'` comparison evaluates to null for null names, so
# those rows would slip through unflagged without the explicit isNull().
name_is_missing = F.col('name').isNull() | (F.col('name') == 'NA')

df_airports = df_airports.withColumn('qa_name',
                                     F.when(name_is_missing, 'M'))

df_airports.show(5)

In [None]:
# Number of rows per qa_name flag (null means no issue was raised)
qa_name_counts = df_airports.groupBy('qa_name').count()
qa_name_counts.show()

# -----------------------------------------------------------------------------------------------------------
## 3.

In [None]:
# Inspect rows whose latitude is null
lat_is_null = F.col('lat').isNull()
df_airports.where(lat_is_null).show()

In [None]:
# Valid latitudes span [-90, 90] degrees; [-180, 180] is the longitude range,
# so using it here would let impossible latitudes pass the interval check.
LAT_MIN, LAT_MAX = -90, 90

# qa_lat: "M" when lat is null, "I" when it falls outside [LAT_MIN, LAT_MAX],
# "A" when its text form contains a letter; null when none of these apply.
# NOTE(review): lat is declared FloatType in the airports schema, so
# non-numeric text presumably reaches this point as null and is reported as
# "M" - confirm whether the "A" branch can ever fire on this column.
df_airports = df_airports.withColumn('qa_lat',
                                     F.when(F.col('lat').isNull(), 'M')
                                      .when(~F.col('lat').between(LAT_MIN, LAT_MAX), 'I')
                                      .when(F.col('lat').rlike('[a-zA-Z]'), 'A'))

df_airports.show(5)

In [None]:
# Number of rows per qa_lat flag (null means no issue was raised)
qa_lat_counts = df_airports.groupBy('qa_lat').count()
qa_lat_counts.show()