In [1]:
# Instalação dos pacotes necessários
!pip install pyspark
!pip install findspark



In [2]:
# Import findspark and initialise it before the pyspark imports in the next cell.
# NOTE(review): presumably init() is what makes `pyspark` importable here — confirm it is still needed when pyspark is pip-installed.


import findspark
findspark.init()

In [3]:
## importando bibliotecas necessárias

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import when,lit,length,trim
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import numpy as np

In [4]:
# Spark configuration shared by the context and the session builder.
# Building it once guarantees that the master ("local[7]") and the application
# name are actually applied to the context: a context created with no
# configuration would be reused as-is by the builder, silently ignoring them.
spark_conf = (SparkConf()
              .setMaster("local[7]")
              .setAppName("Aceleração PySpark - Capgemini"))

# Create the Spark context, or reuse the one already running in this kernel.
# getOrCreate() keeps the cell re-runnable; a bare SparkContext() raises when a
# context already exists.
sc = SparkContext.getOrCreate(conf=spark_conf)

# Session *builder* (not a session): later cells call spark.getOrCreate()
# to obtain the actual SparkSession.
spark = SparkSession.builder.config(conf=spark_conf)

## Dataset airports.csv:

In [5]:
# Load airports.csv as a CSV source whose first row is the header.
# No schema is supplied here, so column types are whatever the CSV reader assigns by default.
df_airports = spark.getOrCreate().read.load(
    "airports.csv",
    format="csv",
    header="true",
)

In [6]:
# Register the airports DataFrame as the temporary view 'airports'
# (the DataFrame itself is left unchanged).
df_airports.createOrReplaceTempView(name='airports')

#### Pergunta 1

In [7]:
# Requirement: display the airports with a data-quality flag `qa_faa` for the `faa` code.
#   'M' -> missing: NULL, empty, or containing a tab / space
#   'F' -> format violation: not made of 3 to 5 alphanumeric characters
# Rows that pass both checks keep NULL in `qa_faa`.
# The result is only displayed; df_airports itself is not modified.
#
# The 'F' test is a single anchored regex. The previous version flagged every
# letters-only code (e.g. a plain three-letter code) as a format violation and
# carried a single-digit test that the length check had already made unreachable.
df_airports.withColumn('qa_faa',
                        (when((df_airports.faa == '') |
                              (df_airports.faa.isNull()) |
                              (df_airports.faa.rlike('\t')) |
                              (df_airports.faa.rlike(' +'))
                              , lit('M'))
                        .when(~df_airports.faa.rlike("^[a-zA-Z0-9]{3,5}$")
                              , lit('F'))
                        )
                   ).show()

+---+--------------------+----------------+-----------------+----+---+---+------+
|faa|                name|             lat|              lon| alt| tz|dst|qa_faa|
+---+--------------------+----------------+-----------------+----+---+---+------+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A|  null|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A|  null|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A|  null|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A|  null|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A|  null|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A|  null|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A|  null|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A|  null|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U|  null|
|0S9|Jefferson C

#### Pergunta 2

In [8]:
# Requirement: display the airports with a data-quality flag `qa_name` for the `name` column.
#   'M' -> missing: NULL or empty string
# Rows with a name keep NULL in `qa_name`. df_airports itself is not modified.
#
# The NULL test uses isNull(): `name == None | (...)` is parsed by Python as
# `name == (None | (...))` because `|` binds tighter than `==`, and even a
# parenthesised `== None` becomes `= NULL`, which is never true.
df_airports.withColumn('qa_name',
                  (when((df_airports.name.isNull()) |
                        (df_airports.name == ''), lit('M')
                       )
                  )
             ).show()

+---+--------------------+----------------+-----------------+----+---+---+-------+
|faa|                name|             lat|              lon| alt| tz|dst|qa_name|
+---+--------------------+----------------+-----------------+----+---+---+-------+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A|   null|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A|   null|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A|   null|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A|   null|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A|   null|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A|   null|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A|   null|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A|   null|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U|   null|
|0S9

#### Pergunta 3

In [9]:
# Requirement: display the airports with a data-quality flag `qa_lat` for the `lat` column.
#   'M' -> missing: NULL or empty string
#   'I' -> invalid: below -180 or above 180
#   'A' -> alphabetic: value consists only of ASCII letters
# Branches are evaluated in this order; rows matching none keep NULL.
# df_airports itself is not modified.
#
# The NULL test uses isNull(): `lat == None | (...)` is parsed by Python as
# `lat == (None | (...))` because `|` binds tighter than `==`.
# NOTE(review): the bounds are +/-180; geographic latitude is normally limited
# to +/-90 — confirm against the requirement before tightening.
df_airports.withColumn('qa_lat',
                 (when((df_airports.lat.isNull()) |
                       (df_airports.lat == ''), lit('M')
                      )
                 .when((df_airports.lat < -180) |
                       (df_airports.lat > 180), lit('I'))
                 .when(df_airports.lat.rlike("^[a-zA-Z]*$"), lit('A'))
                 )
             ).show()

+---+--------------------+----------------+-----------------+----+---+---+------+
|faa|                name|             lat|              lon| alt| tz|dst|qa_lat|
+---+--------------------+----------------+-----------------+----+---+---+------+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A|  null|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A|  null|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A|  null|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A|  null|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A|  null|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A|  null|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A|  null|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A|  null|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U|  null|
|0S9|Jefferson C

#### Pergunta 4

In [10]:
# Requirement: display the airports with a data-quality flag `qa_lon` for the `lon` column.
#   'M' -> missing: NULL or empty string
#   'I' -> invalid: below -180 or above 180
#   'A' -> alphabetic: value consists only of ASCII letters
# Branches are evaluated in this order; rows matching none keep NULL.
# df_airports itself is not modified.
#
# The NULL test uses isNull(): `lon == None` is translated to `lon = NULL`,
# which evaluates to NULL (never true), so missing values were not flagged.
df_airports.withColumn('qa_lon',
                 (when((df_airports.lon.isNull()) |
                       (df_airports.lon == ''), lit('M'))
                 .when((df_airports.lon < -180) |
                       (df_airports.lon > 180), lit('I'))
                 .when(df_airports.lon.rlike("^[a-zA-Z]*$"), lit('A'))
                 )
             ).show()

+---+--------------------+----------------+-----------------+----+---+---+------+
|faa|                name|             lat|              lon| alt| tz|dst|qa_lon|
+---+--------------------+----------------+-----------------+----+---+---+------+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A|  null|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A|  null|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A|  null|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A|  null|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A|  null|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A|  null|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A|  null|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A|  null|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U|  null|
|0S9|Jefferson C

#### Pergunta 5

In [11]:
# Requirement: display the airports with a data-quality flag `qa_alt` for the `alt` column.
#   'M' -> missing: NULL or empty string
#   'I' -> invalid: negative value
#   'A' -> alphabetic: value consists only of ASCII letters
# Branches are evaluated in this order; rows matching none keep NULL.
# df_airports itself is not modified.
#
# The NULL test uses isNull(): `alt == None` is translated to `alt = NULL`,
# which evaluates to NULL (never true), so missing values were not flagged.
df_airports.withColumn('qa_alt',
                 (when((df_airports.alt.isNull()) |
                       (df_airports.alt == ''), lit('M'))
                 .when((df_airports.alt < 0), lit('I'))
                 .when(df_airports.alt.rlike("^[a-zA-Z]*$"), lit('A'))
                 )
             ).show()

+---+--------------------+----------------+-----------------+----+---+---+------+
|faa|                name|             lat|              lon| alt| tz|dst|qa_alt|
+---+--------------------+----------------+-----------------+----+---+---+------+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A|  null|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A|  null|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A|  null|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A|  null|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A|  null|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A|  null|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A|  null|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A|  null|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U|  null|
|0S9|Jefferson C

#### Pergunta 6

In [12]:
# Requirement: display the airports with a data-quality flag `qa_tz` for the `tz` column.
#   'M' -> NULL or empty string
#   'I' -> below -11 or above 14
#   'A' -> value consists only of ASCII letters
# Branches are evaluated in this order; rows matching none keep NULL.
# The result is only shown; df_airports itself is not modified.
df_airports.withColumn(
    'qa_tz',
    F.when(F.col('tz').isNull() | (F.col('tz') == ''), F.lit('M'))
     .when((F.col('tz') < -11) | (F.col('tz') > 14), F.lit('I'))
     .when(F.col('tz').rlike("^[a-zA-Z]*$"), F.lit('A'))
).show()

+---+--------------------+----------------+-----------------+----+---+---+-----+
|faa|                name|             lat|              lon| alt| tz|dst|qa_tz|
+---+--------------------+----------------+-----------------+----+---+---+-----+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A| null|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A| null|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A| null|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A| null|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A| null|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A| null|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A| null|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A| null|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U| null|
|0S9|Jefferson County ...|  

#### Pergunta 7

In [13]:
# Requirement: display the airports with a data-quality flag `qa_dst` for the `dst` column.
#   'M' -> missing: NULL or empty string
#   'N' -> numeric: value consists only of digits
#   'C' -> category violation: value is not one of `expected_categories`
# Branches are evaluated in this order; rows matching none keep NULL.
# df_airports itself is not modified.
#
# The numeric test must come before the category test: a digits-only value is
# never one of the expected letters, so with the category test first the 'N'
# branch could never be reached.
expected_categories = ['E', 'A', 'S', 'O', 'Z', 'N', 'U']
df_airports.withColumn('qa_dst',
                 (when(
                     ((df_airports.dst.isNull()) |
                      (df_airports.dst == '')),
                     lit('M')
                     )
                  .when(
                      (df_airports.dst.rlike("^[0-9]*$")),
                      lit('N')
                      )
                  .when(
                      (~df_airports.dst.isin(expected_categories)),
                      lit('C')
                      )
                 )
             ).show()


+---+--------------------+----------------+-----------------+----+---+---+------+
|faa|                name|             lat|              lon| alt| tz|dst|qa_dst|
+---+--------------------+----------------+-----------------+----+---+---+------+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A|  null|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A|  null|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A|  null|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A|  null|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A|  null|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A|  null|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A|  null|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A|  null|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U|  null|
|0S9|Jefferson C

### Gerando arquivo .parquet

In [14]:
# Write the airports DataFrame to "airports_qa.parquet", overwriting any previous output.
# The data is collapsed into a single partition first; the "header" option is passed through unchanged.
# NOTE(review): the qa_* columns in the cells above are only displayed with .show() and never
# assigned back to df_airports, so they do not appear to be part of what is written here — confirm.
df_airports.repartition(1).write.save(
    "airports_qa.parquet",
    format="parquet",
    mode='overwrite',
    header="true",
)

## Dataset planes.csv:

In [15]:
# Load planes.csv as a CSV source whose first row is the header.
# No schema is supplied here, so column types are whatever the CSV reader assigns by default.
df_planes = spark.getOrCreate().read.load(
    "planes.csv",
    format="csv",
    header="true",
)

In [16]:
# Register the planes DataFrame as the temporary view 'planes'
# (the DataFrame itself is left unchanged).
df_planes.createOrReplaceTempView(name='planes')

#### Pergunta 1

In [17]:
# Requirement: display the planes with a data-quality flag `qa_tailnum` for the `tailnum` column.
#   'M'  -> missing: NULL or empty string
#   'S'  -> size violation: trimmed length is not 6
#   'FN' -> does not start with 'N'
#   'FE' -> 'N' is followed by 'I', 'O' or '0'
#   'F'  -> any other deviation from N + 3 digits + 2 alphanumerics
# Branches are evaluated in this order; rows matching none keep NULL.
# df_planes itself is not modified.
#
# The specific checks ('FN', 'FE') run before the general format check ('F'):
# a value that does not start with 'N', or that has 'I'/'O' after the 'N',
# also fails the general pattern, so with 'F' first those flags were never
# produced. The 'FE' class is written [IO0]; inside brackets '|' is a literal
# character, not an alternation.
df_planes.withColumn("qa_tailnum",
                  (when(
                        (df_planes.tailnum.isNull()) |
                        (df_planes.tailnum == ''),
                        lit('M')
                       )
                   .when(
                        (length(trim(df_planes.tailnum)) != 6),
                        lit('S')
                       )
                   .when(
                        (~df_planes.tailnum.rlike("^[N]")),
                        lit('FN')
                       )
                   .when(
                        (df_planes.tailnum.rlike("^[N][IO0]")),
                        lit('FE')
                       )
                   .when(
                        (~df_planes.tailnum.rlike("^[N][0-9]{3}[a-zA-Z0-9]{2}$")),
                        lit('F')
                       )
                  )
              ).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  18

#### Pergunta 2

In [18]:
# Requirement: display the planes with a data-quality flag `qa_year` for the `year` column.
#   'M' -> NULL or empty string
#   'I' -> earlier than 1950, or equal to positive infinity
# Branches are evaluated in this order; rows matching none keep NULL.
# The result is only shown; df_planes itself is not modified.
df_planes.withColumn(
    "qa_year",
    F.when(F.col("year").isNull() | (F.col("year") == ''), F.lit('M'))
     .when((F.col("year") < 1950) | (F.col("year") == np.inf), F.lit('I'))
).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+-------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_year|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+-------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|


#### Pergunta 3

In [19]:
# Requirement: display the planes with a data-quality flag `qa_type` for the `type` column.
#   'M' -> NULL or empty string
#   'C' -> value is not one of the accepted categories
# Branches are evaluated in this order; rows matching none keep NULL.
# The result is only shown; df_planes itself is not modified.

# Accepted values for `type` (the list is named `engines`, but it is the one checked against `type`).
engines = ['Fixed wing multi engine', 'Fixed wing single engine', 'Rotorcraft']

df_planes.withColumn(
    "qa_type",
    F.when(F.col("type").isNull() | (F.col("type") == ''), F.lit('M'))
     .when(~F.col("type").isin(engines), F.lit('C'))
).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+-------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_type|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+-------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|   null|


#### Pergunta 4

In [20]:
# Requirement: display the planes with a data-quality flag `qa_manufacturer`.
#   'M' -> NULL or empty string
#   'C' -> value is not one of the accepted manufacturers
# Branches are evaluated in this order; rows matching none keep NULL.
# The result is only shown; df_planes itself is not modified.

# Accepted values for the `manufacturer` column.
manufacturer = [
    'AIRBUS INDUSTRIE', 'BOEING', 'BOMBARDIER', 'CESSNA', 'EMBRAER',
    'SIKORSKY', 'CANADAIR', 'PIPER', 'MCDONNELL DOUGLAS', 'CIRRUS',
    'BELL', 'KILDALL GARY', 'LAMBERT RICHARD', 'BARKER JACK',
    'ROBINSON HELICOPTER', 'GULFSTREAM', 'MARZ BARRY',
]

df_planes.withColumn(
    "qa_manufacturer",
    F.when(F.col("manufacturer").isNull() | (F.col("manufacturer") == ''), F.lit('M'))
     .when(~F.col("manufacturer").isin(manufacturer), F.lit('C'))
).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+---------------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_manufacturer|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+---------------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|           null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|           null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|           null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|           null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|           null|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|           null|
| N109UW|1999|Fixed wing mul

#### Pergunta 5

In [21]:
# Requirement: display the planes with a data-quality flag `qa_model` for the `model` column.
#   'M' -> missing: NULL or empty string
#   'F' -> the model does not carry the prefix expected for its manufacturer:
#            AIRBUS INDUSTRIE       -> 'A'
#            BOEING                 -> '7'
#            BOMBARDIER / CANADAIR  -> 'CL'
#            MCDONNELL DOUGLAS      -> 'MD' or 'DC'
# Rows matching none of the branches keep NULL. df_planes itself is not modified.
# To test one rule in isolation, comment out the other .when(...) branches.
#
# Fixes relative to the previous version:
#   * 'CANADIAR' and 'MCDONELL DOUGLAS' were misspelt, so those rules never matched any row.
#   * "^[CL]", "^[MD]" and "^[DC]" are character classes (one letter out of two);
#     the prefixes are now matched literally.
#   * The McDonnell Douglas rule OR-ed two negations, which flags a model unless it
#     matches both prefixes; "neither MD nor DC" is a single negated alternation.
df_planes.withColumn('qa_model',
                     (when((df_planes.model.isNull()) |
                           (df_planes.model == ''),
                           lit('M')
                          )
                      .when((df_planes.manufacturer == 'AIRBUS INDUSTRIE') &
                            (~df_planes.model.rlike("^A")),
                            lit('F')
                           )
                      .when((df_planes.manufacturer == 'BOEING') &
                            (~df_planes.model.rlike("^7")),
                            lit('F')
                           )
                      .when((df_planes.manufacturer.isin('BOMBARDIER', 'CANADAIR')) &
                            (~df_planes.model.rlike("^CL")),
                            lit('F')
                           )
                      .when((df_planes.manufacturer == 'MCDONNELL DOUGLAS') &
                            (~df_planes.model.rlike("^(MD|DC)")),
                            lit('F')
                           )
                     )
                 ).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_model|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|

#### Pergunta 6

In [22]:
# Requirement: display the planes with a data-quality flag `qa_engines` for the `engines` column.
#   'M' -> NULL or empty string
#   'I' -> not between 1 and 4 (inclusive)
#   'A' -> value consists only of ASCII letters
# Branches are evaluated in this order; rows matching none keep NULL.
# The result is only shown; df_planes itself is not modified.
df_planes.withColumn(
    'qa_engines',
    F.when(F.col('engines').isNull() | (F.col('engines') == ''), F.lit('M'))
     .when(~F.col('engines').between(1, 4), F.lit('I'))
     .when(F.col('engines').rlike('^[a-zA-Z]*$'), F.lit('A'))
).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_engines|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  18

#### Pergunta 7

In [23]:
# Requirement: display the planes with a data-quality flag `qa_seats` for the `seats` column.
#   'M' -> NULL or empty string
#   'I' -> not between 2 and 500 (inclusive)
#   'A' -> value consists only of ASCII letters
# Branches are evaluated in this order; rows matching none keep NULL.
# The result is only shown; df_planes itself is not modified.
df_planes.withColumn(
    'qa_seats',
    F.when(F.col('seats').isNull() | (F.col('seats') == ''), F.lit('M'))
     .when(~F.col('seats').between(2, 500), F.lit('I'))
     .when(F.col('seats').rlike('^[a-zA-Z]*$'), F.lit('A'))
).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_seats|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|    null|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|

#### Pergunta 8

In [24]:
# Requirement: display the planes with a data-quality flag `qa_speed` for the `speed` column.
#   'M' -> NULL or empty string
#   'I' -> not between 50 and 150 (inclusive)
#   'A' -> value consists only of ASCII letters
# Branches are evaluated in this order; rows matching none keep NULL.
# The result is only shown; df_planes itself is not modified.
df_planes.withColumn(
    'qa_speed',
    F.when(F.col('speed').isNull() | (F.col('speed') == ''), F.lit('M'))
     .when(~F.col('speed').between(50, 150), F.lit('I'))
     .when(F.col('speed').rlike('^[a-zA-Z]*$'), F.lit('A'))
).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_speed|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|       A|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|       A|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|       A|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|       A|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|       A|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|       A|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|

#### Pergunta 9

In [25]:
# Requirement: display the planes with a data-quality flag `qa_engine` for the `engine` column.
#   'M' -> missing: NULL or empty string
#   'C' -> category violation: value is not one of `engine_categories`
# Branches are evaluated in this order; rows matching none keep NULL.
# df_planes itself is not modified.
#
# The flag column is named `qa_engine`, following the qa_<column> pattern used
# for every other flag (it was previously misspelt `qa_enginge`).

# Accepted values for the `engine` column.
engine_categories = ['Turbo-fan',
                      'Turbo-jet',
                      'Turbo-prop',
                      'Turbo-shaft',
                      '4 Cycle'
                     ]

df_planes.withColumn('qa_engine',
                     (when((df_planes.engine.isNull()) |
                           (df_planes.engine == ''),
                           lit('M')
                         )
                      .when(~df_planes.engine.isin(engine_categories),
                            lit('C')
                          )
                     )
                 ).show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_enginge|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|      null|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  18

### Gerando arquivo .parquet

In [27]:

# Write the planes DataFrame to "planes_qa.parquet", overwriting any previous output.
# The data is collapsed into a single partition first; the "header" option is passed through unchanged.
# NOTE(review): the qa_* columns in the cells above are only displayed with .show() and never
# assigned back to df_planes, so they do not appear to be part of what is written here — confirm.
df_planes.repartition(1).write.save(
    "planes_qa.parquet",
    format="parquet",
    mode='overwrite',
    header="true",
)

## Dataset flights.csv:

In [29]:
# Load flights.csv as a CSV source whose first row is the header.
# No schema is supplied here, so column types are whatever the CSV reader assigns by default.
df_flights = spark.getOrCreate().read.load(
    "flights.csv",
    format="csv",
    header="true",
)

In [None]:
# Keep only the columns used by the checks below, in this fixed order.
df_flights = df_flights.select(
    'year', 'month', 'day', 'hour', 'minute',
    'dep_time', 'arr_time', 'dep_delay', 'arr_delay',
    'carrier', 'tailnum', 'flight', 'origin', 'dest',
    'air_time', 'distance',
)

In [None]:
# Temporary view - does not alter the original structure.
# Registers df_flights under the name 'flights'.
df_flights.createOrReplaceTempView(name='flights')

#### Pergunta 1:

In [None]:
# Meeting the requirements: year/month/day quality flag.
# The rules are evaluated in order and the first match wins:
#   MY / MM / MD -> year / month / day is null or empty
#   IY           -> year earlier than 1950
#   IM           -> month outside 1..12
#   ID           -> day outside 1..29 in February, or outside 1..31 otherwise
# Rows matching no rule keep a null flag.
#
# Fixes relative to the previous version:
#   * the February rule compared `year` to 2; it now compares `month`
#   * the invalid-day rule emitted 'IY' (same code as invalid year); it now
#     emits 'ID' so the two cases can be told apart
#     NOTE(review): 'ID' is assumed to be the intended code - confirm against
#     the exercise statement.
df_flights = df_flights.withColumn('qa_year_month_day',
                                (when((df_flights.year.isNull()) |
                                      (df_flights.year == ''),
                                      lit('MY')
                                    )
                                 .when((df_flights.month.isNull()) |
                                       (df_flights.month == ''),
                                       lit('MM')
                                     )
                                 .when((df_flights.day.isNull()) |
                                       (df_flights.day == ''),
                                       lit('MD')
                                     )
                                 .when((df_flights.year < 1950),
                                       lit('IY')
                                     )
                                 .when((~df_flights.month.between(1, 12)),
                                       lit('IM')
                                     )
                                 .when(((df_flights.month == 2) & (~df_flights.day.between(1, 29))) |
                                       ((df_flights.month != 2) & (~df_flights.day.between(1, 31))),
                                       lit('ID')
                                     )
                                ))





#### Pergunta 2:

In [None]:
# Meeting the requirements: hour/minute quality flag (first match wins):
#   MH / MM -> hour / minute is null or empty
#   IH      -> hour outside 0..24
#   IM      -> minute outside 0..60
# NOTE(review): both ranges are inclusive, so hour 24 and minute 60 are
# accepted - confirm this is what the requirements ask for.
hour_col = df_flights['hour']
minute_col = df_flights['minute']

hour_minute_flag = (
    when(hour_col.isNull() | (hour_col == ''), lit('MH'))
    .when(minute_col.isNull() | (minute_col == ''), lit('MM'))
    .when(~hour_col.between(0, 24), lit('IH'))
    .when(~minute_col.between(0, 60), lit('IM'))
)

# The flag is only displayed; df_flights itself is not modified.
df_flights.withColumn('qa_hour_minute', hour_minute_flag).show(1)

#### Pergunta 3:

In [None]:
# Meeting the requirements: departure/arrival time quality flag.
# The rules are evaluated in order and the first match wins:
#   MD / MA -> dep_time / arr_time is null or empty
#   FD / FA -> dep_time / arr_time has fewer than 3 or more than 4 characters
#   FD / FA -> dep_time / arr_time has an hour or minute part out of range
#
# Fixes relative to the previous version:
#   * arr_time problems are reported as 'FA' (they used to reuse 'FD')
#   * the 4-character arr_time rule is guarded by the length of arr_time
#     (it used to test the length of dep_time)
#   * a value is flagged when the hour OR the minute part is out of range
#     (it used to require both parts to be invalid)
#   * substr() takes (start, length), so the slices are spelled out as such
# NOTE(review): 'FA' is assumed to be the intended arrival code, and the
# inclusive bounds 0..24 / 0..60 are kept from the original - confirm both.


def _hmm_out_of_range(time_col):
    """Return a boolean Column flagging H(H)MM values with an invalid part.

    Only 3-character (HMM) and 4-character (HHMM) values are inspected; the
    hour part must be within 0..24 and the minute part within 0..60.
    """
    three_chars = ((length(time_col) == 3) &
                   (~time_col.substr(1, 1).cast('int').between(0, 24) |
                    ~time_col.substr(2, 2).cast('int').between(0, 60)))
    four_chars = ((length(time_col) == 4) &
                  (~time_col.substr(1, 2).cast('int').between(0, 24) |
                   ~time_col.substr(3, 2).cast('int').between(0, 60)))
    return three_chars | four_chars


qa_dep_arr_time = df_flights.withColumn('qa_dep_arr_time',
                       (when((df_flights.dep_time.isNull()) |
                             (df_flights.dep_time == ''),
                             lit('MD')
                           )
                        .when((df_flights.arr_time.isNull()) |
                              (df_flights.arr_time == ''),
                              lit('MA')
                            )
                        .when((length(df_flights.dep_time) < 3) |
                              (length(df_flights.dep_time) > 4),
                              lit('FD')
                            )
                        .when((length(df_flights.arr_time) < 3) |
                              (length(df_flights.arr_time) > 4),
                              lit('FA')
                            )
                        .when(_hmm_out_of_range(df_flights.dep_time),
                              lit('FD')
                            )
                        .when(_hmm_out_of_range(df_flights.arr_time),
                              lit('FA')
                            )
                       )
                  )

#### Pergunta 4:

In [None]:
# Meeting the requirements: delay quality flag (first match wins):
#   MD -> dep_delay is null or empty
#   MA -> arr_delay is null or empty
dep_delay_col = df_flights['dep_delay']
arr_delay_col = df_flights['arr_delay']

delay_flag = (
    when(dep_delay_col.isNull() | (dep_delay_col == ''), lit('MD'))
    .when(arr_delay_col.isNull() | (arr_delay_col == ''), lit('MA'))
)

# The flag is only displayed; df_flights itself is not modified.
df_flights.withColumn('qa_dep_arr_delay', delay_flag).show(1)

#### Pergunta 5:

In [None]:
# Meeting the requirements: carrier quality flag (first match wins):
#   M -> carrier is null or empty
#   F -> carrier does not match the pattern below
carrier_col = df_flights['carrier']

carrier_flag = (
    when(carrier_col.isNull() | (carrier_col == ''), lit('M'))
    .when(~carrier_col.rlike("^[a-zA-Z-0-9]+$"), lit('F'))
)

# The flag is only displayed; df_flights itself is not modified.
df_flights.withColumn('qa_carrier', carrier_flag).show(1)


#### Pergunta 6:

In [None]:
# Shorthand for the tailnum column, used by the qa_tailnum rules.
tailnum = df_flights['tailnum']


In [None]:
# Meeting the requirements: tail-number quality flag.
# The rules are evaluated in order and the first match wins:
#   M  -> tailnum is null or empty
#   S  -> tailnum is not exactly 6 characters long
#   FN -> tailnum does not start with 'N'
#   FE -> 'O' at position 2, or 'I' / 'O' at position 5 or 6
#   F  -> any other deviation from: 'N' + 3 digits + 2 alphanumerics
#
# Fixes relative to the previous version:
#   * substr() takes (start, length); single characters are now read with
#     length 1 (substr(2,2) / substr(5,5) returned two characters and could
#     never equal a one-letter literal)
#   * position 6 is checked for both 'I' and 'O' (the 'I' test was duplicated)
#   * the specific codes FN and FE are evaluated before the generic F; with F
#     first, FN could never be produced because any value not starting with
#     'N' already failed the full pattern
# NOTE(review): the FN/FE-before-F precedence is inferred from the codes
# themselves - confirm against the exercise statement.
df_flights.withColumn('qa_tailnum',
                      (when((tailnum.isNull()) |
                            (tailnum == ''),
                            lit('M')
                          )
                       .when(length(tailnum) != 6,
                             lit('S')
                           )
                       .when(~tailnum.rlike('^[N]'),
                             lit('FN')
                           )
                       .when((tailnum.substr(2, 1) == 'O') |
                             (tailnum.substr(5, 1).isin('I', 'O')) |
                             (tailnum.substr(6, 1).isin('I', 'O')),
                             lit('FE')
                           )
                       .when(~tailnum.rlike("^[N][0-9]{3}[a-zA-Z0-9]{2}$"),
                             lit('F')
                           )
                      )
                  ).show(1)


#### Pergunta 7:

In [None]:
# Meeting the requirements: flight-number quality flag (first match wins):
#   M -> flight is null or empty
#   F -> flight is not 4 characters long AND contains at least one letter
# NOTE(review): both conditions must hold for 'F'; confirm that "either one"
# was not the intended rule.
qa_flight = df_flights['flight']

flight_flag = (
    when(qa_flight.isNull() | (qa_flight == ''), lit('M'))
    .when((length(qa_flight) != 4) & qa_flight.rlike("[a-zA-Z]"), lit('F'))
)

# The flag is only displayed; df_flights itself is not modified.
df_flights.withColumn('qa_flight', flight_flag).show(1)

#### Pergunta 8:

In [None]:
# Meeting the requirements: origin/destination quality flag.
# The rules are evaluated in order and the first match wins:
#   MO / MD -> origin / dest is null or empty
#   FO / FD -> origin / dest is not exactly 3 alphanumeric characters
#
# Fix relative to the previous version: the patterns are anchored with ^ and $.
# rlike() succeeds when the pattern is found anywhere in the value, so the
# unanchored "[a-zA-Z0-9]{3}" accepted any value that merely contained three
# consecutive alphanumerics (e.g. a 4- or 5-character code).

origin = df_flights.origin
dest   = df_flights.dest

df_flights.withColumn('qa_origin_dest',
                      (when((origin.isNull()) |
                            (origin == ''),
                            lit('MO')
                          )
                       .when((dest.isNull()) |
                             (dest == ''),
                             lit('MD')
                           )
                       .when(~origin.rlike("^[a-zA-Z0-9]{3}$"),
                             lit('FO')
                           )
                       .when(~dest.rlike("^[a-zA-Z0-9]{3}$"),
                             lit('FD')
                           )
                      )
                  ).show(1)

#### Pergunta 9:

In [None]:
# Meeting the requirements: air-time quality flag (first match wins):
#   M -> air_time is null or empty
#   I -> air_time outside 20..500 (inclusive)
air_time = df_flights['air_time']

airtime_flag = (
    when(air_time.isNull() | (air_time == ''), lit('M'))
    .when(~air_time.between(20, 500), lit('I'))
)

# The flag is only displayed; df_flights itself is not modified.
df_flights.withColumn('qa_airtime', airtime_flag).show(1)

#### Pergunta 10:

In [None]:
# Meeting the requirements: distance quality flag (first match wins):
#   M -> distance is null or empty
#   I -> distance outside 50..3000 (inclusive)
distance = df_flights['distance']

distance_flag = (
    when(distance.isNull() | (distance == ''), lit('M'))
    .when(~distance.between(50, 3000), lit('I'))
)

# The flag is only displayed; df_flights itself is not modified.
df_flights.withColumn('qa_distance', distance_flag).show(1)

#### Pergunta 11:

In [None]:
# Air-time vs. distance consistency flag (first match wins):
#   M  -> air_time or distance is null or empty
#   TL -> air_time >= distance * 0.1 + 30
#   TS -> air_time <= distance * 0.1 + 10
#   TR -> air_time strictly between those two limits
# Relies on the `air_time` and `distance` column shorthands defined in the
# qa_airtime and qa_distance cells.
short_limit = (distance * 0.1) + 10
long_limit = (distance * 0.1) + 30

any_missing = (air_time.isNull() |
               (air_time == '') |
               distance.isNull() |
               (distance == ''))

airtime_distance_flag = (
    when(any_missing, lit('M'))
    .when(air_time >= long_limit, lit('TL'))
    .when(air_time <= short_limit, lit('TS'))
    .when((air_time > short_limit) & (air_time < long_limit), lit('TR'))
)

# The flag is only displayed; df_flights itself is not modified.
df_flights.withColumn('qa_airtime_distance', airtime_distance_flag).show()

### Gerando arquivo .parquet

In [None]:
# Write df_flights to "flights_qa.parquet" from a single partition, replacing
# any previous output at that path.
# The "header" option was dropped: it is a CSV reader/writer setting and has no
# effect on Parquet, which stores the schema inside the file.
# NOTE(review): of the qa_* columns built in this section, only
# qa_year_month_day is assigned back to df_flights; the others are only shown
# (or kept in a separate variable) and are therefore not part of this output.
(df_flights.repartition(1)
            .write.format("parquet")
            .mode('overwrite')
            .save("flights_qa.parquet")
)