In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark



In [2]:
import findspark
findspark.init()

In [3]:
import re
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [4]:
# Criar o contexto do spark
sc = SparkContext()

# Instancia o criador de sessao do spark
spark = (SparkSession.builder
                     .master("local[*]")
                     .appName("Aceleração PySpark - Capgemini"))

In [5]:
# Carrega os dados com as configurações necessárias

path_airports = ("../output/airports_qa.parquet")

df_airports_qa = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(path_airports))

path_planes = ("../output/planes_qa.parquet")

df_planes_qa = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(path_planes))


path_flights = ("../output/flights_qa.parquet")

df_flights_qa = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(path_flights))

In [6]:
# Carrega os dados com as configurações necessárias

path_airports = ("../output/airports_proc.parquet")

df_airports_proc = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(path_airports))

path_planes = ("../output/planes_proc.parquet")

df_planes_proc = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(path_planes))


path_flights = ("../output/flights_proc.parquet")

df_flights_proc = (spark.getOrCreate().read
                  .format("parquet")
                  .option("header", "true")
                  .load(path_flights))

## Relatórios- Perguntas para qualidade

#### Pergunta 1

In [7]:
df_1 = df_airports_qa
df_2 = df_airports_qa

In [8]:
print('Actual columns:', df_1.columns)


df_1 = (df_1.withColumnRenamed('faa', 'faa_O')
            .withColumnRenamed('qa_faa', 'qa_faa_O')
            .withColumnRenamed('qa_name', 'qa_name_O')
            .withColumnRenamed('qa_lat', 'qa_lat_O')
            .withColumnRenamed('qa_lon', 'qa_lon_O')
            .withColumnRenamed('qa_alt', 'qa_alt_O')
            .withColumnRenamed('qa_tz', 'qa_tz_O')
            .withColumnRenamed('qa_dst', 'qa_dst_O')
                                                                 
              )
print('modified columns:', df_1.columns)

df_1.toPandas()

Actual columns: ['faa', 'qa_faa', 'qa_name', 'qa_lat', 'qa_lon', 'qa_alt', 'qa_tz', 'qa_dst']
modified columns: ['faa_O', 'qa_faa_O', 'qa_name_O', 'qa_lat_O', 'qa_lon_O', 'qa_alt_O', 'qa_tz_O', 'qa_dst_O']


Unnamed: 0,faa_O,qa_faa_O,qa_name_O,qa_lat_O,qa_lon_O,qa_alt_O,qa_tz_O,qa_dst_O
0,04G,F,,,,,,
1,06A,F,,,,,,
2,06C,F,,,,,,
3,06N,F,,,,,,
4,09J,F,,,,,,
...,...,...,...,...,...,...,...,...
1392,ZUN,F,,,,,,
1393,ZVE,F,,,,,,
1394,ZWI,F,,,,,,
1395,ZWU,F,,,,,,


In [9]:
print('Actual columns:', df_2.columns)


df_2 = (df_2.withColumnRenamed('faa', 'faa_D')
            .withColumnRenamed('qa_faa', 'qa_faa_D')
            .withColumnRenamed('qa_name', 'qa_name_D')
            .withColumnRenamed('qa_lat', 'qa_lat_D')
            .withColumnRenamed('qa_lon', 'qa_lon_D')
            .withColumnRenamed('qa_alt', 'qa_alt_D')
            .withColumnRenamed('qa_tz', 'qa_tz_D')
            .withColumnRenamed('qa_dst', 'qa_dst_D')
                                                                 
              )
print('modified columns:', df_2.columns)

df_2.toPandas()

Actual columns: ['faa', 'qa_faa', 'qa_name', 'qa_lat', 'qa_lon', 'qa_alt', 'qa_tz', 'qa_dst']
modified columns: ['faa_D', 'qa_faa_D', 'qa_name_D', 'qa_lat_D', 'qa_lon_D', 'qa_alt_D', 'qa_tz_D', 'qa_dst_D']


Unnamed: 0,faa_D,qa_faa_D,qa_name_D,qa_lat_D,qa_lon_D,qa_alt_D,qa_tz_D,qa_dst_D
0,04G,F,,,,,,
1,06A,F,,,,,,
2,06C,F,,,,,,
3,06N,F,,,,,,
4,09J,F,,,,,,
...,...,...,...,...,...,...,...,...
1392,ZUN,F,,,,,,
1393,ZVE,F,,,,,,
1394,ZWI,F,,,,,,
1395,ZWU,F,,,,,,


In [10]:
print('Actual columns:', df_planes_qa.columns)


df_planes_qa = (df_planes_qa.withColumnRenamed('tailnum', 'tailnum_P')
                            .withColumnRenamed('qa_tailnum', 'qa_tailnum_P')
                                                                 
              )
print('modified columns:', df_planes_qa.columns)

df_planes_qa.toPandas()

Actual columns: ['tailnum', 'qa_tailnum', 'qa_year', 'qa_type', 'qa_manufacturer', 'qa_model', 'qa_engines', 'qa_seats', 'qa_speed', 'qa_engine']
modified columns: ['tailnum_P', 'qa_tailnum_P', 'qa_year', 'qa_type', 'qa_manufacturer', 'qa_model', 'qa_engines', 'qa_seats', 'qa_speed', 'qa_engine']


Unnamed: 0,tailnum_P,qa_tailnum_P,qa_year,qa_type,qa_manufacturer,qa_model,qa_engines,qa_seats,qa_speed,qa_engine
0,N102UW,,,,C,,,,M,
1,N103US,,,,C,,,,M,
2,N104UW,,,,C,,,,M,
3,N105UW,,,,C,,,,M,
4,N107US,,,,C,,,,M,
...,...,...,...,...,...,...,...,...,...,...
2623,N983SW,,,,C,,,,M,
2624,N984CA,,,,,,,,M,
2625,N986CA,,M,,,,,,M,
2626,N986SW,,,,C,,,,M,


In [11]:
df = (df_flights_qa.join(df_1,
                  (df_flights_qa.origin == df_1.faa_O)
                        ,"left")
)

In [12]:
df.printSchema()

root
 |-- tailnum: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- qa_year_month_day: string (nullable = true)
 |-- qa_hour_minute: string (nullable = true)
 |-- qa_dep_arr_time: string (nullable = true)
 |-- qa_dep_arr_delay: string (nullable = true)
 |-- qa_carrier: string (nullable = true)
 |-- qa_tailnum: string (nullable = true)
 |-- qa_flight: string (nullable = true)
 |-- qa_origin_dest: string (nullable = true)
 |-- qa_air_time: string (nullable = true)
 |-- qa_distance: string (nullable = true)
 |-- qa_distance_airtime: string (nullable = true)
 |-- faa_O: string (nullable = true)
 |-- qa_faa_O: string (nullable = true)
 |-- qa_name_O: string (nullable = true)
 |-- qa_lat_O: string (nullable = true)
 |-- qa_lon_O: string (nullable = true)
 |-- qa_alt_O: string (nullable = true)
 |-- qa_tz_O: string (nullable = true)
 |-- qa_dst_O: string (nullable = true)



In [13]:
df = (df.join(df_2,
                  (df.origin == df_2.faa_D)
                        ,"left")
)

In [14]:
df.printSchema()

root
 |-- tailnum: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- qa_year_month_day: string (nullable = true)
 |-- qa_hour_minute: string (nullable = true)
 |-- qa_dep_arr_time: string (nullable = true)
 |-- qa_dep_arr_delay: string (nullable = true)
 |-- qa_carrier: string (nullable = true)
 |-- qa_tailnum: string (nullable = true)
 |-- qa_flight: string (nullable = true)
 |-- qa_origin_dest: string (nullable = true)
 |-- qa_air_time: string (nullable = true)
 |-- qa_distance: string (nullable = true)
 |-- qa_distance_airtime: string (nullable = true)
 |-- faa_O: string (nullable = true)
 |-- qa_faa_O: string (nullable = true)
 |-- qa_name_O: string (nullable = true)
 |-- qa_lat_O: string (nullable = true)
 |-- qa_lon_O: string (nullable = true)
 |-- qa_alt_O: string (nullable = true)
 |-- qa_tz_O: string (nullable = true)
 |-- qa_dst_O: string (nullable = true)
 |-- faa_D: string (nullable = true)
 |-- qa_faa_D: string (nullabl

In [15]:
df = (df.join(df_planes_qa,
                  (df.tailnum == df_planes_qa.tailnum_P)
                        ,"left")
)

In [16]:
df.printSchema()

root
 |-- tailnum: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- qa_year_month_day: string (nullable = true)
 |-- qa_hour_minute: string (nullable = true)
 |-- qa_dep_arr_time: string (nullable = true)
 |-- qa_dep_arr_delay: string (nullable = true)
 |-- qa_carrier: string (nullable = true)
 |-- qa_tailnum: string (nullable = true)
 |-- qa_flight: string (nullable = true)
 |-- qa_origin_dest: string (nullable = true)
 |-- qa_air_time: string (nullable = true)
 |-- qa_distance: string (nullable = true)
 |-- qa_distance_airtime: string (nullable = true)
 |-- faa_O: string (nullable = true)
 |-- qa_faa_O: string (nullable = true)
 |-- qa_name_O: string (nullable = true)
 |-- qa_lat_O: string (nullable = true)
 |-- qa_lon_O: string (nullable = true)
 |-- qa_alt_O: string (nullable = true)
 |-- qa_tz_O: string (nullable = true)
 |-- qa_dst_O: string (nullable = true)
 |-- faa_D: string (nullable = true)
 |-- qa_faa_D: string (nullabl

In [17]:
len(df.columns)

40

#### Pergunta 2

In [18]:
df_qa = df.select(df.colRegex("`^qa_.*`"))

for c in df_qa.columns:
    df_qa.filter(F.col(c).isNotNull()).groupBy(F.substring(c, 1, 1).alias(c)).count().show()

+-----------------+-----+
|qa_year_month_day|count|
+-----------------+-----+
+-----------------+-----+

+--------------+-----+
|qa_hour_minute|count|
+--------------+-----+
|             M|   48|
+--------------+-----+

+---------------+-----+
|qa_dep_arr_time|count|
+---------------+-----+
|              F|  241|
|              M|   55|
+---------------+-----+

+----------------+-----+
|qa_dep_arr_delay|count|
+----------------+-----+
|               M|   75|
+----------------+-----+

+----------+-----+
|qa_carrier|count|
+----------+-----+
+----------+-----+

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|         F|  989|
|         M|   14|
+----------+-----+

+---------+-----+
|qa_flight|count|
+---------+-----+
|        F| 6158|
+---------+-----+

+--------------+-----+
|qa_origin_dest|count|
+--------------+-----+
+--------------+-----+

+-----------+-----+
|qa_air_time|count|
+-----------+-----+
|          M|   75|
+-----------+-----+

+-----------+-----+
|qa_distanc

#### Pergunta 3

In [19]:
df_qa_M = df.select(df.colRegex("`^qa_.*`"))

for c in df_qa.columns:
    df_qa.filter(F.col(c) == 'M').groupBy(F.substring(c, 1, 1).alias(c)).count().show()

+-----------------+-----+
|qa_year_month_day|count|
+-----------------+-----+
+-----------------+-----+

+--------------+-----+
|qa_hour_minute|count|
+--------------+-----+
+--------------+-----+

+---------------+-----+
|qa_dep_arr_time|count|
+---------------+-----+
+---------------+-----+

+----------------+-----+
|qa_dep_arr_delay|count|
+----------------+-----+
+----------------+-----+

+----------+-----+
|qa_carrier|count|
+----------+-----+
+----------+-----+

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|         M|   14|
+----------+-----+

+---------+-----+
|qa_flight|count|
+---------+-----+
+---------+-----+

+--------------+-----+
|qa_origin_dest|count|
+--------------+-----+
+--------------+-----+

+-----------+-----+
|qa_air_time|count|
+-----------+-----+
|          M|   75|
+-----------+-----+

+-----------+-----+
|qa_distance|count|
+-----------+-----+
+-----------+-----+

+-------------------+-----+
|qa_distance_airtime|count|
+-------------------+-----+

#### Pergunta 4

In [20]:
df_qa_F = df.select(df.colRegex("`^qa_.*`"))

for c in df_qa.columns:
    df_qa.filter(F.col(c) == 'F').groupBy(F.substring(c, 1, 1).alias(c)).count().show()

+-----------------+-----+
|qa_year_month_day|count|
+-----------------+-----+
+-----------------+-----+

+--------------+-----+
|qa_hour_minute|count|
+--------------+-----+
+--------------+-----+

+---------------+-----+
|qa_dep_arr_time|count|
+---------------+-----+
+---------------+-----+

+----------------+-----+
|qa_dep_arr_delay|count|
+----------------+-----+
+----------------+-----+

+----------+-----+
|qa_carrier|count|
+----------+-----+
+----------+-----+

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|         F|  987|
+----------+-----+

+---------+-----+
|qa_flight|count|
+---------+-----+
|        F| 6158|
+---------+-----+

+--------------+-----+
|qa_origin_dest|count|
+--------------+-----+
+--------------+-----+

+-----------+-----+
|qa_air_time|count|
+-----------+-----+
+-----------+-----+

+-----------+-----+
|qa_distance|count|
+-----------+-----+
+-----------+-----+

+-------------------+-----+
|qa_distance_airtime|count|
+-------------------+-----+
+

#### Pergunta 5

In [21]:
df_qa_M = df.select(df.colRegex("`^qa_.*`"))

for c in df_qa.columns:
    df_qa.filter(F.col(c) == 'I').groupBy(F.substring(c, 1, 1).alias(c)).count().show()

+-----------------+-----+
|qa_year_month_day|count|
+-----------------+-----+
+-----------------+-----+

+--------------+-----+
|qa_hour_minute|count|
+--------------+-----+
+--------------+-----+

+---------------+-----+
|qa_dep_arr_time|count|
+---------------+-----+
+---------------+-----+

+----------------+-----+
|qa_dep_arr_delay|count|
+----------------+-----+
+----------------+-----+

+----------+-----+
|qa_carrier|count|
+----------+-----+
+----------+-----+

+----------+-----+
|qa_tailnum|count|
+----------+-----+
+----------+-----+

+---------+-----+
|qa_flight|count|
+---------+-----+
+---------+-----+

+--------------+-----+
|qa_origin_dest|count|
+--------------+-----+
+--------------+-----+

+-----------+-----+
|qa_air_time|count|
+-----------+-----+
+-----------+-----+

+-----------+-----+
|qa_distance|count|
+-----------+-----+
+-----------+-----+

+-------------------+-----+
|qa_distance_airtime|count|
+-------------------+-----+
+-------------------+-----+

+--------

## Relatórios - Perguntas para negócio

#### Pergunta 1

In [22]:
df_airports_proc.toPandas()

Unnamed: 0,faa,name,lat,lon,alt,tz,dst,region,type,military,administration
0,04G,Lansdowne Airport,41.130474,-80.619583,1044,-5.0,A,MAINLAND-EAST,AP,False,
1,06A,Moton Field Municipal Airport,32.460571,-85.680031,264,-5.0,A,MAINLAND-EAST,AP,False,M
2,06C,Schaumburg Regional,41.989342,-88.101242,801,-6.0,A,MAINLAND-EAST,,False,R
3,06N,Randall Airport,41.431911,-74.391563,523,-5.0,A,MAINLAND-EAST,AP,False,
4,09J,Jekyll Island Airport,31.074472,-81.427780,11,-4.0,A,MAINLAND-EAST,AP,False,
...,...,...,...,...,...,...,...,...,...,...,...
1392,ZUN,Black Rock,35.083229,-108.791779,6454,-7.0,A,MAINLAND-WEST,,False,
1393,ZVE,New Haven Rail Station,41.298668,-72.925995,7,-5.0,A,MAINLAND-EAST,AS,False,
1394,ZWI,Wilmington Amtrak Station,39.736668,-75.551666,0,-5.0,A,MAINLAND-EAST,AS,False,
1395,ZWU,Washington Union Station,38.897461,-77.006432,76,-5.0,A,MAINLAND-EAST,AS,False,


In [23]:
df_airports_proc.select(df_airports_proc.region).groupBy(df_airports_proc.region).count().toPandas()

Unnamed: 0,region,count
0,ALASKA,261
1,OFFSHORE,4
2,MAINLAND-EAST,696
3,MAINLAND-WEST,436


In [24]:
df_planes_proc.toPandas()

Unnamed: 0,tailnum,year,type,manufacturer,model,engines,seats,speed,engine,tailchar,age,engine_type
0,N102UW,1998.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,UW,24.0,FAN
1,N103US,1999.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,US,23.0,FAN
2,N104UW,1999.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,UW,23.0,FAN
3,N105UW,1999.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,UW,23.0,FAN
4,N107US,1999.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,US,23.0,FAN
...,...,...,...,...,...,...,...,...,...,...,...,...
2623,N983SW,2004.0,MULTI_ENG,BOMBARDIER,CL-600-2B19,2,55,0,Turbo-fan,SW,18.0,FAN
2624,N984CA,1997.0,MULTI_ENG,CANADAIR,CL-600-2B19,2,55,0,Turbo-fan,CA,25.0,FAN
2625,N986CA,1995.0,MULTI_ENG,CANADAIR,CL-600-2B19,2,55,0,Turbo-fan,CA,27.0,FAN
2626,N986SW,2004.0,MULTI_ENG,BOMBARDIER,CL-600-2B19,2,55,0,Turbo-fan,SW,18.0,FAN


In [25]:
df_flights_proc.toPandas()

Unnamed: 0,dep_time,arr_time,dep_delay,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,dep_datetime,air_time_projected,air_time_expected,haul_duration,dep_season,dep_delay_category
0,658,935,-7,-5,VX,N846VA,1780,SEA,LAX,132,954,2014-12-08 06:58:00,115,126,SHORT-HAUL,FALL,ANTECIPATED
1,1040,1505,5,5,AS,N559AS,851,SEA,HNL,360,2677,2014-01-22 10:40:00,287,343,MEDIUM-HAUL,WINTER,MINOR
2,1443,1652,-2,2,VX,N847VA,755,SEA,SFO,111,679,2014-03-09 14:43:00,87,101,SHORT-HAUL,WINTER,ANTECIPATED
3,1705,1839,45,34,WN,N360SW,344,PDX,SJC,83,569,2014-04-09 17:05:00,76,85,SHORT-HAUL,SPRING,MINOR
4,754,1015,-1,1,AS,N612AS,522,SEA,BUR,127,937,2014-03-09 07:54:00,113,122,SHORT-HAUL,WINTER,ANTECIPATED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1806,2104,-4,-6,OO,N225AG,3458,SEA,SLC,89,689,2014-06-23 18:06:00,88,88,SHORT-HAUL,SUMMER,ANTECIPATED
9996,2336,452,11,-13,AA,N3LEAA,1230,SEA,DFW,178,1660,2014-08-31 23:36:00,186,195,SHORT-HAUL,SUMMER,MINOR
9997,904,1042,-1,-5,AS,N523AS,360,SEA,SMF,81,605,2014-08-08 09:04:00,80,82,SHORT-HAUL,SUMMER,ANTECIPATED
9998,1441,1820,26,10,WN,N8647A,2857,SEA,ABQ,133,1180,2014-08-29 14:41:00,138,142,SHORT-HAUL,SUMMER,MINOR


In [26]:
df_flights_proc.printSchema()

root
 |-- dep_time: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- dep_datetime: timestamp (nullable = true)
 |-- air_time_projected: integer (nullable = true)
 |-- air_time_expected: integer (nullable = true)
 |-- haul_duration: string (nullable = true)
 |-- dep_season: string (nullable = true)
 |-- dep_delay_category: string (nullable = true)



In [27]:
df_airports_proc.printSchema()

root
 |-- faa: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- alt: integer (nullable = true)
 |-- tz: float (nullable = true)
 |-- dst: string (nullable = true)
 |-- region: string (nullable = true)
 |-- type: string (nullable = true)
 |-- military: boolean (nullable = true)
 |-- administration: string (nullable = true)



In [28]:
df_planes_proc.printSchema()

root
 |-- tailnum: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- engines: integer (nullable = true)
 |-- seats: integer (nullable = true)
 |-- speed: integer (nullable = true)
 |-- engine: string (nullable = true)
 |-- tailchar: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- engine_type: string (nullable = true)



In [30]:
df_1 = df_airports_proc
df_2 = df_airports_proc

In [31]:
print('Actual columns:', df_1.columns)


df_1 = (df_1.withColumnRenamed('faa', 'faa_O')
            .withColumnRenamed('name', 'name_O')
            .withColumnRenamed('lat', 'lat_O')
            .withColumnRenamed('lon', 'lon_O')
            .withColumnRenamed('alt', 'alt_O')
            .withColumnRenamed('tz', 'tz_O')
            .withColumnRenamed('dst', 'dst_O')
            .withColumnRenamed('region', 'region_O')
            .withColumnRenamed('type', 'type_O')
            .withColumnRenamed('military', 'military_O')
            .withColumnRenamed('administration', 'administration_O')
                                                                 
              )
print('modified columns:', df_1.columns)

df_1.toPandas()

Actual columns: ['faa', 'name', 'lat', 'lon', 'alt', 'tz', 'dst', 'region', 'type', 'military', 'administration']
modified columns: ['faa_O', 'name_O', 'lat_O', 'lon_O', 'alt_O', 'tz_O', 'dst_O', 'region_O', 'type_O', 'military_O', 'administration_O']


Unnamed: 0,faa_O,name_O,lat_O,lon_O,alt_O,tz_O,dst_O,region_O,type_O,military_O,administration_O
0,04G,Lansdowne Airport,41.130474,-80.619583,1044,-5.0,A,MAINLAND-EAST,AP,False,
1,06A,Moton Field Municipal Airport,32.460571,-85.680031,264,-5.0,A,MAINLAND-EAST,AP,False,M
2,06C,Schaumburg Regional,41.989342,-88.101242,801,-6.0,A,MAINLAND-EAST,,False,R
3,06N,Randall Airport,41.431911,-74.391563,523,-5.0,A,MAINLAND-EAST,AP,False,
4,09J,Jekyll Island Airport,31.074472,-81.427780,11,-4.0,A,MAINLAND-EAST,AP,False,
...,...,...,...,...,...,...,...,...,...,...,...
1392,ZUN,Black Rock,35.083229,-108.791779,6454,-7.0,A,MAINLAND-WEST,,False,
1393,ZVE,New Haven Rail Station,41.298668,-72.925995,7,-5.0,A,MAINLAND-EAST,AS,False,
1394,ZWI,Wilmington Amtrak Station,39.736668,-75.551666,0,-5.0,A,MAINLAND-EAST,AS,False,
1395,ZWU,Washington Union Station,38.897461,-77.006432,76,-5.0,A,MAINLAND-EAST,AS,False,


In [32]:
print('Actual columns:', df_2.columns)


df_2 = (df_2.withColumnRenamed('faa', 'faa_D')
            .withColumnRenamed('name', 'name_D')
            .withColumnRenamed('lat', 'lat_D')
            .withColumnRenamed('lon', 'lon_D')
            .withColumnRenamed('alt', 'alt_D')
            .withColumnRenamed('tz', 'tz_D')
            .withColumnRenamed('dst', 'dst_D')
            .withColumnRenamed('region', 'region_D')
            .withColumnRenamed('type', 'type_D')
            .withColumnRenamed('military', 'military_D')
            .withColumnRenamed('administration', 'administration_D')
                                                                 
              )
print('modified columns:', df_2.columns)

df_2.toPandas()

Actual columns: ['faa', 'name', 'lat', 'lon', 'alt', 'tz', 'dst', 'region', 'type', 'military', 'administration']
modified columns: ['faa_D', 'name_D', 'lat_D', 'lon_D', 'alt_D', 'tz_D', 'dst_D', 'region_D', 'type_D', 'military_D', 'administration_D']


Unnamed: 0,faa_D,name_D,lat_D,lon_D,alt_D,tz_D,dst_D,region_D,type_D,military_D,administration_D
0,04G,Lansdowne Airport,41.130474,-80.619583,1044,-5.0,A,MAINLAND-EAST,AP,False,
1,06A,Moton Field Municipal Airport,32.460571,-85.680031,264,-5.0,A,MAINLAND-EAST,AP,False,M
2,06C,Schaumburg Regional,41.989342,-88.101242,801,-6.0,A,MAINLAND-EAST,,False,R
3,06N,Randall Airport,41.431911,-74.391563,523,-5.0,A,MAINLAND-EAST,AP,False,
4,09J,Jekyll Island Airport,31.074472,-81.427780,11,-4.0,A,MAINLAND-EAST,AP,False,
...,...,...,...,...,...,...,...,...,...,...,...
1392,ZUN,Black Rock,35.083229,-108.791779,6454,-7.0,A,MAINLAND-WEST,,False,
1393,ZVE,New Haven Rail Station,41.298668,-72.925995,7,-5.0,A,MAINLAND-EAST,AS,False,
1394,ZWI,Wilmington Amtrak Station,39.736668,-75.551666,0,-5.0,A,MAINLAND-EAST,AS,False,
1395,ZWU,Washington Union Station,38.897461,-77.006432,76,-5.0,A,MAINLAND-EAST,AS,False,


In [33]:
print('Actual columns:', df_planes_proc.columns)


df_planes_proc = (df_planes_proc.withColumnRenamed('tailnum', 'tailnum_P')
                 )

print('modified columns:', df_planes_proc.columns)

df_planes_proc.toPandas()

Actual columns: ['tailnum', 'year', 'type', 'manufacturer', 'model', 'engines', 'seats', 'speed', 'engine', 'tailchar', 'age', 'engine_type']
modified columns: ['tailnum_P', 'year', 'type', 'manufacturer', 'model', 'engines', 'seats', 'speed', 'engine', 'tailchar', 'age', 'engine_type']


Unnamed: 0,tailnum_P,year,type,manufacturer,model,engines,seats,speed,engine,tailchar,age,engine_type
0,N102UW,1998.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,UW,24.0,FAN
1,N103US,1999.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,US,23.0,FAN
2,N104UW,1999.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,UW,23.0,FAN
3,N105UW,1999.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,UW,23.0,FAN
4,N107US,1999.0,MULTI_ENG,AIRBUS,A320-214,2,182,0,Turbo-fan,US,23.0,FAN
...,...,...,...,...,...,...,...,...,...,...,...,...
2623,N983SW,2004.0,MULTI_ENG,BOMBARDIER,CL-600-2B19,2,55,0,Turbo-fan,SW,18.0,FAN
2624,N984CA,1997.0,MULTI_ENG,CANADAIR,CL-600-2B19,2,55,0,Turbo-fan,CA,25.0,FAN
2625,N986CA,1995.0,MULTI_ENG,CANADAIR,CL-600-2B19,2,55,0,Turbo-fan,CA,27.0,FAN
2626,N986SW,2004.0,MULTI_ENG,BOMBARDIER,CL-600-2B19,2,55,0,Turbo-fan,SW,18.0,FAN


In [34]:
df = (df_flights_proc.join(df_1,
                  (df_flights_proc.origin == df_1.faa_O)
                        ,"left")
)

In [35]:
df.printSchema()

root
 |-- dep_time: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- dep_datetime: timestamp (nullable = true)
 |-- air_time_projected: integer (nullable = true)
 |-- air_time_expected: integer (nullable = true)
 |-- haul_duration: string (nullable = true)
 |-- dep_season: string (nullable = true)
 |-- dep_delay_category: string (nullable = true)
 |-- faa_O: string (nullable = true)
 |-- name_O: string (nullable = true)
 |-- lat_O: float (nullable = true)
 |-- lon_O: float (nullable = true)
 |-- alt_O: integer (nullable = true)
 |-- tz_O: float (nullable = true)
 |-- dst_O: string (nullable = true)
 |-- region_O: st

In [36]:
df = (df.join(df_2,
                  (df.dest == df_2.faa_D)
                        ,"left")
)

In [37]:
df.printSchema()

root
 |-- dep_time: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- dep_datetime: timestamp (nullable = true)
 |-- air_time_projected: integer (nullable = true)
 |-- air_time_expected: integer (nullable = true)
 |-- haul_duration: string (nullable = true)
 |-- dep_season: string (nullable = true)
 |-- dep_delay_category: string (nullable = true)
 |-- faa_O: string (nullable = true)
 |-- name_O: string (nullable = true)
 |-- lat_O: float (nullable = true)
 |-- lon_O: float (nullable = true)
 |-- alt_O: integer (nullable = true)
 |-- tz_O: float (nullable = true)
 |-- dst_O: string (nullable = true)
 |-- region_O: st

In [38]:
df = (df.join(df_planes_proc,
                  (df.tailnum == df_planes_proc.tailnum_P)
                        ,"left")
)

In [39]:
df.printSchema()

root
 |-- dep_time: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- dep_datetime: timestamp (nullable = true)
 |-- air_time_projected: integer (nullable = true)
 |-- air_time_expected: integer (nullable = true)
 |-- haul_duration: string (nullable = true)
 |-- dep_season: string (nullable = true)
 |-- dep_delay_category: string (nullable = true)
 |-- faa_O: string (nullable = true)
 |-- name_O: string (nullable = true)
 |-- lat_O: float (nullable = true)
 |-- lon_O: float (nullable = true)
 |-- alt_O: integer (nullable = true)
 |-- tz_O: float (nullable = true)
 |-- dst_O: string (nullable = true)
 |-- region_O: st

In [40]:
len(df.columns)

51

In [41]:
df.toPandas()

Unnamed: 0,dep_time,arr_time,dep_delay,arr_delay,carrier,tailnum,flight,origin,dest,air_time,...,type,manufacturer,model,engines,seats,speed,engine,tailchar,age,engine_type
0,658,935,-7,-5,VX,N846VA,1780,SEA,LAX,132,...,MULTI_ENG,AIRBUS,A320-214,2.0,182.0,0.0,Turbo-fan,VA,11.0,FAN
1,1040,1505,5,5,AS,N559AS,851,SEA,HNL,360,...,MULTI_ENG,BOEING,737-890,2.0,149.0,0.0,Turbo-fan,AS,16.0,FAN
2,1443,1652,-2,2,VX,N847VA,755,SEA,SFO,111,...,MULTI_ENG,AIRBUS,A320-214,2.0,182.0,0.0,Turbo-fan,VA,11.0,FAN
3,1705,1839,45,34,WN,N360SW,344,PDX,SJC,83,...,MULTI_ENG,BOEING,737-3H4,2.0,149.0,0.0,Turbo-fan,SW,30.0,FAN
4,754,1015,-1,1,AS,N612AS,522,SEA,BUR,127,...,MULTI_ENG,BOEING,737-790,2.0,151.0,0.0,Turbo-jet,AS,23.0,JET
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1806,2104,-4,-6,OO,N225AG,3458,SEA,SLC,89,...,MULTI_ENG,BOMBARDIER,CL-600-2C10,2.0,80.0,0.0,Turbo-fan,AG,21.0,FAN
9996,2336,452,11,-13,AA,N3LEAA,1230,SEA,DFW,178,...,,,,,,,,,,
9997,904,1042,-1,-5,AS,N523AS,360,SEA,SMF,81,...,MULTI_ENG,BOEING,737-890,2.0,149.0,0.0,Turbo-fan,AS,13.0,FAN
9998,1441,1820,26,10,WN,N8647A,2857,SEA,ABQ,133,...,MULTI_ENG,BOEING,737-8H4,2.0,140.0,0.0,Turbo-fan,A,8.0,FAN


In [42]:
# df.write.csv(
#         path = "C:/Users/danielpe/reports/csv", 
#         mode= "overwrite", 
#         sep=";", 
#         header=True)

#### Pergunta 2

In [43]:
df.groupBy(F.col('region_D')).agg(F.countDistinct('faa_D').alias('aeroportos_por_regiao')).toPandas()

Unnamed: 0,region_D,aeroportos_por_regiao
0,ALASKA,9
1,MAINLAND-EAST,24
2,MAINLAND-WEST,36


#### Pergunta 3

In [44]:
(df.agg(F.max(F.col("alt_O") - F.col("alt_D")).alias("maior_dif_altitude")).toPandas()
)

Unnamed: 0,maior_dif_altitude
0,429


#### Pergunta 4

In [45]:
(df.select("dep_delay", "arr_delay")
            .where('dep_delay > 0 OR arr_delay > 0')
            .agg(F.ceil(F.avg(F.col('dep_delay') + F.col('arr_delay'))).alias("atraso_medio")).toPandas()
)

Unnamed: 0,atraso_medio
0,35


#### Pergunta 5

In [46]:
(df.where('dep_delay > 0 OR arr_delay > 0')
   .groupBy("region_D" )
   .agg(F.ceil(F.avg(F.col('dep_delay') + F.col('arr_delay'))).alias("atraso_medio_regiao (min)")).toPandas()
)

Unnamed: 0,region_D,atraso_medio_regiao (min)
0,ALASKA,28
1,MAINLAND-EAST,37
2,MAINLAND-WEST,36


#### Pergunta 6

In [47]:
(df.where((F.col('dep_delay') > 0) | (F.col('arr_delay') > 0))
   .groupBy(F.year(F.col('dep_datetime')).alias('ano'))
   .agg(F.sum(F.col('dep_delay') + F.col('arr_delay')).alias('atraso_acumulado_ano (min)'))
   .toPandas()
)

Unnamed: 0,ano,atraso_acumulado_ano (min)
0,2014,166798


#### Pergunta 7

In [48]:
(df.select(F.year("dep_datetime").alias('ano'),"dep_delay",'arr_delay',"region_D")
   .where('dep_delay > 0 OR arr_delay > 0')
   .groupBy("ano", "region_D")
   .agg(F.sum(F.col('dep_delay') + F.col('arr_delay')).alias('atraso_acumulado_ano (min)'))
   .toPandas()
)

Unnamed: 0,ano,region_D,atraso_acumulado_ano (min)
0,2014,ALASKA,14281
1,2014,MAINLAND-EAST,40558
2,2014,MAINLAND-WEST,111959


#### Pergunta 8

In [49]:
(df.select("air_time")   
   .agg(F.ceil(F.avg("air_time")).alias("tempo_voo_medio")).toPandas()
)

Unnamed: 0,tempo_voo_medio
0,153


#### Pergunta 9

In [50]:
(df.select("air_time", "region_D")
   .groupBy("region_D")
   .agg(F.ceil(F.avg("air_time")).alias("tempo_voo_medio_regiao")).toPandas()
)

Unnamed: 0,region_D,tempo_voo_medio_regiao
0,ALASKA,228
1,MAINLAND-EAST,238
2,MAINLAND-WEST,116


#### Pergunta 10

In [51]:
(df.select("air_time", "origin","dest")
   .groupBy("origin", "dest")
   .agg(F.ceil(F.avg("air_time")).alias("tempo_voo_medio_rota")).toPandas()
)

Unnamed: 0,origin,dest,tempo_voo_medio_rota
0,SEA,RNO,75
1,SEA,DTW,220
2,SEA,CLE,234
3,SEA,LAX,127
4,PDX,SEA,35
...,...,...,...
108,SEA,JFK,282
109,SEA,FAI,199
110,PDX,HOU,227
111,PDX,BUR,109


#### Pergunta 11

In [52]:
(df.select(F.year("dep_datetime").alias('ano'),"air_time")
   .groupBy("ano")
   .agg(F.sum('air_time').alias('tempo_voo_acumulado_ano'))
   .toPandas()
)

Unnamed: 0,ano,tempo_voo_acumulado_ano
0,2014,1528625


#### Pergunta 12

In [53]:
(df.select(F.year("dep_datetime").alias('ano'),"air_time","region_D")
   .groupBy("ano", "region_D")
   .agg(F.sum(F.col('air_time')).alias('tempo_voo_acumulado_regiao'))
   .toPandas()
)

Unnamed: 0,ano,region_D,tempo_voo_acumulado_regiao
0,2014,ALASKA,230602
1,2014,MAINLAND-EAST,508344
2,2014,MAINLAND-WEST,789679


#### Pergunta 13

In [54]:
(df.select("distance")
   .agg(F.ceil(F.avg(F.col('distance'))).alias('distancia_voo_medio_regiao'))
   .toPandas()
)

Unnamed: 0,distancia_voo_medio_regiao
0,1209


#### Pergunta 14

In [55]:
(df.select("distance", "region_D")
   .groupBy("region_D")
   .agg(F.ceil(F.avg(F.col('distance'))).alias('distancia_voo_medio_regiao'))
   .toPandas()
)

Unnamed: 0,region_D,distancia_voo_medio_regiao
0,ALASKA,1742
1,MAINLAND-EAST,2043
2,MAINLAND-WEST,868


#### Pergunta 15

In [56]:
(df.select("distance", "origin","dest")
   .groupBy("origin", "dest")
   .agg(F.ceil(F.avg("distance")).alias("distancia_voo_medio_rota")).toPandas()
)

Unnamed: 0,origin,dest,distancia_voo_medio_rota
0,SEA,RNO,564
1,SEA,DTW,1927
2,SEA,CLE,2021
3,SEA,LAX,954
4,PDX,SEA,129
...,...,...,...
108,SEA,JFK,2422
109,SEA,FAI,1533
110,PDX,HOU,1843
111,PDX,BUR,817


#### Pergunta 16

In [57]:
(df.select(F.year("dep_datetime").alias('ano'),"distance")
   .groupBy("ano")
   .agg(F.ceil(F.sum('distance')).alias('distancia_voo_acumulado_ano'))
   .toPandas()
)

Unnamed: 0,ano,distancia_voo_acumulado_ano
0,2014,12081516


#### Pergunta 17

In [58]:
(df.select(F.year("dep_datetime").alias('ano'),"distance","region_D")
   .groupBy("ano", "region_D")
   .agg(F.ceil(F.sum(F.col('distance'))).alias('distancia_acumulado_regiao'))
   .toPandas()
)

Unnamed: 0,ano,region_D,distancia_acumulado_regiao
0,2014,ALASKA,1762553
1,2014,MAINLAND-EAST,4378902
2,2014,MAINLAND-WEST,5940061


#### Pergunta 18

In [59]:
(df.select("seats", "origin","dest")
   .groupBy("origin", "dest")
   .agg(F.round(F.avg("seats"), 3).alias("passageiros_medio_rota")).toPandas()
)

Unnamed: 0,origin,dest,passageiros_medio_rota
0,SEA,RNO,141.125
1,SEA,DTW,212.429
2,SEA,CLE,182.000
3,SEA,LAX,154.711
4,PDX,SEA,64.282
...,...,...,...
108,SEA,JFK,182.600
109,SEA,FAI,179.903
110,PDX,HOU,141.500
111,PDX,BUR,79.648


#### Pergunta 19

In [60]:
(df.select(F.year("dep_datetime").alias("ano"),"seats")
   .groupBy("ano",)
   .agg(F.sum("seats").alias("passageiros_acumulado_ano")).toPandas()
)

Unnamed: 0,ano,passageiros_acumulado_ano
0,2014,1509544


#### Pergunta 20

In [61]:
(df.select("dest")
   .groupBy("dest")
   .count()
   .orderBy('count', ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,dest,count
0,SFO,787


#### Pergunta 21

In [62]:
(df.select("dest", "seats")
   .groupBy("dest")
   .agg(F.sum("seats").alias("destino_mais_passageiros"))
   .orderBy("destino_mais_passageiros", ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,dest,destino_mais_passageiros
0,SFO,119635


#### Pergunta 22

In [63]:
(df.select( "origin","dest", "distance")
   .where(F.col("origin") == "PDX")   
   .orderBy("distance", ascending = False)
   .limit(1)
   .toPandas()

)

Unnamed: 0,origin,dest,distance
0,PDX,LIH,2631


#### Pergunta 23

In [64]:
(df.groupBy("dest", F.month("dep_datetime").alias('mes'))
   .count()
   .orderBy('count', ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,dest,mes,count
0,LAX,5,77


#### Pergunta 24

In [65]:
(df.groupBy("model")
   .count()
   .orderBy('count', ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,model,count
0,737-890,1463


#### Pergunta 25

In [66]:
(df.where(F.col("model").isNotNull()) 
   .groupBy("model", "dest")
   .count()
   .orderBy('count', ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,model,dest,count
0,737-7H4,OAK,141


#### Pergunta 26

In [67]:
(df.groupBy("haul_duration")
   .agg(F.avg("engines").alias("media_motores_categoria"))
   .toPandas())

Unnamed: 0,haul_duration,media_motores_categoria
0,LONG-HAUL,2.0
1,MEDIUM-HAUL,1.999643
2,SHORT-HAUL,1.999242


#### Pergunta 27

In [68]:
(df.groupBy("dep_season")
   .count()
   .orderBy('count', ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,dep_season,count
0,SUMMER,2918


#### Pergunta 28

In [69]:
(df.groupBy("dep_season", "dest")
   .count()
   .orderBy('count', ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,dep_season,dest,count
0,SUMMER,SFO,217


#### Pergunta 29

In [70]:
(df.where((F.col('dep_delay_category') != 'ANTECIPATED') & (F.col('dep_delay_category') != 'INTIME'))
   .groupBy("dep_delay_category")
   .count()
   .orderBy('count', ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,dep_delay_category,count
0,MINOR,3065


#### Pergunta 30

In [71]:
(df.where((F.col('dep_delay_category') != 'ANTECIPATED') & (F.col('dep_delay_category') != 'INTIME'))
   .groupBy("dep_delay_category", "origin","dest")
   .count()
   .orderBy('count', ascending= False)
   .limit(1)
   .toPandas()
)

Unnamed: 0,dep_delay_category,origin,dest,count
0,MINOR,SEA,SFO,159
