In [None]:
# Célula 1: Configuração e Imports
import os
import sys

# Make PySpark launch its Python processes with the same interpreter that is
# running this kernel (optional, but avoids environment mismatches when
# custom modules are involved).
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType

# Build (or reuse) the Spark session in local mode with 2g of driver memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Exploracao_Silver_BeFly")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Only log WARN and above so the notebook output stays readable.
spark.sparkContext.setLogLevel("WARN")

print("SparkSession criada com sucesso!")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/10 19:13:40 WARN Utils: Your hostname, turma27.local, resolves to a loopback address: 127.0.0.1; using 192.168.2.102 instead (on interface en5)
26/01/10 19:13:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/10 19:13:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


SparkSession criada com sucesso!


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain a local Spark session. NOTE: when a session is already active in this
# kernel, getOrCreate() returns that one and the builder settings below are
# not applied to it.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Exploracao_BeFly")
    .getOrCreate()
)

# Load the bronze-layer tables, which are already stored as Parquet.
# Adjust the relative paths if the notebook runs from a different directory.
df_flights, df_airlines, df_airports = (
    spark.read.parquet(caminho)
    for caminho in (
        "../data/bronze/flights",
        "../data/bronze/airlines",
        "../data/bronze/airports",
    )
)

print(f"Voos carregados: {df_flights.count()}")

26/01/10 19:14:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Voos carregados: 469968


In [3]:
# Print each column of the flights DataFrame with its type and nullability.
df_flights.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (null

In [None]:
# Sample a few cancelled flights (CANCELLED = 1) and look at their
# departure/arrival columns.
(
    df_flights
    .where("CANCELLED = 1")
    .select("FLIGHT_NUMBER", "DEPARTURE_TIME", "DEPARTURE_DELAY", "ARRIVAL_DELAY")
    .show(10)
)

+-------------+--------------+---------------+-------------+
|FLIGHT_NUMBER|DEPARTURE_TIME|DEPARTURE_DELAY|ARRIVAL_DELAY|
+-------------+--------------+---------------+-------------+
|          136|          NULL|           NULL|         NULL|
|         2459|          NULL|           NULL|         NULL|
|         5254|          NULL|           NULL|         NULL|
|         2859|          NULL|           NULL|         NULL|
|         5460|          NULL|           NULL|         NULL|
|         2926|          NULL|           NULL|         NULL|
|         6457|          NULL|           NULL|         NULL|
|         3534|          NULL|           NULL|         NULL|
|         3161|          NULL|           NULL|         NULL|
|          175|          NULL|           NULL|         NULL|
+-------------+--------------+---------------+-------------+
only showing top 10 rows


In [None]:
# Preview the airline lookup table next to the distinct airline codes
# that appear in the flights data.
df_airlines.show(n=5)
(
    df_flights
    .select("AIRLINE")
    .distinct()
    .show(n=5)
)

+---------+--------------------+
|IATA_CODE|             AIRLINE|
+---------+--------------------+
|       UA|United Air Lines ...|
|       AA|American Airlines...|
|       US|     US Airways Inc.|
|       F9|Frontier Airlines...|
|       B6|     JetBlue Airways|
+---------+--------------------+
only showing top 5 rows
+-------+
|AIRLINE|
+-------+
|     UA|
|     NK|
|     AA|
|     EV|
|     B6|
+-------+
only showing top 5 rows


In [None]:
# Date-building experiment: join YEAR/MONTH/DAY into a "Y-M-D" string
# (data_teste), then convert that string to a date column (data_oficial).
(
    df_flights
    .select("YEAR", "MONTH", "DAY")
    .withColumn("data_teste", F.concat_ws("-", "YEAR", "MONTH", "DAY"))
    .withColumn("data_oficial", F.to_date("data_teste"))
    .show(15)
)

+----+-----+---+----------+------------+
|YEAR|MONTH|DAY|data_teste|data_oficial|
+----+-----+---+----------+------------+
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
|2015|    1|  1|  2015-1-1|  2015-01-01|
+----+-----+---+----------+------------+
only showing top 15 rows


In [14]:
# Validate the airline codes used by the flights: a left anti join keeps only
# the flights whose AIRLINE value has no matching IATA_CODE in the airlines table.
voos_sem_companhia = df_flights.join(
    other=df_airlines,
    on=df_flights["AIRLINE"] == df_airlines["IATA_CODE"],
    how="left_anti",
)

# Number of flights without a matching airline.
count_orfãos = voos_sem_companhia.count()

print(f"Quantidade de voos com código de companhia inválido: {count_orfãos}")

Quantidade de voos com código de companhia inválido: 0


In [15]:
# Airline-code validation, method 2: set difference between the two code lists.
codigos_voos = (
    df_flights
    .select("AIRLINE")
    .distinct()
)
codigos_cias = (
    df_airlines
    .select("IATA_CODE")
    .distinct()
)

# (codes used by flights) minus (codes registered in the airlines table);
# an empty result means every code used by a flight is registered.
diferenca = codigos_voos.subtract(codigos_cias)

diferenca.show()

+-------+
|AIRLINE|
+-------+
+-------+



In [16]:
# Silver-layer sanity check: load the enriched flights table, print its schema
# and preview a few of its columns.
df_silver = spark.read.parquet("../data/silver/flights_enriched")
df_silver.printSchema()
(
    df_silver
    .select("FLIGHT_DATE", "AIRLINE_NAME", "ORIGIN_CITY", "FLIGHT_STATUS")
    .show(15)
)

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (null