In [3]:
# Inspect Gold data: load the Gold country_stats and daily_summary tables and preview them

import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# 1. LOAD CREDENTIALS (identical to the Silver script)
# The notebook lives in /notebooks, so the project's .env file is one level up.
# NOTE: the path is resolved from the current working directory, so this only
# finds the file when the kernel was started from the notebook's own folder.
env_path = Path.cwd().parent / '.env'
load_dotenv(dotenv_path=env_path)

STORAGE_ACCOUNT = os.getenv("STORAGE_ACCOUNT")
CLIENT_ID       = os.getenv("CLIENT_ID")
TENANT_ID       = os.getenv("TENANT_ID")
CLIENT_SECRET   = os.getenv("CLIENT_SECRET")

# Fail fast when a setting is missing or empty. os.getenv() returns None for an
# unset variable, and None would otherwise be formatted as the text "None" into
# the Spark config keys and abfss:// paths, producing a confusing error much later.
# Only the variable NAMES are reported — never the values.
_required_settings = {
    "STORAGE_ACCOUNT": STORAGE_ACCOUNT,
    "CLIENT_ID": CLIENT_ID,
    "TENANT_ID": TENANT_ID,
    "CLIENT_SECRET": CLIENT_SECRET,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_settings)}. "
        f"Expected them in the environment or in the .env file at {env_path}"
    )

# 2. BUILD THE SPARK SESSION
# The fs.azure.* options below select OAuth (client-credentials token provider)
# for the storage account's dfs endpoint, using the values loaded from the environment.
# getOrCreate() returns an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("Aviation_EDA")
    # Maven coordinates of the Azure storage connector packages.
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.3.4,com.microsoft.azure:azure-storage:8.6.6",
    )
    # Authentication mode for this storage account.
    .config(
        f"fs.azure.account.auth.type.{STORAGE_ACCOUNT}.dfs.core.windows.net",
        "OAuth",
    )
    # Token provider class used to obtain the OAuth token.
    .config(
        f"fs.azure.account.oauth.provider.type.{STORAGE_ACCOUNT}.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    )
    # Client id / secret pair of the identity that accesses the account.
    .config(
        f"fs.azure.account.oauth2.client.id.{STORAGE_ACCOUNT}.dfs.core.windows.net",
        CLIENT_ID,
    )
    .config(
        f"fs.azure.account.oauth2.client.secret.{STORAGE_ACCOUNT}.dfs.core.windows.net",
        CLIENT_SECRET,
    )
    # Tenant-specific token endpoint.
    .config(
        f"fs.azure.account.oauth2.client.endpoint.{STORAGE_ACCOUNT}.dfs.core.windows.net",
        f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/token",
    )
    .getOrCreate()
)

# 3. LOAD THE DATA
# Both tables are read as Parquet from the "gold" container of the storage account.
gold_path_country = f"abfss://gold@{STORAGE_ACCOUNT}.dfs.core.windows.net/country_stats"
gold_path_summary = f"abfss://gold@{STORAGE_ACCOUNT}.dfs.core.windows.net/daily_summary"
# Misspelled legacy name, kept as an alias so any cell still referencing it keeps working.
gold_path_summay = gold_path_summary

df_country_stats = spark.read.parquet(gold_path_country)
df_summary = spark.read.parquet(gold_path_summary)



In [2]:
# 4. PREVIEW
# Take at most five rows and convert them to pandas: the notebook renders a
# pandas DataFrame much more nicely than the plain-text output of .show().
(
    df_country_stats
    .limit(5)
    .toPandas()
)

Unnamed: 0,origin_country,flight_count,avg_speed_ms,avg_altitude_m
0,United States,87,143.04686,6402.460769
1,India,30,162.214333,8360.525455
2,Turkey,16,187.643125,8300.357143
3,France,11,207.511818,8588.432727
4,Austria,9,178.548889,8684.895


In [4]:
# Preview at most five rows of the daily summary table as a pandas DataFrame.
(
    df_summary
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,processed_at,total_global_flights
0,2026-02-15 08:13:00.653109,200
