In [1]:
days = ""

In [2]:
# Parameters
days = 30


In [3]:
import os
import requests
import json
from dotenv import load_dotenv
from datetime import datetime, timedelta, timezone
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, posexplode, col, size,to_date, concat_ws, lit
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, ArrayType

In [4]:
load_dotenv()
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@17/libexec/openjdk.jdk/Contents/Home"

In [5]:
# Get Jar path needed for spark session
# For simplicity using locally downloaded jars for delta format
cwd = os.getcwd()
if cwd.endswith("notebooks"):
    proj_dir = os.path.abspath("..")
else:
    proj_dir = cwd
jar_dir = os.path.join(proj_dir, "jars")
jar1 = os.path.join(jar_dir, "delta-spark_2.13-4.0.0.jar")
jar2 = os.path.join(jar_dir, "delta-storage-4.0.0.jar")

In [6]:
spark = SparkSession.builder.appName("EnergyUseCase") \
            .config("spark.jars", f"{jar1},{jar2}") \
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
            .getOrCreate()




25/06/29 19:21:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [7]:
# API Configuration
BASE_URL = "https://api.energy-charts.info"
ENDPOINT_CONFIG = { 
    "installed_power": {
        "path": "/installed_power",
        "params": ["country", "time_step", "installation_decommission"]
    }
}

In [8]:
# Pipeline Parameters
country= "de"
time_step = "monthly"
#start_time = "2025-04-01 00:00"
#end_time = "2025-04-30 00:00"
end_time = datetime.now(timezone.utc)
start_time = end_time - timedelta(days=days)

In [9]:
def fetch_api_data(endpoint_name, **kwargs):
    """
    Fetch JSON data from the energy charts 
    """

    if endpoint_name not in ENDPOINT_CONFIG:
        raise ValueError(f"Unsupported endpoint: {endpoint_name}")
    
    config = ENDPOINT_CONFIG[endpoint_name]
    path = config["path"]
    required_params = config["params"]
    missing = [p for p in required_params if p not in kwargs]
    if missing:
        raise ValueError(f"Missing required params: {missing} for endpoint '{endpoint_name}'")
    params = {k: v for k, v in kwargs.items() if k in required_params}
    url = f"{BASE_URL}{path}"
    try:
        print(f"Fetching data from api with these params - {params}")
        response = requests.get(url, params=params, verify=False)
        print(f"Status Code: {response.status_code}")
        data = response.json() 
        if not data:
            print("Empty response received.")
            return None
        return data
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None

In [10]:
# Fetch installed power
installed_power = fetch_api_data("installed_power",
                            country=country,
                            time_step=time_step,  
                           installation_decommission=False
                        )
# Create raw dataframe 
if installed_power is not None and isinstance(installed_power, dict):
    # Define schema
    installed_power_schema = StructType([
        StructField("time", ArrayType(StringType()), True),
        StructField("production_types", ArrayType(
            StructType([
                StructField("name", StringType(), True),
                StructField("data", ArrayType(FloatType()), True)
            ])
        ), True),
        StructField("deprecated", BooleanType(), True)
    ])
    installed_power_df = spark.createDataFrame([installed_power], schema=installed_power_schema)
    installed_power_df.show(3)

Fetching data from api with these params - {'country': 'de', 'time_step': 'monthly', 'installation_decommission': False}




Status Code: 200


[Stage 0:>                                                          (0 + 1) / 1]

                                                                                

+--------------------+--------------------+----------+
|                time|    production_types|deprecated|
+--------------------+--------------------+----------+
|[01.2002, 02.2002...|[{Biomass, [0.0, ...|     false|
+--------------------+--------------------+----------+



In [11]:
installed_power_df = installed_power_df.withColumn("production_type", explode("production_types"))
installed_power_df = installed_power_df.select(
    col("time"),
    col("production_type.name").alias("production_type"),
    posexplode(col("production_type.data")).alias("pos", "value")
)
installed_power_df = installed_power_df.filter(col("pos") < size(col("time")))
installed_power_df = installed_power_df.withColumn("monthyear", col("time")[col("pos")]).drop("time")
# Format date filed        
installed_power_df = installed_power_df.withColumn("date", to_date(concat_ws("-", lit("01"), col("monthyear")), "dd-MM.yyyy") )

# Filter dataframe to get required data based on start and end time
start_year, start_month = start_time.year, start_time.month
end_year, end_month = start_time.year, start_time.month
start_filter = to_date(lit(f"{start_year}-{start_month}-01"))
end_filter = to_date(lit(f"{end_year}-{end_month}-01"))
if start_year == end_year and start_month == end_month:
    filtered = installed_power_df.filter(col("date") == start_filter)
else:
    filtered = installed_power_df.filter((col("date") >= start_filter) & (col("date") <= end_filter))

installed_power_data = filtered.select("date", 
                                        "production_type", 
                                        col("value").alias("installed_power")).drop("monthyear")
installed_power_data = installed_power_data.dropDuplicates() 
# Write to storage in Delta format
installed_power_data.write \
                    .format("delta") \
                    .mode("append") \
                    .option("mergeSchema", "true") \
                    .partitionBy("date") \
                    .save(f"{proj_dir}/data/silver/installed_power_data")  

installed_power_data.show(10)
installed_power_data.printSchema()



[Stage 5:>                                                          (0 + 1) / 1]                                                                                

25/06/29 19:22:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.




+----------+--------------------+---------------+
|      date|     production_type|installed_power|
+----------+--------------------+---------------+
|2025-05-01|Battery Storage (...|         20.537|
|2025-05-01|Battery Storage (...|         13.961|
|2025-05-01|         Solar gross|        106.399|
|2025-05-01|             Biomass|          9.202|
|2025-05-01|           Solar net|         94.911|
|2025-05-01|       Wind offshore|          9.215|
|2025-05-01|        Wind onshore|          65.04|
+----------+--------------------+---------------+

root
 |-- date: date (nullable = false)
 |-- production_type: string (nullable = true)
 |-- installed_power: float (nullable = true)

