In [1]:
import sys
from pathlib import Path
# Locate the project root automatically (the folder that contains "src").
# When the working directory is named "notebooks", the root is its parent;
# otherwise the working directory itself is used as the root.
cwd = Path.cwd()
root = cwd.parent if cwd.name == "notebooks" else cwd
# Make the modules under <root>/src importable from this notebook.
sys.path.append(str(root / "src"))

import pandas as pd
from datetime import datetime, timedelta
import time
from pyspark.sql import SparkSession
from api.elhub_api import fetch_elhub_data
from cassandra.cluster import Cluster

In [2]:
# Build (or reuse) a local SparkSession that has the Cassandra connector
# package on its classpath and points at the Cassandra node on 127.0.0.1:9042.
spark = (
    SparkSession.builder
    .appName("ElhubBronze")
    .master("local[*]")
    # Connector artifact fetched through spark.jars.packages
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.5.1")
    # Where to reach Cassandra
    .config("spark.cassandra.connection.host", "127.0.0.1")
    .config("spark.cassandra.connection.port", "9042")
    # Register the connector's SQL extensions and a catalog named "mycatalog"
    .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
    .config("spark.sql.catalog.mycatalog", "com.datastax.spark.connector.datasource.CassandraCatalog")
    # Write consistency level and connection keep-alive setting
    .config("spark.cassandra.output.consistency.level", "ONE")
    .config("spark.cassandra.connection.keepAliveMS", "60000")
    .getOrCreate()
)

print("✅ SparkSession started with Cassandra integration")

25/10/13 12:25:40 WARN Utils: Your hostname, Fabians-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.22 instead (on interface en0)
25/10/13 12:25:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/fabianheflo/.ivy2/cache
The jars for the packages stored in: /Users/fabianheflo/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4b19302c-0719-46bb-b5a6-34bab00deaa4;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/fabianheflo/UNI_courses/IND320/IND320/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.datastax.spark#spark-cassandra-connector_2.12;3.5.1 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.5.1 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.11.0 in central
	found org.apache.cassandra#java-driver-core-shaded;4.18.1 in central
	found com.datastax.oss#native-protocol;1.5.1 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found org.apache.cassandra#java-driver-mapper-runtime;4.18.1 in central
	found org.apache.cassandra#java-driver-query-builder;4.18.1 in central
	found org.apache.commons#commons-lang3;3.10 in central
	found com.thoughtworks.paranamer#paranamer;2.8 in central
	found org.scala-lang#scala-reflect

✅ SparkSession started with Cassandra integration


In [3]:
# Smoke-test the API connection: request the one-day window starting
# 2021-01-01 and preview the first rows of the returned DataFrame.
start = datetime(year=2021, month=1, day=1)
end = start + timedelta(days=1)
df = fetch_elhub_data(start, end)
df.head()

Unnamed: 0,endTime,lastUpdatedTime,priceArea,productionGroup,quantityKwh,startTime,meteringgridarea
0,2021-01-01T01:00:00+01:00,2024-12-20T10:35:40+01:00,NO1,hydro,2507716.8,2021-01-01T00:00:00+01:00,NO1
1,2021-01-01T02:00:00+01:00,2024-12-20T10:35:40+01:00,NO1,hydro,2494728.0,2021-01-01T01:00:00+01:00,NO1
2,2021-01-01T03:00:00+01:00,2024-12-20T10:35:40+01:00,NO1,hydro,2486777.5,2021-01-01T02:00:00+01:00,NO1
3,2021-01-01T04:00:00+01:00,2024-12-20T10:35:40+01:00,NO1,hydro,2461176.0,2021-01-01T03:00:00+01:00,NO1
4,2021-01-01T05:00:00+01:00,2024-12-20T10:35:40+01:00,NO1,hydro,2466969.2,2021-01-01T04:00:00+01:00,NO1


In [4]:
# Connect to the Cassandra node on localhost
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()

# Create the keyspace if it does not exist yet
session.execute("""
CREATE KEYSPACE IF NOT EXISTS elhub_data
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}
""")

# Derive the table definition from the columns of df
columns = df.columns
print("📋 Columns from API:", columns)

# The column names are spliced unquoted into the CQL text below, so fail early
# with a readable message if one of them is not a plain identifier
# (for example a stray "Unnamed: 0" index column).
invalid_columns = [col for col in columns if not str(col).isidentifier()]
if invalid_columns:
    raise ValueError(f"Column names not usable as CQL identifiers: {invalid_columns}")

# Build a dynamic CREATE TABLE statement; every column is stored as TEXT
schema = ",\n    ".join([f"{col} TEXT" for col in columns])

# Cassandra writes are upserts on the PRIMARY KEY: two rows with the same key
# collapse into one. Taking simply the first three columns
# (endTime, lastUpdatedTime, priceArea) leaves productionGroup out of the key,
# so rows for different production groups in the same hour and price area
# would overwrite each other. Prefer the natural key when those columns exist
# and fall back to the first three columns otherwise.
# NOTE(review): assumes (priceArea, startTime, productionGroup) identifies a
# row uniquely - confirm against the Elhub API.
preferred_key = ("pricearea", "starttime", "productiongroup")
by_lower_name = {str(col).lower(): col for col in columns}
if all(name in by_lower_name for name in preferred_key):
    key_columns = [by_lower_name[name] for name in preferred_key]
else:
    key_columns = list(columns[:3])
primary_key = ", ".join(key_columns)

# IF NOT EXISTS leaves an already existing table (and its key) untouched;
# drop the table first if it was created earlier with a different key.
query = f"""
CREATE TABLE IF NOT EXISTS elhub_data.production_raw (
    {schema},
    PRIMARY KEY ({primary_key})
)
"""

session.execute(query)
print("✅ Table created dynamically based on Elhub DataFrame schema")

📋 Columns from API: Index(['endTime', 'lastUpdatedTime', 'priceArea', 'productionGroup',
       'quantityKwh', 'startTime', 'meteringgridarea'],
      dtype='object')
✅ Table created dynamically based on Elhub DataFrame schema


In [11]:
# Testing aid: empty the raw table so the load below starts from a clean slate
truncate_cql = "TRUNCATE elhub_data.production_raw"
session.execute(truncate_cql)
print("🧹 Cleared existing data from production_raw")

🧹 Cleared existing data from production_raw


In [14]:
# Load window: fetched in 7-day slices from start_date up to end_date
start_date = datetime(2021, 1, 1)
end_date = datetime(2022, 1, 1)  # exact end; the last slice is clamped to it
delta = timedelta(days=7)

batch = 1
current = start_date

while current < end_date:
    # Never let a slice run past end_date
    next_batch = min(current + delta, end_date)
    print(f"📦 Batch {batch}: Fetching data for {current.date()} → {next_batch.date()}")

    df = fetch_elhub_data(current, next_batch)

    if df.empty:
        print(f"⚠️ Batch {batch}: No data found ({current.date()} → {next_batch.date()})\n")
    else:
        # Use lowercase column names for Cassandra
        df.columns = [name.lower() for name in df.columns]

        # Convert to a Spark DataFrame and append it to the raw table
        sdf = spark.createDataFrame(df)
        (
            sdf.write
            .format("org.apache.spark.sql.cassandra")
            .mode("append")
            .options(table="production_raw", keyspace="elhub_data")
            .save()
        )

        print(f"✅ Batch {batch}: Saved {len(df)} rows ({current.date()} → {next_batch.date()})\n")

    # Advance to the next slice and pause briefly between requests
    current = next_batch
    batch += 1
    time.sleep(0.5)

📦 Batch 1: Fetching data for 2021-01-01 → 2021-01-08
✅ Batch 1: Saved 4032 rows (2021-01-01 → 2021-01-08)

📦 Batch 2: Fetching data for 2021-01-08 → 2021-01-15
✅ Batch 2: Saved 4032 rows (2021-01-08 → 2021-01-15)

📦 Batch 3: Fetching data for 2021-01-15 → 2021-01-22
✅ Batch 3: Saved 4032 rows (2021-01-15 → 2021-01-22)

📦 Batch 4: Fetching data for 2021-01-22 → 2021-01-29
✅ Batch 4: Saved 4032 rows (2021-01-22 → 2021-01-29)

📦 Batch 5: Fetching data for 2021-01-29 → 2021-02-05
✅ Batch 5: Saved 4032 rows (2021-01-29 → 2021-02-05)

📦 Batch 6: Fetching data for 2021-02-05 → 2021-02-12
✅ Batch 6: Saved 4032 rows (2021-02-05 → 2021-02-12)

📦 Batch 7: Fetching data for 2021-02-12 → 2021-02-19
✅ Batch 7: Saved 4032 rows (2021-02-12 → 2021-02-19)

📦 Batch 8: Fetching data for 2021-02-19 → 2021-02-26
✅ Batch 8: Saved 4032 rows (2021-02-19 → 2021-02-26)

📦 Batch 9: Fetching data for 2021-02-26 → 2021-03-05
✅ Batch 9: Saved 4032 rows (2021-02-26 → 2021-03-05)

📦 Batch 10: Fetching data for 2021-03

In [15]:
# Lowercase the column names before writing to Cassandra
df.columns = [name.lower() for name in df.columns]

# Append the current contents of df to elhub_data.production_raw
sdf = spark.createDataFrame(df)
(
    sdf.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .options(table="production_raw", keyspace="elhub_data")
    .save()
)

print("✅ Data written to Cassandra (bronze layer)")

✅ Data written to Cassandra (bronze layer)


In [19]:
# Read the raw table back through the Cassandra connector and show the
# last 16 rows as a quick check of what was stored.
(
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="production_raw", keyspace="elhub_data")
    .load()
    .tail(16)
)

                                                                                

[Row(pricearea='NO3', starttime=datetime.datetime(2021, 12, 31, 20, 0), productiongroup='wind', endtime=datetime.datetime(2021, 12, 31, 21, 0), lastupdatedtime=datetime.datetime(2024, 10, 27, 7, 56, 27), meteringgridarea='NO3', quantitykwh=447099.28),
 Row(pricearea='NO3', starttime=datetime.datetime(2021, 12, 31, 21, 0), productiongroup='hydro', endtime=datetime.datetime(2021, 12, 31, 22, 0), lastupdatedtime=datetime.datetime(2024, 10, 27, 7, 56, 27), meteringgridarea='NO3', quantitykwh=2767937.0),
 Row(pricearea='NO3', starttime=datetime.datetime(2021, 12, 31, 21, 0), productiongroup='other', endtime=datetime.datetime(2021, 12, 31, 22, 0), lastupdatedtime=datetime.datetime(2024, 10, 27, 7, 56, 27), meteringgridarea='NO3', quantitykwh=63.735),
 Row(pricearea='NO3', starttime=datetime.datetime(2021, 12, 31, 21, 0), productiongroup='solar', endtime=datetime.datetime(2021, 12, 31, 22, 0), lastupdatedtime=datetime.datetime(2024, 10, 27, 7, 56, 27), meteringgridarea='NO3', quantitykwh=54.9