In [1]:
from pyspark.sql import SparkSession

import plotly.express as px
import ipywidgets as widgets

import pandas as pd

In [2]:
# Build (or reuse) a local Spark session that talks to the Cassandra node on
# 127.0.0.1:9042 through the DataStax Spark-Cassandra connector.
spark = (
    SparkSession.builder
    .appName("ElhubGold")
    .master("local[*]")
    # Connector package, resolved by Spark when the session starts
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.5.1")
    # Where the Cassandra node is reachable
    .config("spark.cassandra.connection.host", "127.0.0.1")
    .config("spark.cassandra.connection.port", "9042")
    # Cassandra-specific SQL extensions and catalog
    .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
    .config("spark.sql.catalog.mycatalog", "com.datastax.spark.connector.datasource.CassandraCatalog")
    .config("spark.cassandra.output.consistency.level", "ONE")
    .config("spark.cassandra.connection.keepAliveMS", "60000")
    .getOrCreate()
)

print("✅ SparkSession started with Cassandra integration")

25/10/31 11:26:00 WARN Utils: Your hostname, Fabians-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.20.10.3 instead (on interface en0)
25/10/31 11:26:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/fabianheflo/.ivy2/cache
The jars for the packages stored in: /Users/fabianheflo/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6d471439-ae1d-4441-8da0-6f0e33bc642b;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/fabianheflo/UNI_courses/IND320/IND320/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.datastax.spark#spark-cassandra-connector_2.12;3.5.1 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.5.1 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.11.0 in central
	found org.apache.cassandra#java-driver-core-shaded;4.18.1 in central
	found com.datastax.oss#native-protocol;1.5.1 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found org.apache.cassandra#java-driver-mapper-runtime;4.18.1 in central
	found org.apache.cassandra#java-driver-query-builder;4.18.1 in central
	found org.apache.commons#commons-lang3;3.10 in central
	found com.thoughtworks.paranamer#paranamer;2.8 in central
	found org.scala-lang#scala-reflect

✅ SparkSession started with Cassandra integration


In [3]:
# Lazily reference the elhub_data.production_silver table in Cassandra
cassandra_reader = spark.read.format("org.apache.spark.sql.cassandra")
silver_df = cassandra_reader.options(table="production_silver", keyspace="elhub_data").load()
del cassandra_reader  # keep the notebook namespace limited to silver_df / pdf

# Collect to pandas, parse the timestamps, and derive the calendar month (1-12)
pdf = silver_df.toPandas().assign(
    starttime=lambda frame: pd.to_datetime(frame["starttime"]),
    month=lambda frame: frame["starttime"].dt.month,
)

                                                                                

In [4]:
# Price areas present in the data, sorted for a stable dropdown order
price_areas = sorted(pdf["pricearea"].unique())

# Dropdown label -> month number. 0 is the "no month filter" sentinel that
# update_visuals() checks for; the real months are numbered 1-12.
month_options = {"All year": 0}
month_options.update(
    (label, number)
    for number, label in enumerate(
        [
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ],
        start=1,
    )
)

# Area selector, preselecting the first price area
area_dropdown = widgets.Dropdown(options=price_areas, value=price_areas[0], description="Area:")

# Month selector, preselecting "All year"
month_dropdown = widgets.Dropdown(options=month_options, value=0, description="Month:")

In [5]:
def update_visuals(price_area, month):
    """Show a pie chart and a line chart of production for one price area.

    Reads the notebook-level ``pdf`` DataFrame.

    Parameters
    ----------
    price_area : value compared against ``pdf["pricearea"]``
        Price area to display.
    month : int
        Month number compared against ``pdf["month"]``; ``0`` disables the
        month filter so every row of the area is used.

    Returns
    -------
    None
        Both figures are rendered with ``.show()``.
    """
    in_area = pdf["pricearea"] == price_area

    if month == 0:
        # Sentinel for "All year": keep every row of the area
        subset = pdf[in_area]
        title_suffix = "Whole year"
    else:
        subset = pdf[in_area & (pdf["month"] == month)]
        title_suffix = f"Month: {month}"

    # Share of total production per production group
    totals_per_group = subset.groupby("productiongroup")["quantitykwh"].sum()
    fig_pie = px.pie(
        totals_per_group.reset_index(),
        values="quantitykwh",
        names="productiongroup",
        title=f"Total production in {price_area} ({title_suffix})",
        color_discrete_sequence=px.colors.qualitative.Pastel
    )

    # Production per timestamp, one line per production group
    totals_over_time = subset.groupby(["starttime", "productiongroup"])["quantitykwh"].sum()
    fig_line = px.line(
        totals_over_time.reset_index(),
        x="starttime",
        y="quantitykwh",
        color="productiongroup",
        title=f"Production over time in {price_area} ({title_suffix})",
        labels={"quantitykwh": "kWh", "starttime": "Time"}
    )

    # Render the pie first, then the line chart
    for figure in (fig_pie, fig_line):
        figure.show()

### Interactive Plot

- Choose area with dropdown. 
- Choose a month with the dropdown; "All year" is the default.
- Click a production group in the legend to hide it from the plot; this makes it easier to analyse the smaller groups.


In [6]:
# Bind the two dropdowns to update_visuals(); each keyword names the function
# argument that the widget supplies. The call is the cell's last expression,
# so the resulting interactive widget is what the cell displays.
widgets.interactive(
    update_visuals,
    price_area=area_dropdown,
    month=month_dropdown
)

interactive(children=(Dropdown(description='Area:', options=('NO1', 'NO2', 'NO3', 'NO4', 'NO5'), value='NO1'),…

# Assignment 3 Parts:

## STL and Spectrogram 

In [65]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.tsa.seasonal import STL
from scipy import signal
import numpy as np
import ipywidgets as widgets
from IPython.display import display
from functools import lru_cache

In [36]:
def prepare_production_series(df, pricearea="NO3", productiongroup="hydro"):
    """Build an hourly production series for one price area and production group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pricearea``, ``productiongroup``, ``time``
        and ``quantitykwh``. It is not modified.
    pricearea, productiongroup
        Values used to select rows from ``df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``time`` at hourly frequency, with the columns
        ``quantitykwh`` (summed per timestamp) and ``production_mwh``
        (``quantitykwh / 1000``). Hours missing from the input appear as
        rows filled with NaN.
    """
    in_area = df["pricearea"] == pricearea
    in_group = df["productiongroup"] == productiongroup

    selected = df.loc[in_area & in_group].copy()
    selected["time"] = pd.to_datetime(selected["time"])

    # Collapse duplicate timestamps into a single total per timestamp
    hourly = selected.groupby("time", as_index=False)["quantitykwh"].sum()
    hourly["production_mwh"] = hourly["quantitykwh"] / 1000  # kWh -> MWh for readability

    # Reindex onto a gap-free hourly grid between the first and last timestamp
    return hourly.set_index("time").asfreq("h")

In [None]:
@lru_cache(maxsize=128)
def prepare_data(area, group, period=24, seasonal=13, trend=169):
    """Run a robust STL decomposition for one price area / production group.

    The hourly series is built from the notebook-level ``df`` via
    ``prepare_production_series``. ``df`` is deliberately not a parameter:
    the remaining arguments are hashable and form the ``lru_cache`` key.
    Because ``df`` is not part of that key, call ``prepare_data.cache_clear()``
    after reassigning ``df``, otherwise stale results are returned.

    Parameters
    ----------
    area, group
        Values matched against ``df["pricearea"]`` and ``df["productiongroup"]``.
    period : int
        Seasonal period passed to STL (in samples of the hourly series).
    seasonal : int
        Length of the seasonal smoother, passed to STL unchanged.
    trend : int
        Requested length of the trend smoother. STL only accepts an odd value
        greater than ``period``; the value is raised to the nearest valid one
        when it does not satisfy that.

    Returns
    -------
    pandas.DataFrame
        Hourly frame indexed by ``time`` with ``quantitykwh``,
        ``production_mwh``, ``trend``, ``seasonal`` and ``resid`` columns.
        On a cache hit the very same object is returned, so treat it as
        read-only.
    """
    # Same selection / aggregation / hourly reindexing as the spectrogram path
    df_sel = prepare_production_series(df, area, group)

    # STL cannot handle NaN, so fill the gaps introduced by the hourly grid
    series = df_sel["production_mwh"].interpolate()

    # STL requires an odd trend window that is longer than the period
    if trend <= period:
        trend = period + 1
    if trend % 2 == 0:
        trend += 1

    res = STL(series, period=period, seasonal=seasonal, trend=trend, robust=True).fit()

    df_sel["trend"] = res.trend
    df_sel["seasonal"] = res.seasonal
    df_sel["resid"] = res.resid

    return df_sel

def interactive_stl_decomposition(df, priceareas=None, groups=None,
                                period=24, seasonal_default=13, trend_default=169):
    """Display an interactive four-panel STL decomposition.

    Two dropdowns (area, group) and two sliders (seasonal, trend) drive a
    plot of the original series and its trend, seasonal and residual
    components.

    Parameters
    ----------
    df : pandas.DataFrame
        Only used to populate the dropdown options when ``priceareas`` /
        ``groups`` are not given. The decomposition itself comes from
        ``prepare_data``, which reads the notebook-level ``df``.
    priceareas, groups : sequence, optional
        Dropdown options; default to the sorted unique values of
        ``df["pricearea"]`` and ``df["productiongroup"]``.
    period : int
        Seasonal period forwarded to ``prepare_data``.
    seasonal_default, trend_default : int
        Initial slider positions.

    Returns
    -------
    None
        The controls and the plot output are rendered with ``display``.
    """
    area_choices = priceareas if priceareas is not None else sorted(df["pricearea"].unique())
    group_choices = groups if groups is not None else sorted(df["productiongroup"].unique())

    # Keyed by the argument name of plot() that each control feeds
    controls = {
        "area": widgets.Dropdown(options=area_choices, value=area_choices[0], description="Area:"),
        "group": widgets.Dropdown(options=group_choices, value=group_choices[0], description="Group:"),
        "seasonal": widgets.IntSlider(value=seasonal_default, min=5, max=49, step=4, description="Seasonal"),
        "trend": widgets.IntSlider(value=trend_default, min=25, max=289, step=24, description="Trend"),
    }

    # (column, subplot title / trace name, line colour), listed top to bottom
    panels = (
        ("production_mwh", "Original", "black"),
        ("trend", "Trend", "blue"),
        ("seasonal", "Seasonal", "orange"),
        ("resid", "Residual", "red"),
    )

    def plot(area, group, seasonal, trend):
        decomposed = prepare_data(area, group, period, seasonal, trend)

        fig = make_subplots(
            rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.03,
            subplot_titles=tuple(title for _, title, _ in panels)
        )
        for row, (column, title, colour) in enumerate(panels, start=1):
            trace = go.Scatter(x=decomposed.index, y=decomposed[column], name=title, line=dict(color=colour))
            fig.add_trace(trace, row=row, col=1)

        fig.update_layout(
            height=900,
            width=950,
            title=f"STL Decomposition – {group.capitalize()} ({area})",
            showlegend=False,
            template="plotly_white",
        )
        # Only the bottom panel carries the shared x-axis label
        fig.update_xaxes(title_text="Time", row=4, col=1)
        fig.show()

    ui = widgets.HBox(list(controls.values()))
    out = widgets.interactive_output(plot, controls)
    display(ui, out)


In [38]:
def plot_spectrogram(df, pricearea="NO3", productiongroup="hydro",
                     window_length=256, overlap=128):
    """Build a spectrogram heatmap of hourly production.

    Parameters
    ----------
    df : pandas.DataFrame
        Production data, forwarded to ``prepare_production_series``.
    pricearea, productiongroup
        Selection forwarded to ``prepare_production_series``.
    window_length : int
        Samples per segment (``nperseg`` of ``scipy.signal.spectrogram``).
    overlap : int
        Samples shared by consecutive segments (``noverlap``).

    Returns
    -------
    (fig, (f, t, Sxx))
        The plotly figure, plus the frequency array, segment-time array and
        linear-scale spectrogram exactly as returned by
        ``scipy.signal.spectrogram``.
    """
    hourly = prepare_production_series(df, pricearea, productiongroup)
    filled = hourly["production_mwh"].interpolate()

    # fs=1.0: one sample per row of the hourly series
    freqs, times, power = signal.spectrogram(
        filled, fs=1.0, nperseg=window_length, noverlap=overlap
    )
    # Decibel scale; the small offset keeps log10 finite where the power is 0
    power_db = 10 * np.log10(power + 1e-10)

    heatmap = go.Heatmap(
        z=power_db,
        x=times,
        y=freqs,
        colorscale="Viridis",
        colorbar=dict(title="Power (dB)"),
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title=f"Spectrogram – {productiongroup.capitalize()} ({pricearea})",
        xaxis_title="Time (hours)",
        yaxis_title="Frequency (1/hour)",
        template="plotly_white",
    )

    return fig, (freqs, times, power)

In [13]:
# Independent, time-sorted copy of pdf whose timestamp column is named "time",
# which is the column name the STL and spectrogram helpers expect.
df = (
    pdf.copy()
    .rename(columns={"starttime": "time"})
    .sort_values("time")
)
df.head()

Unnamed: 0,pricearea,productiongroup,time,quantitykwh,month
0,NO3,hydro,2021-01-01,2836774.0,1
8760,NO3,other,2021-01-01,0.0,1
183960,NO5,other,2021-01-01,0.0,1
175200,NO5,hydro,2021-01-01,4068096.5,1
17520,NO3,solar,2021-01-01,19.722,1


In [63]:
# interactive_stl_decomposition() renders its own controls and plot through
# display() and returns None, so there is no figure to capture or echo here.
interactive_stl_decomposition(df)

HBox(children=(Dropdown(description='Area:', options=('NO1', 'NO2', 'NO3', 'NO4', 'NO5'), value='NO1'), Dropdo…

Output()

In [48]:
# plot_spectrogram() returns the figure together with the raw spectrogram
# arrays: frequencies (f), segment times (t) and power (Sxx). The arrays are
# unpacked into notebook variables alongside the figure.
fig_spec, (f, t, Sxx) = plot_spectrogram(df, pricearea="NO3", productiongroup="hydro")
fig_spec.show()