In [None]:
from pyspark.sql import SparkSession

import plotly.express as px
import ipywidgets as widgets

import pandas as pd

In [2]:
# Build (or reuse) a local Spark session wired to the Cassandra connector.
# The connector package is resolved through spark.jars.packages at start-up.
spark = (
    SparkSession.builder
    .appName("ElhubGold")
    .master("local[*]")
    # Connector artifact and the Cassandra node to talk to.
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.5.1")
    .config("spark.cassandra.connection.host", "127.0.0.1")
    .config("spark.cassandra.connection.port", "9042")
    # Register the connector's SQL extensions and expose Cassandra as catalog "mycatalog".
    .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
    .config("spark.sql.catalog.mycatalog", "com.datastax.spark.connector.datasource.CassandraCatalog")
    # Output consistency level and connection keep-alive (milliseconds).
    .config("spark.cassandra.output.consistency.level", "ONE")
    .config("spark.cassandra.connection.keepAliveMS", "60000")
    .getOrCreate()
)

print("✅ SparkSession started with Cassandra integration")

25/10/13 14:30:57 WARN Utils: Your hostname, Fabians-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.22 instead (on interface en0)
25/10/13 14:30:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/fabianheflo/.ivy2/cache
The jars for the packages stored in: /Users/fabianheflo/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9a2b0e1c-8ac2-45c4-bf73-fa768bd1c005;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/fabianheflo/UNI_courses/IND320/IND320/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.datastax.spark#spark-cassandra-connector_2.12;3.5.1 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.5.1 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.11.0 in central
	found org.apache.cassandra#java-driver-core-shaded;4.18.1 in central
	found com.datastax.oss#native-protocol;1.5.1 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found org.apache.cassandra#java-driver-mapper-runtime;4.18.1 in central
	found org.apache.cassandra#java-driver-query-builder;4.18.1 in central
	found org.apache.commons#commons-lang3;3.10 in central
	found com.thoughtworks.paranamer#paranamer;2.8 in central
	found org.scala-lang#scala-reflect

✅ SparkSession started with Cassandra integration


In [3]:
# Read the "production_silver" table from the "elhub_data" keyspace via the
# Spark Cassandra data source.
silver_df = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(keyspace="elhub_data", table="production_silver")
    .load()
)

# Collect to pandas, parse the timestamps, and derive the month number that the
# month filter in the plotting function compares against.
pdf = silver_df.toPandas().assign(
    starttime=lambda frame: pd.to_datetime(frame["starttime"]),
    month=lambda frame: frame["starttime"].dt.month,
)

                                                                                

In [8]:
# Distinct price areas in ascending order; these feed the area selector.
price_areas = list(pdf["pricearea"].unique())
price_areas.sort()

# Label -> month number for the month selector.
# 0 is the "no month filter" choice; 1-12 are the calendar months.
month_options = {"All year": 0}
month_options.update(
    (month_name, month_number)
    for month_number, month_name in enumerate(
        [
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ],
        start=1,
    )
)

# Area selector, preselecting the first price area.
area_dropdown = widgets.Dropdown(options=price_areas, value=price_areas[0], description="Area:")

# Month selector, preselecting "All year" (value 0).
month_dropdown = widgets.Dropdown(options=month_options, value=0, description="Month:")

In [9]:
def update_visuals(price_area, month):
    """Render a pie chart and a line chart of production for one price area.

    Reads the notebook-level ``pdf`` DataFrame and uses its ``pricearea``,
    ``month``, ``starttime``, ``productiongroup`` and ``quantitykwh`` columns.

    Parameters
    ----------
    price_area
        Value matched against the ``pricearea`` column.
    month
        Month number matched against the ``month`` column, or ``0`` to keep
        every month.

    Returns
    -------
    None
        Both figures are displayed via ``Figure.show()``.
    """
    # Build one row mask: always restrict to the area, optionally to the month.
    row_mask = pdf["pricearea"] == price_area
    if month == 0:
        title_suffix = "Whole year"
    else:
        row_mask = row_mask & (pdf["month"] == month)
        title_suffix = f"Month: {month}"
    subset = pdf.loc[row_mask]

    # Pie chart: summed quantitykwh per production group.
    totals_per_group = subset.groupby("productiongroup")["quantitykwh"].sum()
    pie_data = totals_per_group.reset_index()
    fig_pie = px.pie(
        pie_data,
        values="quantitykwh",
        names="productiongroup",
        title=f"Total production in {price_area} ({title_suffix})",
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )

    # Line chart: summed quantitykwh per start time, one line per production group.
    totals_per_time = subset.groupby(["starttime", "productiongroup"])["quantitykwh"].sum()
    line_data = totals_per_time.reset_index()
    fig_line = px.line(
        line_data,
        x="starttime",
        y="quantitykwh",
        color="productiongroup",
        title=f"Production over time in {price_area} ({title_suffix})",
        labels={"quantitykwh": "kWh", "starttime": "Time"},
    )

    # Display the pie first, then the line chart.
    fig_pie.show()
    fig_line.show()

### Interactive Plot

- Choose the price area with the "Area" dropdown.
- Choose the month with the "Month" dropdown. "All year" is the default.
- Click an entry in the legend to hide that group from the plot. This makes it easier to analyse the smaller groups.


In [1]:
# Bind the two dropdowns to update_visuals so both charts redraw on every change.
# This cell relies on `widgets`, `update_visuals`, `area_dropdown` and
# `month_dropdown` from the cells above, so run those first.
widgets.interactive(update_visuals, price_area=area_dropdown, month=month_dropdown)

NameError: name 'widgets' is not defined