In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from cassandra.cluster import Cluster

In [2]:
# Build (or reuse) a local Spark session wired up to the Cassandra connector.
spark = (
    SparkSession.builder
    .appName("ElhubSilver")
    .master("local[*]")
    # Connector package pulled in at session start-up
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.5.1")
    # Cassandra endpoint
    .config("spark.cassandra.connection.host", "127.0.0.1")
    .config("spark.cassandra.connection.port", "9042")
    # Cassandra SQL extensions and catalog registration
    .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
    .config("spark.sql.catalog.mycatalog", "com.datastax.spark.connector.datasource.CassandraCatalog")
    # Write consistency and connection keep-alive
    .config("spark.cassandra.output.consistency.level", "ONE")
    .config("spark.cassandra.connection.keepAliveMS", "60000")
    .getOrCreate()
)

print("✅ SparkSession started with Cassandra integration")

:: loading settings :: url = jar:file:/Users/fabianheflo/UNI_courses/IND320/IND320/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/fabianheflo/.ivy2/cache
The jars for the packages stored in: /Users/fabianheflo/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c5ca5206-8177-436b-a124-84af90c6af0d;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector_2.12;3.5.1 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.5.1 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.11.0 in central
	found org.apache.cassandra#java-driver-core-shaded;4.18.1 in central
	found com.datastax.oss#native-protocol;1.5.1 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive

✅ SparkSession started with Cassandra integration


# Production

In [3]:
# Bronze layer: production rows read as-is from elhub_data.production_raw.
bronze_df = (
    spark.read.format("org.apache.spark.sql.cassandra")
    .option("keyspace", "elhub_data")
    .option("table", "production_raw")
    .load()
)

In [4]:
# Silver layer: lower-cased column names, with the timestamp and quantity
# columns cast to their target types directly inside the projection.
silver_df = bronze_df.select(
    col("priceArea").alias("pricearea"),
    col("productionGroup").alias("productiongroup"),
    col("startTime").cast("timestamp").alias("starttime"),
    col("quantityKwh").cast("double").alias("quantitykwh"),
)

In [5]:
# Empty production_silver so the append that follows starts from a clean table.
# The driver connection is only needed for this single statement, so it is shut
# down afterwards instead of staying open (with its pool and threads) for the
# rest of the kernel's lifetime.
cluster = Cluster(["localhost"])
try:
    session = cluster.connect("elhub_data")
    session.execute("TRUNCATE production_silver")
finally:
    # Runs even if connect/execute raises, so no connections are leaked.
    cluster.shutdown()

<cassandra.cluster.ResultSet at 0x112a9ce30>



In [6]:
# Append the silver production rows to elhub_data.production_silver.
(
    silver_df.write.format("org.apache.spark.sql.cassandra")
    .mode("append")
    .option("keyspace", "elhub_data")
    .option("table", "production_silver")
    .save()
)
print("✅ Data written to elhub_data.production_silver (after truncate)")



✅ Data written to elhub_data.production_silver (after truncate)


                                                                                

In [7]:
# Count rows where at least one silver column is NULL.
# A row is incomplete unless every column is non-null, which is the same set
# of rows as OR-ing the individual isNull checks.
missing_values = silver_df.where(
    ~(
        col("pricearea").isNotNull()
        & col("productiongroup").isNotNull()
        & col("starttime").isNotNull()
        & col("quantitykwh").isNotNull()
    )
).count()

if missing_values != 0:
    print(f"⚠️ There are {missing_values} rows with missing values in silver_df")
else:
    print("✅ No missing values in silver_df")



✅ No missing values in silver_df


                                                                                

In [8]:
# Summary statistics (count, mean, stddev, min, max) for silver_df
silver_df.summary("count", "mean", "stddev", "min", "max").show()



+-------+---------+---------------+------------------+
|summary|pricearea|productiongroup|       quantitykwh|
+-------+---------+---------------+------------------+
|  count|   871153|         871153|            871153|
|   mean|     NULL|           NULL| 704176.0272003636|
| stddev|     NULL|           NULL|1459343.4750884627|
|    min|      NO1|          hydro|               0.0|
|    max|      NO5|           wind|       1.0054233E7|
+-------+---------+---------------+------------------+



                                                                                

# Consumption

In [3]:
# Bronze layer: consumption rows read as-is from elhub_data.consumption_raw.
bronze_df_con = (
    spark.read.format("org.apache.spark.sql.cassandra")
    .option("keyspace", "elhub_data")
    .option("table", "consumption_raw")
    .load()
)

In [6]:
# Silver layer: keep the five consumption columns, casting both timestamps
# and the quantity to their target types inside the projection.
silver_df_con = bronze_df_con.select(
    col("pricearea"),
    col("consumptiongroup"),
    col("starttime").cast("timestamp").alias("starttime"),
    col("endtime").cast("timestamp").alias("endtime"),
    col("quantitykwh").cast("double").alias("quantitykwh"),
)

In [16]:

# Make sure the silver consumption table exists before anything is written to it.
# Rows are partitioned by pricearea and clustered by (consumptiongroup, starttime).
CREATE_CONSUMPTION_SILVER_CQL = """
CREATE TABLE IF NOT EXISTS consumption_silver (
    pricearea TEXT,
    consumptiongroup TEXT,
    starttime TIMESTAMP,
    endtime TIMESTAMP,
    quantitykwh DOUBLE,
    PRIMARY KEY ((pricearea), consumptiongroup, starttime)
)
"""

# The driver connection is only needed for this one DDL statement, so it is
# shut down afterwards rather than left open for the kernel's lifetime.
cluster = Cluster(["localhost"])
try:
    session = cluster.connect("elhub_data")
    session.execute(CREATE_CONSUMPTION_SILVER_CQL)
finally:
    # Runs even if connect/execute raises, so no connections are leaked.
    cluster.shutdown()

print("✔ consumption_silver table created or verified.")

✔ consumption_silver table created or verified.




In [None]:
# Empty consumption_silver so the append that follows starts from a clean table.
# The driver connection is only needed for this single statement, so it is shut
# down afterwards instead of staying open (with its pool and threads) for the
# rest of the kernel's lifetime.
cluster = Cluster(["localhost"])
try:
    session = cluster.connect("elhub_data")
    session.execute("TRUNCATE consumption_silver")
finally:
    # Runs even if connect/execute raises, so no connections are leaked.
    cluster.shutdown()



In [5]:
# Append the silver consumption rows to elhub_data.consumption_silver.
(
    silver_df_con.write.format("org.apache.spark.sql.cassandra")
    .mode("append")
    .option("keyspace", "elhub_data")
    .option("table", "consumption_silver")
    .save()
)
print("✅ Data written to elhub_data.consumption_silver (after truncate)")

ERROR:root:KeyboardInterrupt while sending command.>              (15 + 5) / 20]
Traceback (most recent call last):
  File "/Users/fabianheflo/UNI_courses/IND320/IND320/.venv/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/fabianheflo/UNI_courses/IND320/IND320/.venv/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.12/3.12.9/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                

In [18]:
# Count rows where at least one silver consumption column is NULL.
# A row is incomplete unless every column is non-null, which is the same set
# of rows as OR-ing the individual isNull checks.
missing_values = silver_df_con.where(
    ~(
        col("pricearea").isNotNull()
        & col("consumptiongroup").isNotNull()
        & col("starttime").isNotNull()
        & col("endtime").isNotNull()
        & col("quantitykwh").isNotNull()
    )
).count()

if missing_values != 0:
    print(f"⚠️ {missing_values} rows in silver_df_con have missing values")
else:
    print("✔ No missing values in silver_df_con")

✔ No missing values in silver_df_con


                                                                                

In [19]:
# Summary statistics (count, mean, stddev, min, max) for silver_df_con
silver_df_con.summary("count", "mean", "stddev", "min", "max").show()



+-------+---------+----------------+-----------------+
|summary|pricearea|consumptiongroup|      quantitykwh|
+-------+---------+----------------+-----------------+
|  count|   874800|          874800|           874800|
|   mean|     NULL|            NULL|589486.6804404076|
| stddev|     NULL|            NULL|635637.7736697067|
|    min|      NO1|           cabin|         8731.135|
|    max|      NO5|        tertiary|        4264747.0|
+-------+---------+----------------+-----------------+



                                                                                

25/11/14 19:18:19 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1060230 ms exceeds timeout 120000 ms
25/11/14 19:18:19 WARN SparkContext: Killing executors is not supported by current scheduler.
25/11/14 19:34:10 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$