### 01_cassandra_setup.ipynb
### Purpose: Set up Cassandra keyspace and table, and test Spark–Cassandra connection

In [7]:
from pyspark.sql import SparkSession
from cassandra.cluster import Cluster
from datetime import datetime

In [None]:
# Start Spark session with Cassandra connector
spark = SparkSession.builder \
    .appName("ElhubCassandraSetup") \
    .master("local[*]") \
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.5.1") \
    .config("spark.cassandra.connection.host", "127.0.0.1") \
    .config("spark.cassandra.connection.port", "9042") \
    .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions") \
    .config("spark.sql.catalog.mycatalog", "com.datastax.spark.connector.datasource.CassandraCatalog") \
    .config("spark.cassandra.output.consistency.level", "ONE") \
    .config("spark.cassandra.connection.keepAliveMS", "60000") \
    .getOrCreate()

print("✅ SparkSession started with Cassandra integration")

25/10/09 15:55:07 WARN Utils: Your hostname, Fabians-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.20.6.230 instead (on interface en0)
25/10/09 15:55:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/fabianheflo/.ivy2/cache
The jars for the packages stored in: /Users/fabianheflo/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-19562782-6c3f-4696-b449-c2992e72d337;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/fabianheflo/UNI_courses/IND320/IND320/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.datastax.spark#spark-cassandra-connector_2.12;3.5.1 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.5.1 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.11.0 in central
	found org.apache.cassandra#java-driver-core-shaded;4.18.1 in central
	found com.datastax.oss#native-protocol;1.5.1 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found org.apache.cassandra#java-driver-mapper-runtime;4.18.1 in central
	found org.apache.cassandra#java-driver-query-builder;4.18.1 in central
	found org.apache.commons#commons-lang3;3.10 in central
	found com.thoughtworks.paranamer#paranamer;2.8 in central
	found org.scala-lang#scala-reflect

✅ SparkSession started with Cassandra integration


In [3]:
# Connect directly to Cassandra
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()

# Create keyspace (only if it doesn't already exist)
session.execute("""
CREATE KEYSPACE IF NOT EXISTS elhub_data
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}
""")

# Create table (matching Elhub PRODUCTION_PER_GROUP_MBA_HOUR schema)
session.execute("""
CREATE TABLE IF NOT EXISTS elhub_data.production_raw (
    meteringgridarea TEXT,
    time_start TIMESTAMP,
    productiontype TEXT,
    businesstype TEXT,
    quantity DOUBLE,
    resolution TEXT,
    resourceid TEXT,
    time_end TIMESTAMP,
    PRIMARY KEY (meteringgridarea, time_start, productiontype)
)
""")

print("✅ Keyspace and table created successfully")

✅ Keyspace and table created successfully


In [4]:
# Sample data to test write
data = [
    ("NO1", datetime.utcnow(), "Hydro", "A04", 123.45, "PT1H", "RES001", datetime.utcnow())
]
columns = [
    "meteringgridarea", "time_start", "productiontype", "businesstype",
    "quantity", "resolution", "resourceid", "time_end"
]

df = spark.createDataFrame(data, columns)

# Write to Cassandra
df.write.format("org.apache.spark.sql.cassandra") \
    .mode("append") \
    .options(table="production_raw", keyspace="elhub_data") \
    .save()

print("✅ Test row written to Cassandra successfully")

  ("NO1", datetime.utcnow(), "Hydro", "A04", 123.45, "PT1H", "RES001", datetime.utcnow())
[Stage 0:>                                                        (0 + 10) / 10]

✅ Test row written to Cassandra successfully


                                                                                

In [5]:
df_read = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="production_raw", keyspace="elhub_data") \
    .load()

df_read.show()
print(f"✅ Read {df_read.count()} rows from Cassandra")

+----------------+--------------------+--------------+------------+--------+----------+----------+--------------------+
|meteringgridarea|          time_start|productiontype|businesstype|quantity|resolution|resourceid|            time_end|
+----------------+--------------------+--------------+------------+--------+----------+----------+--------------------+
|             NO1|2025-10-09 13:55:...|         Hydro|         A04|  123.45|      PT1H|    RES001|2025-10-09 13:55:...|
+----------------+--------------------+--------------+------------+--------+----------+----------+--------------------+

✅ Read 1 rows from Cassandra


### Delete Rows in the Table and Keeps Columns and Keys

In [9]:
cluster = Cluster(['127.0.0.1'])
session = cluster.connect('elhub_data')
session.execute("TRUNCATE production_raw;")

df_read.show()
print("✅ Table elhub_data.production_raw has been cleared.")

+----------------+----------+--------------+------------+--------+----------+----------+--------+
|meteringgridarea|time_start|productiontype|businesstype|quantity|resolution|resourceid|time_end|
+----------------+----------+--------------+------------+--------+----------+----------+--------+
+----------------+----------+--------------+------------+--------+----------+----------+--------+

✅ Table elhub_data.production_raw has been cleared.
