### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [1]:
# CELL 0: Reset Kernel State
# CELL 0: 
# ------------------------------------------------
# Summary:
# - clear all existing variables and definitions 
# - annoying to debug without it

#  
%reset -f


In [2]:
import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re
from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession, DataFrame


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark.
# These are plain module-level assignments, so they are already global to the notebook.

# Login name of the current user with any "@domain" suffix removed
username = re.sub(r'@.*', '', getpass.getuser())

# Storage account and the two containers (read-only data, per-user output)
azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"

# SAS token passed to Spark for the user container.
# NOTE(review): this is a credential embedded in the notebook and its `se=` field
# carries an expiry timestamp - confirm it is meant to be checked in.
azure_user_token = r"sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with ``str()`` and HTML-escaped before being
    placed in the cells, so entries containing ``&``, ``<`` or ``>`` are shown
    literally instead of being interpreted as markup.

    Args:
        d (dict): mapping to render; one table row per item, in iteration order

    Returns:
        str: the table as a single HTML string (no separators between tags)
    """

    # Imported locally: this file rebinds the module-level name `html` to a
    # list of CSS lines, which would shadow a top-level `import html`.
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Show the first rows of a spark dataframe using the pandas jupyter html output.

    The dataframe is limited to ``n`` rows before it is converted to pandas,
    so only those rows are collected for display.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Display an HTML status panel for the Spark session.

    When both the ``spark`` and ``sc`` globals exist, the panel shows the
    application name, a localhost link built from the port of ``sc.uiWebUrl``,
    and the full Spark configuration as a table. Otherwise it reports the
    session as stopped and links to the cluster Spark UI.
    """

    session_active = 'spark' in globals() and 'sc' in globals()

    if not session_active:
        parts = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            '</ul>',
        ]
        display(HTML(''.join(parts)))
        return

    name = sc.getConf().get("spark.app.name")

    # Only the port of the UI url is kept; the link always points at localhost
    ui_port = sc.uiWebUrl.split(":")[-1]
    conf_table = dict_to_html(dict(sc.getConf().getAll()))

    parts = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        f'<li><a href="http://localhost:{ui_port}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        conf_table,
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(parts)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The core limit is ``executor_instances * executor_cores`` and the number of
    SQL shuffle partitions is four times that. After the session is obtained
    with ``getOrCreate()``, its status is shown with ``display_spark()``.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (int): executor memory, formatted as ``"<value>g"`` (default: 1)
        master_memory (int): driver memory, formatted as ``"<value>g"`` (default: 1)

    NOTE(review): the memory values are interpolated directly into strings such
    as ``"1g"``; a fractional value would produce e.g. ``"1.5g"``, which Spark
    may not accept - confirm before passing floats.
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        # SAS token that authorises access to the user container
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the active Spark session, remove the ``spark`` and ``sc`` globals, and show the status.

    Nothing is stopped when either global is missing; the status panel is
    displayed in both cases.
    """

    global spark, sc

    session_exists = 'spark' in globals() and 'sc' in globals()
    if session_exists:
        spark.stop()

        # Remove both names so display_spark() reports the session as stopped
        del spark, sc

    display_spark()


# Inject notebook-wide css to improve spark output readability:
# - keep <pre> output and dataframe cells on a single line (no wrapping)
# - hide the first header cell and the row header cells of rendered dataframes

html = ['<style>']
html.extend([
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
])
html.append('</style>')
display(HTML(''.join(html)))

### Assignment 1 ###

The code below demonstrates how to explore and load the data provided for the assignment from Azure Blob Storage and how to save any outputs that you generate to a separate user container.

**Key points**

- The data provided for the assignment is stored in Azure Blob Storage and outputs that you generate will be stored in Azure Blob Storage as well. Hadoop and Spark can both interact with Azure Blob Storage similar to how they interact with HDFS, but where the replication and distribution is handled by Azure instead. This makes it possible to read or write data in Azure over HTTPS where the path is prefixed by `wasbs://`.
- There are two containers, one for the data which is read only and one for any outputs that you generate:
  - `wasbs://campus-data@madsstorage002.blob.core.windows.net/`
  - `wasbs://campus-user@madsstorage002.blob.core.windows.net/`
- You can use variable interpolation to insert your global username variable into paths automatically.
  - This works for bash commands as well.

In [3]:
# CELL 1: my common definitions  
# CELL 1: 
# ------------------------------------------------
import traceback 

DEEBUG = True  # debug flag; may be overridden at any point in later cells

def dprintf(b: bool, s: str):
    """Print ``s`` when ``b`` is truthy; otherwise do nothing."""
    if not b:
        return
    print(s)

def hprintf(b: bool, df, n=5):
    """Conditionally print the schema, row count and first ``n`` rows of a Spark DataFrame.

    Only the displayed preview is limited to ``n`` rows; ``df.count()`` is
    still evaluated on the whole DataFrame. Start/end markers are printed
    regardless of ``b``.

    Args:
        b: when truthy the DataFrame is inspected, otherwise only "bool = 0" is printed
        df: Spark DataFrame to inspect
        n: number of rows to show in the preview (default: 5)
    """
    dprintf(1, "hprintf start")
    if not b:
        print("bool = 0")
    else:
        print("bool = 1")
        dprintf(1, f"hprintf: show top {n} rows")
        df.printSchema()
        total = df.count()
        dprintf(1, f"result.count() = {total}")
        preview = df.limit(n).toPandas()
        display(preview)
    dprintf(1, "hprintf end")


def dReadTEXT(b: bool, path: str, show: int = 5):
    """Read a text file into a Spark DataFrame, preview it and return it.

    Uses the global ``spark`` session. The reported duration covers only the
    ``spark.read.text`` call itself, not the preview that follows.

    Args:
        b: when falsy nothing is read and ``None`` is returned
        path: location of the text file(s) to read
        show: number of rows passed to ``hprintf`` for the preview (default: 5)

    Returns:
        The DataFrame, or ``None`` when ``b`` is falsy or any exception was
        raised (the error and its traceback are printed, not re-raised).
    """
    # This cell does not import `time`; import it here so the timing calls do
    # not raise NameError (which the handler below would turn into `None`).
    import time

    try:
        if not b:
            dprintf(1, f"bool = 0, read NO file from: {path}")
            return None

        dprintf(1, f"bool = 1, read file from: {path}")
        start = time.time()
        result = spark.read.text(path)
        stop = time.time()
        hprintf(1, result, show)
        dprintf(1, f"completed in {stop - start:.2f} seconds")
        return result

    except Exception as e:
        # Best-effort helper: report the problem and let the notebook continue
        print("❌ Error caught:", type(e), e)
        traceback.print_exc()
        return None


from pyspark.sql.types import StructType

def dReadCSV(b: bool = 1, path: str = "", bHeader: bool = False, schema: StructType = None, show: int = 5):
    """Read CSV data into a Spark DataFrame, preview it and return it.

    Uses the global ``spark`` session. The result is repartitioned into 8
    partitions. The reported duration covers only building the DataFrame, not
    the preview or the count that follow.

    Args:
        b: when falsy nothing is read and ``None`` is returned
        path: location of the CSV file(s) to read
        bHeader: passed (as a bool) to the reader's "header" option
        schema: schema to apply; when ``None`` no schema is set on the reader
        show: number of rows passed to ``hprintf`` for the preview (default: 5)

    Returns:
        The DataFrame, or ``None`` when ``b`` is falsy or any exception was
        raised (the error and its traceback are printed, not re-raised).
    """
    # This cell does not import `time`; import it here so the timing calls do
    # not raise NameError (which the handler below would turn into `None`).
    import time

    try:
        if schema is not None:
            dprintf(1, f"schema: {schema.simpleString()}")
        else:
            dprintf(1, "no schema")

        if not b:
            dprintf(1, f"bool = 0, read NO file from: {path}")
            return None

        dprintf(1, f"bool = 1, read file from: {path}")
        start = time.time()
        reader = spark.read.option("header", bool(bHeader))
        # DataFrameReader.schema() rejects None, so only apply a schema when
        # one was actually supplied (the default call has none).
        if schema is not None:
            reader = reader.schema(schema)
        result = reader.csv(path).repartition(8)  # leave this for now

        stop = time.time()
        dprintf(1, f"complete in {stop - start:.2f} seconds")
        hprintf(1, result, show)
        count = result.count()
        dprintf(1, f"result.count() = {count}")
        return result

    except Exception as e:
        # Best-effort helper: report the problem and let the notebook continue
        print("❌ Error caught:", type(e), e)
        traceback.print_exc()
        return None







In [4]:
# CELL 1 (revised): my common definitions
# CELL 1 (revised): redefines the helpers from the previous cell, imports time and adds dWritePQ
# ------------------------------------------------
import traceback
import time
from pyspark.sql.types import StructType

DEEBUG = True  # debug flag; may be overridden at any point in later cells

def dprintf(b: bool, s: str):
    """Print ``s`` when ``b`` is truthy; otherwise do nothing."""
    if not b:
        return
    print(s)

def hprintf(b: bool, df, n=5):
    """Conditionally print the schema, row count and first ``n`` rows of a Spark DataFrame.

    Only the displayed preview is limited to ``n`` rows; ``df.count()`` is
    still evaluated on the whole DataFrame. Errors are printed with their
    traceback and never propagate to the caller.

    Args:
        b: when truthy the DataFrame is inspected, otherwise only "bool = 0" is printed
        df: Spark DataFrame to inspect; ``None`` is reported as an error even when ``b`` is falsy
        n: number of rows to show in the preview (default: 5)
    """
    try:
        dprintf(1, "hprintf start")
        # Test for None explicitly: a truthiness test would also reject other
        # falsy objects and can raise for frame types that refuse bool().
        if df is None:
            raise ValueError("❌ DataFrame is None")
        if b:
            dprintf(1, f"hprintf: show top {n} rows")
            df.printSchema()
            count = df.count()
            dprintf(1, f"result.count() = {count}")
            display(df.limit(n).toPandas())
        else:
            print("bool = 0")
        dprintf(1, "hprintf end")
    except Exception as e:
        # Best-effort helper: report the problem and let the notebook continue
        print("❌ Error caught in hprintf:", type(e), e)
        traceback.print_exc()


def dReadTEXT(b: bool, path: str, show: int = 5):
    """Read a text file into a Spark DataFrame, preview it, print its row count and return it.

    Uses the global ``spark`` session. The reported duration covers only the
    ``spark.read.text`` call itself.

    Args:
        b: when falsy nothing is read and ``None`` is returned
        path: location of the text file(s) to read
        show: number of rows passed to ``hprintf`` for the preview (default: 5)

    Returns:
        The DataFrame, or ``None`` when ``b`` is falsy or any exception was
        raised (the error and its traceback are printed, not re-raised).
    """
    try:
        if b:
            dprintf(1, f"filename: {path}")
            t0 = time.time()
            frame = spark.read.text(path)
            elapsed = time.time() - t0

            if frame is None:
                raise Exception("❌ spark.read.text() returned None")

            hprintf(1, frame, show)
            dprintf(1, f"completed in {elapsed:.2f} seconds")
            dprintf(1, f"result.count() = {frame.count()}")
            return frame

        dprintf(1, f"bool = 0, read NO file from: {path}")
        return None

    except Exception as err:
        print("❌ Error caught:", type(err), err)
        traceback.print_exc()
        return None



def dReadCSV(b: bool = 1, path: str = "", bHeader: bool = False, schema: StructType = None, show: int = 5):
    """Read CSV data into a Spark DataFrame, preview it and return it.

    Uses the global ``spark`` session. The result is repartitioned into 8
    partitions. The reported duration covers only building the DataFrame, not
    the preview or the count that follow.

    Args:
        b: when falsy nothing is read and ``None`` is returned
        path: location of the CSV file(s) to read
        bHeader: passed (as a bool) to the reader's "header" option
        schema: schema to apply; when ``None`` no schema is set on the reader
        show: number of rows passed to ``hprintf`` for the preview (default: 5)

    Returns:
        The DataFrame, or ``None`` when ``b`` is falsy or any exception was
        raised (the error and its traceback are printed, not re-raised).
    """
    try:
        dprintf(1, f"path: {path}")

        if schema is not None:
            dprintf(1, f"schema: {schema.simpleString()}")
        else:
            dprintf(1, "no schema")

        if not b:
            dprintf(1, f"bool = 0, read NO file from: {path}")
            return None

        start = time.time()
        reader = spark.read.option("header", bool(bHeader))
        # DataFrameReader.schema() rejects None, so only apply a schema when
        # one was actually supplied (the default call has none).
        if schema is not None:
            reader = reader.schema(schema)
        result = reader.csv(path).repartition(8)  # leave this for now

        stop = time.time()
        hprintf(1, result, show)
        dprintf(1, f"complete in {stop - start:.2f} seconds")
        count = result.count()
        dprintf(1, f"result.count() = {count}")
        return result

    except Exception as e:
        # Best-effort helper: report the problem and let the notebook continue
        print("❌ Error caught:", type(e), e)
        traceback.print_exc()
        return None


def dWritePQ(b: bool = 1, path: str = "", df: DataFrame = None, show: int = 5):
    """Preview a Spark DataFrame and write it to Parquet, overwriting any existing output.

    Args:
        b: when falsy the write is skipped
        path: destination of the Parquet output
        df: DataFrame to write; ``None`` is reported as an error
        show: number of rows passed to ``hprintf`` for the preview (default: 5)

    Returns:
        ``True`` after a completed write; ``False`` when the write was skipped
        or any exception was raised (the error and its traceback are printed,
        not re-raised).
    """
    try:
        dprintf(1, f"dWritePQ: path = {path}")

        if df is None:
            raise ValueError("❌ DataFrame is None — cannot write to Parquet.")

        if not b:
            dprintf(1, f"⚠️  b = 0 — write to {path} skipped")
            return False

        dprintf(1, "Preview before write:")
        hprintf(1, df, show)

        t0 = time.time()
        writer = df.write.mode("overwrite")
        writer.parquet(path)
        elapsed = time.time() - t0

        dprintf(1, f"✅ Write complete in {elapsed:.2f} seconds")
        # The row count is computed from the DataFrame after the write has finished
        dprintf(1, f"📦 Rows written: {df.count()}")
        return True

    except Exception as err:
        print("❌ Error caught during dWritePQ:", type(err), err)
        traceback.print_exc()
        return False


In [5]:
# Run this cell to start a spark session in this notebook
#start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)
# This file is treated as a header file and included in other notebooks; call start_spark() after the file is included.