### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [54]:


import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())

global azure_account_name
global azure_data_container_name
global azure_user_container_name
global azure_user_token

azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"
azure_user_token = r"sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")

        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()
 

# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

### Assignment 1 ###

The code below demonstrates how to explore and load the data provided for the assignment from Azure Blob Storage and how to save any outputs that you generate to a separate user container.

**Key points**

- The data provided for the assignment is stored in Azure Blob Storage and outputs that you generate will be stored in Azure Blob Storage as well. Hadoop and Spark can both interact with Azure Blob Storage similar to how they interact with HDFS, but where the replication and distribution is handled by Azure instead. This makes it possible to read or write data in Azure over HTTPS where the path is prefixed by `wasbs://`.
- There are two containers, one for the data which is read only and one for any outputs that you generate,
  - `wasbs://campus-data@madsstorage002.blob.core.windows.net/`
  - `wasbs://campus-user@madsstorage002.blob.core.windows.net/`
- You can use variable interpolation to insert your global username variable into paths automatically.
  - This works for bash commands as well.

In [2]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)

25/03/29 19:26:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.uco-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:00:18Z&se=2025-09-19T16:00:18Z&spr=https&sv=2022-11-02&sr=c&sig=qtg6fCdoFz6k3EJLw7dA8D3D8wN0neAYw8yG4z4Lw2o%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.kubernetes.namespace,dew59
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"""
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.driver.memory,1g
spark.app.submitTime,1743229568824
spark.kubernetes.executor.podNamePrefix,dew59-notebook-2eeb8d95e0944ac7
spark.driver.extraJavaOptions,-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -Dderby.system.home=/tmp/dew59/spark/


In [3]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *

In [28]:
# Use the hdfs command to explore the data in Azure Blob Storage
# http://localhost:4041/jobs/
import subprocess

remotepath = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/'
print(f'remotepath: {remotepath}')
remotedaily = f'{remotepath}/ghcnd/'
print(f'remotedaily: {remotedaily}')
!hdfs dfs -du -h {remotepath}             #file sizes


#! HADOOP_ROOT_LOGGER="WARNING" hdfs dfs -ls wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd/
#! HADOOP_ROOT_LOGGER="WARNING" hdfs dfs -ls wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/ghcnd/daily/

command = f'hdfs dfs -du -h {remotepath}'
try:
    result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(result.stdout.decode())  # Output the results of the command
except subprocess.CalledProcessError as e:
    print(f"Error occurred: {e.stderr.decode()}")

remotepath: wasbs://campus-data@madsstorage002.blob.core.windows.net/
remotedaily: wasbs://campus-data@madsstorage002.blob.core.windows.net//ghcnd/
2025-03-29 20:20:09,173 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-03-29 20:20:09,440 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2025-03-29 20:20:09,487 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2025-03-29 20:20:09,487 INFO impl.MetricsSystemImpl: azure-file-system metrics system started
13.1 G   13.1 G   wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd
1.9 K    1.9 K    wasbs://campus-data@madsstorage002.blob.core.windows.net/helloworld
279.9 M  279.9 M  wasbs://campus-data@madsstorage002.blob.core.windows.net/crime
3.7 M    3.7 M    wasbs://campus-data@madsstorage002.blob.core.windows.net/openflights
12.9 G   12.9 G   wasbs://campus-data@madsstorage002.blob.core.wind

In [30]:
# Load a subset of the last year in daily into Spark from Azure Blob Storage using spark.read.csv
# Define the input path for the last year in daily

daily_relative_path = f'/ghcnd/daily/2025.csv.gz'
print(f'daily_relative_path: {daily_relative_path}')
print(f'remotepath: {remotepath}')
remote2025csv = f'{remotepath}{daily_relative_path}'
print(f'remote2025csv: {remote2025csv}')

#daily_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/{daily_relative_path}'
#print(daily_path)

daily_relative_path: /ghcnd/daily/2025.csv.gz
remotepath: wasbs://campus-data@madsstorage002.blob.core.windows.net/
remote2025csv: wasbs://campus-data@madsstorage002.blob.core.windows.net//ghcnd/daily/2025.csv.gz


In [33]:
#daily  = spark.read.csv(daily_path).limit(1000)
dailydf = spark.read.csv(remote2025csv).limit(1000)

#print(type(daily))
print("print(type(weekly))")
print(type(dailydf))

#daily.printSchema()
print("dailydf.printSchema()")
dailydf.printSchema()

#print(daily)
print("print(dailydf)")
print(dailydf)

print("dailydf.show(20, False):")
dailydf.show(20, False)

print(type(weekly))
<class 'pyspark.sql.dataframe.DataFrame'>
dailydf.printSchema()
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)

print(dailydf)
DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string]
dailydf.show(20, False):
+-----------+--------+----+---+----+----+---+----+
|_c0        |_c1     |_c2 |_c3|_c4 |_c5 |_c6|_c7 |
+-----------+--------+----+---+----+----+---+----+
|ASN00037106|20250101|PRCP|0  |NULL|NULL|a  |NULL|
|ASN00037115|20250101|PRCP|0  |NULL|NULL|a  |NULL|
|ASN00037118|20250101|PRCP|0  |NULL|NULL|a  |NULL|
|ASN00037120|20250101|PRCP|0  |NULL|NULL|a  |NULL|
|ASN00038010|20250101|PRCP|0  |NULL|NULL|a  |NULL|
|ASN00038026|20250101|TMAX|419|NULL|NULL|a  |NULL|
|ASN00038026|2025

In [46]:
# Define the input path for stations

stations_relative_path = f'ghcnd/ghcnd-stations.txt'
stations_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/{stations_relative_path}'
print(stations_path)
remotestation = f'{remotepath}{stations_relative_path}'
print(f'remotestation: {remotestation}')

wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/ghcnd-stations.txt
remotestation: wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/ghcnd-stations.txt


In [48]:
# Load the stations metadata into Spark from Azure Blob Storage using spark.read.text without any other processing

stations = spark.read.text(stations_path).limit(1000)
print(type(stations))
stations.printSchema()
print(stations)
stations.show(20, False)

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- value: string (nullable = true)

DataFrame[value: string]
+-------------------------------------------------------------------------------------+
|value                                                                                |
+-------------------------------------------------------------------------------------+
|ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       |
|ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    |
|AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196|
|AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194|
|AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217|
|AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218|
|AF000040930  35.3170   69.0170 3366.0    NORTH-SALANG                   GSN     40930|
|AFM000409

In [49]:
# Load the stations metadata into Spark from Azure Blob Storage using spark.read.text without any other processing

stationdf = spark.read.text(remotestation).limit(1000)
print(f'type(stationdf):{type(stationdf)}')
print("stationdf.printSchema()")
stationdf.printSchema()
print("print(stationdf)")
print(stationdf)
print("stationdf.show(20, False)")
stationdf.show(20, False)

type(stationdf):<class 'pyspark.sql.dataframe.DataFrame'>
stationdf.printSchema()
root
 |-- value: string (nullable = true)

print(stationdf)
DataFrame[value: string]
stationdf.show(20, False)
+-------------------------------------------------------------------------------------+
|value                                                                                |
+-------------------------------------------------------------------------------------+
|ACW00011604  17.1167  -61.7833   10.1    ST JOHNS COOLIDGE FLD                       |
|ACW00011647  17.1333  -61.7833   19.2    ST JOHNS                                    |
|AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP            GSN     41196|
|AEM00041194  25.2550   55.3640   10.4    DUBAI INTL                             41194|
|AEM00041217  24.4330   54.6510   26.8    ABU DHABI INTL                         41217|
|AEM00041218  24.2620   55.6090  264.9    AL AIN INTL                            41218|
|AF000040930  3

In [52]:
# Define an output path as an exmaple

output_relative_path = f'{username}/stations'
output_path = f'wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net/{output_relative_path}'
remotestations = f'{remotepath}{output_relative_path}'
print(output_path)
print(f'remotestations: {remotestations}')

wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59/stations
remotestations: wasbs://campus-data@madsstorage002.blob.core.windows.net/dew59/stations


In [53]:
# Save the stations metadata to Azure Blob Storage from Spark

stations.write.mode("overwrite").text(output_path)

25/03/29 20:49:54 WARN AzureFileSystemThreadPoolExecutor: Disabling threads for Delete operation as thread count 0 is <= 1
25/03/29 20:49:55 WARN AzureFileSystemThreadPoolExecutor: Disabling threads for Delete operation as thread count 0 is <= 1


In [21]:
# Use the hdfs command to explore the data in Azure Blob Storage

!hdfs dfs -ls wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net/{username}/stations/

2025-03-26 16:58:51,456 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-03-26 16:58:51,725 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2025-03-26 16:58:51,770 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2025-03-26 16:58:51,770 INFO impl.MetricsSystemImpl: azure-file-system metrics system started
Found 2 items
-rw-r--r--   1 dew59 supergroup          0 2025-03-26 08:51 wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59/stations/_SUCCESS
-rw-r--r--   1 dew59 supergroup      86000 2025-03-26 08:51 wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59/stations/part-00000-d05f6d0c-3adc-4399-ac77-2b0fb22e760b-c000.txt
2025-03-26 16:58:52,146 INFO impl.MetricsSystemImpl: Stopping azure-file-system metrics system...
2025-03-26 16:58:52,146 INFO impl.MetricsSystemImpl: azure-file-system metrics system stopped.
2025-03-26 16

In [22]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()

25/03/26 16:59:45 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
