### Spark notebook ###

This notebook only works in a Jupyter Notebook or JupyterLab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Azure Blob Storage settings shared by the hdfs command line and Spark.
# Names assigned at module level are already global, so no `global`
# declarations are required here.

# Login name of the current user with any "@domain" suffix removed
username = re.sub(r'@.*', '', getpass.getuser())

azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"

# NOTE(review): the SAS token assigned on the next line is a hard-coded
# credential; consider loading it from the environment instead -- confirm
# with the cluster administrators.
azure_user_token = r"sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with ``str()`` and HTML-escaped, so text
    containing ``<``, ``>`` or ``&`` is shown literally instead of being
    interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item.

    Returns:
        str: the HTML markup of the table.
    """
    # Local import: the notebook rebinds the global name `html` to a list.
    from html import escape

    rows = ''.join(
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    )
    return f'<table width="100%" style="width:100%; font-family: monospace;">{rows}</table>'


def show_as_html(df, n=20):
    """Show the first rows of a Spark dataframe via the pandas Jupyter integration.

    Args:
        df: Spark dataframe to preview.
        n (int): number of rows to show (default: 20)
    """
    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Render an HTML status panel for the Spark session.

    The session counts as active when both the ``spark`` and ``sc`` globals
    exist; the panel then shows the application name, a link to the
    application UI and the full configuration. Otherwise a "stopped" panel
    with a link to the cluster Spark UI is shown.
    """
    scope = globals()

    if not ('spark' in scope and 'sc' in scope):
        stopped_panel = ''.join([
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            '</ul>',
        ])
        display(HTML(stopped_panel))
        return

    name = sc.getConf().get("spark.app.name")
    # Only the port of the UI URL is kept; the link is rebuilt against localhost
    ui_port = sc.uiWebUrl.split(":")[-1]
    config_table = dict_to_html(dict(sc.getConf().getAll()))

    active_panel = ''.join([
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        f'<li><a href="http://localhost:{ui_port}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        config_table,
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ])
    display(HTML(active_panel))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The total core count (``executor_instances * executor_cores``) is used for
    ``spark.cores.max``, and ``spark.sql.shuffle.partitions`` is set to four
    partitions per core. The status panel is displayed once the session exists.

    Args:

        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory, passed to Spark as "<value>g" (default: 1)
        master_memory (float): driver memory, passed to Spark as "<value>g" (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        # SAS token that authorises access to the user container
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the active Spark session, if any, and remove the ``spark`` and ``sc`` globals.

    The status panel is displayed afterwards in every case.
    """
    global spark, sc

    scope = globals()
    if 'spark' in scope and 'sc' in scope:
        spark.stop()
        del spark, sc

    display_spark()


# Inject notebook-wide CSS so Spark text output and pandas tables stay readable:
# no wrapping in <pre> blocks or table cells, and the pandas index column hidden.

html = ['<style>']
html += [
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
]
html.append('</style>')
display(HTML(''.join(html)))

### Assignment 2 ###

- MSD containers:
  - `wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/` 

- My container:
  - `wasbs://campus-user@madsstorage002.blob.core.windows.net/`


In [2]:
# Run this cell to start a spark session in this notebook
# (4 executors with 2 cores each; executor and driver memory are both set to "4g")

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

25/10/08 08:43:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.executor.instances,4
spark.driver.memory,4g
spark.kubernetes.namespace,dew59
spark.app.startTime,1759866199141
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.sql.shuffle.partitions,32
spark.driver.extraJavaOptions,-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -Dderby.system.home=/tmp/dew59/spark/


In [3]:
# My Imports
 
from IPython.display     import display  # calls between environments
from math                import acos, atan2, cos, radians, sin, sqrt
from matplotlib.ticker   import FuncFormatter, MaxNLocator
from pathlib             import Path
from pyspark.sql         import DataFrame
from pyspark.sql         import DataFrame as SparkDF
from pyspark.sql         import functions as F, types as T
from pyspark.sql.types   import *
from pyspark.sql.utils   import AnalysisException
from pyspark.sql.window  import Window
from time                import perf_counter  # Add this line for benchmark functions
from typing              import List, Optional, Tuple
from rich.tree           import Tree
from rich.console        import Console
from datetime            import datetime


import itertools         as it
import matplotlib.dates  as mdates
import matplotlib.pyplot as plt
import numpy             as np
import pandas            as pd
import warnings

import math, os, platform, re
import subprocess, sys, time

import time
from datetime import datetime

warnings.filterwarnings("ignore", category=UserWarning)
console = Console()


#The following shows the data structure

In [4]:
# Wall-clock start used as the overall time metric
notebook_run_time = time.time()

# Storage roots explored with the hdfs command:
#   WASBS_DATA - shared dataset folder
#   WASBS_USER - this user's "<username>-A2" folder in the user container
WASBS_DATA  = "wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/"
WASBS_USER  = "wasbs://campus-user@madsstorage002.blob.core.windows.net/{}-A2/".format(username)

print("Spark:", spark.version)
print("PATHS".center(75, "_"))
print("{:<20}:".format("WASBS_DATA"), WASBS_DATA)
print("{:<20}:".format("WASBS_USER"), WASBS_USER)
print()

Spark: 3.5.1
___________________________________PATHS___________________________________
WASBS_DATA          : wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/
WASBS_USER          : wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59-A2/



In [5]:
# HELPER AND DIAGNOSTIC FUNCTIONS

# NOTE: this re-assigns notebook_run_time (also set in an earlier cell), so the
# overall timer restarts whenever this cell is run.
notebook_run_time = time.time()
print("_" * 35 + "HELPER / DIAGNOSTIC FUNCTIONS" + "_" * 35)

def hprint(text: str="", l=50):
    """Print a section header: a blank line, then *text* between underscore runs.

    Args:
        text: header text (default: empty, which prints a plain rule).
        l: target width of the header line in characters (default: 50).
            Text that is already at least this wide is printed without padding
            rather than with padding that grows with the text length.
    """
    pad = max(l - len(text), 0) // 2
    print("\n" + "_" * pad + text + "_" * pad)

def cleanup_parquet_files(cleanup=False):
    """List, and optionally delete, the ``*.parquet`` entries under WASBS_USER.

    Args:
        cleanup (bool): When True, actually DELETES FILES (recursive, forced).
                        When False, only LISTS files.
    """
    hprint("Clean up existing parquet files")

    run = get_ipython().system
    target = f'{WASBS_USER}/*.parquet'
    list_cmd = f'hdfs dfs -ls {target}'

    print("[cleanup] Listing files BEFORE cleanup:")
    run(list_cmd)

    if not cleanup:
        print("\n[info] To actually delete files, call: cleanup_parquet_files(cleanup=True)")
        return

    print("\n[cleanup] Deleting all parquet folders...")
    run(f'hdfs dfs -rm -r -f {target}')

    print("\n[info] Listing files AFTER cleanup:")
    run(list_cmd)
    print("\n[cleanup] Parquet file cleanup complete - ready to restart Processing run with clean schema")

def normalise_ids(df: DataFrame, col: str = "ID") -> DataFrame:
    """
    Single source of truth for ID normalisation.

    Prints the schema and the first 20 rows of *df* for diagnostics, then
    returns a one-column DataFrame named ``ID`` holding the distinct values of
    *col* after trimming whitespace and upper-casing.

    Args:
        df: input Spark DataFrame.
        col: name of the column that holds the IDs (default: "ID").

    Returns:
        DataFrame with a single ``ID`` column of distinct normalised values.
    """
    print(f"[INFO] normalise_ids() on column: {col}")
    df.printSchema()
    df.show(20)
    return df.select(F.upper(F.trim(F.col(col))).alias("ID")).distinct()

def df_as_html(df, n: int = 5, right_align: bool = False, show_index: bool = False):
    """
    Display the first *n* rows of a Spark DataFrame as an untruncated HTML table.

    The rows are converted to pandas and rendered through a Styler. Headers and
    cells are left-aligned by default; with right_align=True the numeric
    columns are right-aligned instead. The index is hidden unless
    show_index=True.
    """
    sample = df.limit(n).toPandas()
    print("[INFO] Converting Spark → pandas for HTML display (rows:", len(sample), ")")
    print("[INFO] right_align (numeric columns):", right_align)

    # Baseline alignment applied to both header and data cells
    left_aligned = [
        {"selector": tag, "props": [("text-align", "left")]}
        for tag in ("th", "td")
    ]
    no_truncation = (
        "display.max_colwidth", None,
        "display.max_columns", None,
        "display.width", None,
    )

    with pd.option_context(*no_truncation):
        styler = sample.style
        if not show_index:
            styler = styler.hide(axis="index")

        styler = styler.set_table_styles(left_aligned, overwrite=True)

        if right_align:
            numeric_cols = list(sample.select_dtypes(include=["number"]).columns)
            print("[INFO] Right-aligning numeric columns:", numeric_cols)
            if numeric_cols:
                styler = styler.set_properties(
                    subset=numeric_cols, **{"text-align": "right"}
                )
        display(styler)

def show_df(df, n: int = 10, name: str = "", right_align: bool = False):
    """
    Print a header rule, the given name and the schema of *df*,
    then show an HTML sample of its first *n* rows.
    """
    hprint()
    print(f"name :  {name}")
    df.printSchema()
    print("[check] sample:")
    df_as_html(df, n=n, right_align=right_align)

def write_parquet(df, dir_as_path: str, df_name:str = ""):
    """Preview *df*, write it as Parquet to *dir_as_path* and report the elapsed time.

    Args:
        df: Spark DataFrame to write.
        dir_as_path: target directory; a trailing slash is added if missing.
        df_name: label printed with the preview (default: empty).

    The write uses overwrite mode. A failing preview does not abort the write.
    """
    started = time.time()
    path = _normalise_dir(dir_as_path)
    print(f"[file] write_parquet  : {path}")
    try:
        # Pass the label by keyword: the second positional parameter of
        # show_df is the row count `n`, not the name.
        show_df(df, name=df_name)
    except Exception as e:
        print("[catch] sample failed:", e)
        os.system(f'hdfs dfs -rm -r -f "{path}"')   # idempotent cleanup
    df.write.mode("overwrite").format("parquet").save(path)
    os.system(f'hdfs dfs -ls -R "{path}"')
    elapsed = time.time() - started
    print(f"[time] write_parquet (min)   : {elapsed/60:5.2f}")
    print(f"[time] write_parquet (sec)   : {elapsed:5.2f}")

def has_parquet(dir_as_path: str) -> bool:
    """Return True when ``hdfs dfs -test -e`` finds a ``_SUCCESS`` marker in the directory."""
    marker = _normalise_dir(dir_as_path) + '_SUCCESS'
    print("\n[check] marker  :", marker)

    rc = os.system(f'hdfs dfs -test -e "{marker}"')
    found = (rc == 0)
    print("[check] rc:", rc, "->", "exists" if found else "missing")
    return found

def _to_spark(df_like, schema=None):
    """
    Return a Spark DataFrame: *df_like* itself when it already is one,
    otherwise the result of ``spark.createDataFrame`` (with *schema* when given).
    """
    if isinstance(df_like, SparkDF):
        return df_like
    if schema:
        return spark.createDataFrame(df_like, schema=schema)
    return spark.createDataFrame(df_like)

def ensure_dir(path: str) -> str:
    """
    Return *path* in directory form (with a trailing slash).

    Raises:
        ValueError: if *path* is None.
    """
    if path is not None:
        return _normalise_dir(path)
    raise ValueError("Path is None")

def _normalise_dir(s: str) -> str:
    """
    Ensure trailing slash so we point to
    the dataset directory (not a file)
    """
    return s if s.endswith("/") else s + "/"

def _success_exists(target_dir: str) -> bool:
    """
    Check whether a dataset directory contains the Hadoop/Spark _SUCCESS marker.

    The check goes through the Hadoop FileSystem API of the active session.
    If that call raises, a one-row Parquet read of the directory is attempted
    instead and its success is treated as "exists".

    Args:
        target_dir: dataset directory; a trailing slash is added if missing.

    Returns:
        True if the marker exists (or the read-probe succeeds), else False.
    """
    # Normalise so the marker is "<dir>/_SUCCESS" rather than "<dir>_SUCCESS"
    if not target_dir.endswith("/"):
        target_dir += "/"
    marker = target_dir + "_SUCCESS"

    jvm = spark._jvm
    hconf = spark._jsc.hadoopConfiguration()
    try:
        uri = jvm.java.net.URI(target_dir)
        fs = jvm.org.apache.hadoop.fs.FileSystem.get(uri, hconf)
        exists = fs.exists(jvm.org.apache.hadoop.fs.Path(marker))
        print(f"[status] _SUCCESS check at: {marker} -> {exists}")
        return bool(exists)
    except Exception as e:
        print(f"[status] _SUCCESS check failed ({e}); attempting read-probe …")

    # Fallback: the directory counts as existing if Spark can read a row from it
    try:
        spark.read.parquet(target_dir).limit(1).count()
    except Exception as e2:
        print(f"[status] read-probe failed ({e2}); treating as not existing.")
        return False
    print(f"[status] read-probe succeeded at: {target_dir}")
    return True

def _count_unique_ids(df: DataFrame) -> int:
    """Return the number of distinct normalised IDs in the ``ID`` column of *df*."""
    unique_ids = normalise_ids(df)
    return unique_ids.count()

 
# Where to save diagnostics (use your username as requested)

# Backward-compatibility aliases: all three names refer to normalise_ids.
# NOTE(review): presumably other cells still call the older names -- confirm before removing.
_ids       = normalise_ids
canon_ids  = normalise_ids
_canon_ids = normalise_ids

#print("[TEST] Using _canon_ids:", _canon_ids(stations).count())
#print("[TEST] Using canon_ids :", canon_ids(stations).count())
#print("[TEST] Using _ids      :", _ids(stations).count())

# : pairwise city distances in km using Spark built-ins 
def pairwise_city_distances_spark(cities, radius_km=6371.0):
    """
    Compute the distance between every unordered pair of cities with Spark built-ins.

    cities: list[tuple[str, float, float]] -> [(name, lat_deg, lon_deg), ...]
    radius_km: sphere radius used by both formulas.
    returns: Spark DataFrame ordered by haversine_km with columns:
             city_a, city_b, haversine_km, slc_km, delta_km, delta_pct
    raises: RuntimeError when there is no active Spark session.
    """
    session = SparkSession.getActiveSession()
    if session is None:
        raise RuntimeError("No active Spark session.")

    city_schema = T.StructType([
        T.StructField(field, dtype, False)
        for field, dtype in (
            ("city", T.StringType()),
            ("lat",  T.DoubleType()),
            ("lon",  T.DoubleType()),
        )
    ])
    city_df = session.createDataFrame(cities, city_schema)

    # Self-join on name order so each unordered pair appears exactly once
    left, right = city_df.alias("a"), city_df.alias("b")
    pair_df = left.join(right, F.col("a.city") < F.col("b.city")).select(
        F.col("a.city").alias("city_a"),
        F.col("b.city").alias("city_b"),
        F.col("a.lat").alias("lat1"),
        F.col("a.lon").alias("lon1"),
        F.col("b.lat").alias("lat2"),
        F.col("b.lon").alias("lon2"),
    )

    radius = F.lit(float(radius_km))
    phi1 = F.radians(F.col("lat1"))
    phi2 = F.radians(F.col("lat2"))
    d_phi = phi2 - phi1
    d_lam = F.radians(F.col("lon2") - F.col("lon1"))

    # Haversine distance
    hav_a = F.sin(d_phi/2)**2 + F.cos(phi1)*F.cos(phi2)*F.sin(d_lam/2)**2
    hav_km = radius * (2*F.atan2(F.sqrt(hav_a), F.sqrt(1 - hav_a)))

    # Spherical law of cosines; the cosine is clamped to [-1, 1] before acos
    cos_angle = F.sin(phi1)*F.sin(phi2) + F.cos(phi1)*F.cos(phi2)*F.cos(d_lam)
    cos_angle = F.greatest(F.lit(-1.0), F.least(F.lit(1.0), cos_angle))
    slc_km = radius * F.acos(cos_angle)

    delta_km = F.abs(hav_km - slc_km)
    delta_pct = F.when(hav_km == 0, F.lit(0.0)).otherwise(delta_km / hav_km * 100.0)

    return (
        pair_df
        .select(
            "city_a",
            "city_b",
            F.round(hav_km, 2).alias("haversine_km"),
            F.round(slc_km, 2).alias("slc_km"),
            F.round(delta_km, 4).alias("delta_km"),
            F.round(delta_pct, 6).alias("delta_pct"),
        )
        .orderBy("haversine_km")
    )

# --- Timing helpers for Spark & pure Python (no extra deps)

def benchmark_python_distances(cities, radius_km=6371.0, repeats=50000):
    """
    Time pure-Python haversine and spherical-law-of-cosines distance loops.

    cities: [(name, lat_deg, lon_deg), ...]  (3 cities => 3 pairs)
    repeats: how many times each formula is evaluated over all pairs
    returns: dict with the seconds spent per formula plus the repeat and pair counts
    """
    coord_pairs = [
        (lat_a, lon_a, lat_b, lon_b)
        for (_, lat_a, lon_a), (_, lat_b, lon_b) in it.combinations(cities, 2)
    ]

    # Haversine
    hav_start = perf_counter()
    for _ in range(repeats):
        for lat_a, lon_a, lat_b, lon_b in coord_pairs:
            phi_a, lam_a, phi_b, lam_b = map(radians, (lat_a, lon_a, lat_b, lon_b))
            d_phi = phi_b - phi_a
            d_lam = lam_b - lam_a
            h = sin(d_phi/2)**2 + cos(phi_a)*cos(phi_b)*sin(d_lam/2)**2
            angle = 2*atan2(sqrt(h), sqrt(1 - h))
            _ = radius_km * angle
    hav_end = perf_counter()

    # Spherical law of cosines (SLC); the cosine is clamped to [-1, 1] before acos
    slc_start = perf_counter()
    for _ in range(repeats):
        for lat_a, lon_a, lat_b, lon_b in coord_pairs:
            phi_a, lam_a, phi_b, lam_b = map(radians, (lat_a, lon_a, lat_b, lon_b))
            cos_angle = sin(phi_a)*sin(phi_b) + cos(phi_a)*cos(phi_b)*cos(lam_b - lam_a)
            cos_angle = max(-1.0, min(1.0, cos_angle))
            _ = radius_km * acos(cos_angle)
    slc_end = perf_counter()

    return {
        "python_haversine_sec": hav_end - hav_start,
        "python_slc_sec":       slc_end - slc_start,
        "repeats": repeats,
        "pairs": len(coord_pairs),
    }

def _parse_ls_bytes(line): 
    parts = line.split()
    if len(parts) < 8:
        return None, None
    try:
        size = int(parts[4])
    except ValueError:
        return None, None
    return size, parts[-1]

def _parse_du_bytes(line):
    parts = line.split()
    if len(parts) < 2:
        return None, None
    try:
        size = int(parts[0])
    except ValueError:
        return None, None
    return size, parts[-1]

def du_bytes(path):
    """Sum the leading integer field of every line printed by ``hdfs dfs -du`` for *path*.

    Lines with fewer than two fields or a non-integer first field are ignored.
    """
    listing = get_ipython().getoutput(f'hdfs dfs -du "{path}"')
    total_bytes = 0
    for entry in listing:
        fields = entry.split()
        if len(fields) < 2:
            continue
        try:
            total_bytes += int(fields[0])
        except ValueError:
            continue
    return total_bytes
    
def benchmark_spark_distances(cities, radius_km=6368.6, repeats=3):
    """
    Time haversine and spherical-law-of-cosines distances using Spark built-ins only.

    Each timed run forces full execution with an aggregate + collect. The
    city pairs are built once, cached for the timed runs and released again
    before returning.

    Args:
        cities: [(name, lat_deg, lon_deg), ...]
        radius_km: sphere radius used by both formulas (see below).
        repeats: number of timed runs per formula.

    Returns:
        dict with the seconds per formula, the number of pairs and the
        repeat count; None when pyspark cannot be imported or there is no
        active Spark session.

    For the radius:

    The Earth is slightly flattened, so the geocentric
    radius depends on latitude.  For context:

    * equatorial radius = 6,378.137 km;
    * polar radius      = 6,356.752 km

    Across New Zealand's latitudes (≈36–47°S), using the
    WGS-84 ellipsoid, you get roughly:

    Auckland (37°S):       ~6,370.4 km
    Christchurch (43.5°S): ~6,368.0 km
    Dunedin (45.9°S):      ~6,367.2 km
    Wellington (41°S):     ~6,369.0 km
    __________________________________
    mean                  ≈ 6,368.6 km
    """
    try:
        from pyspark.sql import SparkSession, functions as F, types as T
    except ImportError:
        return None  # pyspark is unavailable (e.g. running outside the cluster)

    spark = SparkSession.getActiveSession()
    if spark is None:
        return None

    # Build the pairs once and cache them
    schema = T.StructType([
        T.StructField("city", T.StringType(), False),
        T.StructField("lat",  T.DoubleType(), False),
        T.StructField("lon",  T.DoubleType(), False),
    ])
    df = spark.createDataFrame(cities, schema)
    a, b = df.alias("a"), df.alias("b")
    pairs = (a.join(b, F.col("a.city") < F.col("b.city"))
               .select(F.col("a.lat").alias("lat1"),
                       F.col("a.lon").alias("lon1"),
                       F.col("b.lat").alias("lat2"),
                       F.col("b.lon").alias("lon2"))
               .cache())

    try:
        # Materialise the cache so the timed runs do not include the join
        n_pairs = pairs.count()

        R = F.lit(float(radius_km))
        lat1 = F.radians(F.col("lat1"))
        lat2 = F.radians(F.col("lat2"))
        dlat = lat2 - lat1
        dlon = F.radians(F.col("lon2") - F.col("lon1"))

        # Haversine expression
        a_term = F.sin(dlat/2)**2 + F.cos(lat1)*F.cos(lat2)*F.sin(dlon/2)**2
        c_term = 2*F.atan2(F.sqrt(a_term), F.sqrt(1 - a_term))
        hav = R * c_term

        # SLC expression; the cosine is clamped to [-1, 1] before acos
        cosv = F.sin(lat1)*F.sin(lat2) + F.cos(lat1)*F.cos(lat2)*F.cos(dlon)
        cosv = F.greatest(F.lit(-1.0), F.least(F.lit(1.0), cosv))
        slc = R * F.acos(cosv)

        # Time Haversine
        t0 = perf_counter()
        for _ in range(repeats):
            pairs.select(hav.alias("d")).agg(F.sum("d")).collect()
        t1 = perf_counter()

        # Time SLC
        t2 = perf_counter()
        for _ in range(repeats):
            pairs.select(slc.alias("d")).agg(F.sum("d")).collect()
        t3 = perf_counter()

        return {
            "spark_pairs": n_pairs,
            "spark_repeats": repeats,
            "spark_haversine_sec": t1 - t0,
            "spark_slc_sec":       t3 - t2,
        }
    finally:
        # Release the cached pairs so repeated benchmarks do not accumulate them
        pairs.unpersist()


def list_hdfs_csvgz_files(hdfs_path = WASBS_DATA, debug=False):
    """
    List the .csv.gz files of an HDFS directory as (year, size) tuples.

    The year is the integer file name without its ``.csv.gz`` suffix; entries
    whose name is not an integer are skipped. The size is read from the third
    whitespace-separated field of each ``hdfs dfs -ls`` line.
    NOTE(review): other helpers in this notebook read the size from the fifth
    field -- confirm which layout the cluster's listing really has.

    Parameters
    ----------
    hdfs_path : str
        The HDFS path to list, e.g. 'wasbs://campus-data@...'
    debug : bool, optional
        If True, prints intermediate parsing steps.

    Returns
    -------
    list of tuple
        A list of (year, size) tuples for each .csv.gz file.
    """
    completed = subprocess.run(
        f"hdfs dfs -ls {hdfs_path}", shell=True, capture_output=True, text=True
    )

    rows = []
    for entry in completed.stdout.strip().split("\n"):
        fields = entry.split()
        if debug:
            print("Parts:", fields)
        if len(fields) < 6:
            continue
        try:
            size = int(fields[2])
        except ValueError:
            continue

        file_path = fields[-1]
        if not file_path.endswith(".csv.gz"):
            continue
        try:
            year = int(file_path.split("/")[-1].replace(".csv.gz", ""))
        except ValueError:
            continue
        rows.append((year, size))

    if debug:
        print("_____________________________________________________")
        print("Sample parsed rows:", rows[:5])

    return rows




def explore_hdfs_directory_tree(root_path, max_depth=3, show_sizes=True):
    """
    Explore and visualise any HDFS or WASBS directory tree.
    Works with any file types (not just .parquet).

    This is the single definition of the function. It keeps the behaviour of
    the later of the two definitions the notebook used to contain, which was
    the one in effect.

    Parameters
    ----------
    root_path : str
        HDFS/WASBS path to explore.
    max_depth : int
        Deepest level to list; the root is level 0, so levels
        0..max_depth are listed.
    show_sizes : bool
        Whether to display file sizes in MB.
    """
    console = Console()

    def build_tree(path, tree, depth=0):
        """Add the entries of *path* to *tree*, recursing into directories."""
        if depth > max_depth:
            return

        try:
            result = subprocess.run(
                ["hdfs", "dfs", "-ls", path],
                capture_output=True, text=True, check=True
            )
        except subprocess.CalledProcessError as e:
            tree.add(f"[red]Error accessing {path}: {e}[/red]")
            return
        except OSError as e:
            # e.g. the hdfs executable could not be started
            tree.add(f"[red]Unexpected error: {e}[/red]")
            return

        for line in result.stdout.strip().split("\n"):
            # Skip blank lines and the 'Found N items' header
            if not line or line.startswith("Found"):
                continue

            parts = line.split()
            if len(parts) < 8:
                continue

            perms, size, name = parts[0], parts[4], parts[-1]
            # Last path component; fall back one component for a trailing slash
            item_name = name.split("/")[-1] or name.split("/")[-2]

            if perms.startswith("d"):
                subtree = tree.add(f"[bold cyan]{item_name}/[/bold cyan]")
                build_tree(name, subtree, depth + 1)
            else:
                size_mb = int(size)/(1024*1024) if size.isdigit() else 0
                label = f"{item_name} ({size_mb:.2f} MB)" if show_sizes else item_name
                tree.add(label)

    console.print("=" * 60)
    console.print(f"[bold white]DIRECTORY TREE FOR:[/bold white] [cyan]{root_path}[/cyan]")
    console.print("=" * 60)
    tree = Tree(f"[green]{root_path}[/green]")
    build_tree(root_path, tree)
    console.print(tree)
    console.print("=" * 60)


def list_hdfs_all(hdfs_path):
    """Print the recursive ``hdfs dfs -ls -R`` listing of an HDFS/WASBS path.

    An informational message is printed instead when the command produces no output.
    """
    completed = subprocess.run(
        f"hdfs dfs -ls -R {hdfs_path}", shell=True, capture_output=True, text=True
    )
    listing = completed.stdout.strip()

    if listing:
        print(f"Listing for {hdfs_path}:\n")
        print(listing)
    else:
        print(f"[INFO] No files or directories found in {hdfs_path}")


def build_directory_tree_df(root_path=None, max_depth=3):
    """
    walk an hdfs/wasbs directory tree with `hdfs dfs -ls` and return it as a spark dataframe.

    parameters:
        root_path (str): wasbs path to explore (defaults to WASBS_DATA)
        max_depth (int): deepest level to list (the root is level 0)

    returns:
        spark dataframe with columns: level, path, name, type, size, parent_path
        (directories are recorded with size 0)
    """
    root_path = WASBS_DATA if root_path is None else root_path

    print(f"[info] building directory tree from: {root_path}")
    print(f"[info] max depth: {max_depth}")

    records = []

    def walk(current_path, current_level, parent_path):
        """list current_path, record its entries and descend into directories"""
        if current_level > max_depth:
            return

        try:
            listing = subprocess.run(
                ["hdfs", "dfs", "-ls", current_path],
                capture_output=True,
                text=True,
                check=True
            )

            entries = listing.stdout.strip().split("\n")
            # drop the 'Found N items' header when present
            if entries and entries[0].startswith("Found"):
                del entries[0]

            for entry in entries:
                if not entry.strip():
                    continue

                fields = entry.split()
                if len(fields) < 8:
                    continue

                permissions, size_str, full_path = fields[0], fields[4], fields[-1]

                # last path component, with a fallback for an empty result
                item_name = full_path.rstrip('/').split('/')[-1] or full_path.split('/')[-2]

                is_dir = permissions.startswith('d')
                if is_dir or not size_str.isdigit():
                    size_bytes = 0
                else:
                    size_bytes = int(size_str)

                records.append({
                    "level": current_level,
                    "path": full_path,
                    "name": item_name,
                    "type": "dir" if is_dir else "file",
                    "size": size_bytes,
                    "parent_path": parent_path
                })

                if is_dir and current_level < max_depth:
                    walk(full_path, current_level + 1, current_path)

        except subprocess.CalledProcessError as e:
            print(f"[error] failed to access {current_path}: {e}")
        except Exception as e:
            print(f"[error] unexpected error at {current_path}: {e}")

    # the root has no parent
    walk(root_path, 0, None)

    print(f"[info] collected {len(records)} items from directory tree")

    columns = (
        ("level", T.IntegerType(), False),
        ("path", T.StringType(), False),
        ("name", T.StringType(), False),
        ("type", T.StringType(), False),
        ("size", T.LongType(), False),
        ("parent_path", T.StringType(), True),
    )
    schema = T.StructType([
        T.StructField(col_name, col_type, nullable)
        for col_name, col_type, nullable in columns
    ])

    return spark.createDataFrame(records, schema=schema)


def save_tree_to_parquet(df, output_path):
    """
    save directory tree dataframe to parquet, then list the written output.

    the write and the follow-up `hdfs dfs -ls` listing are handled
    separately, so a problem with the listing is never reported as a
    failed save. errors are printed rather than raised (best-effort).

    parameters:
        df: spark dataframe with tree structure
        output_path: wasbs path for output (should be in WASBS_USER)

    returns:
        None
    """
    print(f"[info] saving tree to: {output_path}")

    # ensure trailing slash
    if not output_path.endswith('/'):
        output_path += '/'

    try:
        df.write.mode("overwrite").parquet(output_path)
    except Exception as e:
        print(f"[error] failed to save tree: {e}")
        return

    print(f"[info] tree saved successfully to: {output_path}")

    # verify with hdfs ls; the data is already written at this point, so a
    # failure here is only a warning about the verification step
    try:
        result = subprocess.run(
            ["hdfs", "dfs", "-ls", output_path],
            capture_output=True,
            text=True
        )
    except OSError as e:
        # raised when the hdfs executable cannot be started at all
        print(f"[warn] could not list saved parquet: {e}")
        return

    if result.returncode != 0:
        print(f"[warn] hdfs ls exited with code {result.returncode}: {result.stderr.strip()}")
        return

    print(f"[info] parquet contents:\n{result.stdout}")


def display_tree_as_text(df, show_sizes=True):
    """
    display directory tree dataframe in text format matching reference pdf.

    rows are grouped by their parent_path and printed depth-first, starting
    from the rows whose parent_path is None (the top-level items). every
    level, including the top level, uses "├── " for all but the last sibling
    and "└── " for the last one, so the vertical guide lines drawn for the
    children always line up with their parent's connector.

    parameters:
        df: spark dataframe with tree structure
            (columns used: level, path, name, type, size, parent_path)
        show_sizes: whether to show file sizes in bytes
    """
    banner = "=" * 70
    print("\n" + banner)
    print("DIRECTORY TREE STRUCTURE")
    print(banner)

    # group rows by parent; ordering by level and path keeps siblings sorted
    children_by_parent = {}
    for row in df.orderBy("level", "path").collect():
        children_by_parent.setdefault(row.parent_path, []).append(row)

    def print_children(parent_path, prefix):
        """recursively print every item below parent_path"""
        children = children_by_parent.get(parent_path, [])
        last_index = len(children) - 1

        for i, child in enumerate(children):
            is_last_child = (i == last_index)
            is_dir = (child.type == "dir")

            # format item name
            item_display = child.name
            if is_dir:
                item_display += "/"
            elif show_sizes and child.size > 0:
                item_display += f" ({child.size})"

            print(prefix + ("└── " if is_last_child else "├── ") + item_display)

            # descendants of a non-last item keep a vertical guide line
            if is_dir:
                print_children(child.path, prefix + ("    " if is_last_child else "│   "))

    # start from root (items with no parent)
    print_children(None, "")

    print(banner + "\n")



___________________________________HELPER / DIAGNOSTIC FUNCTIONS___________________________________


In [6]:

# USE SPARINGLY - these are for diagnostics only
# cleanup=True actually deletes the files; cleanup=False only lists them
# After running this cell once with cleanup=True, set it back to cleanup=False!
# Once the parquet files have been created and are correct, keep cleanup=False for quicker runs.
cleanup_parquet_files(cleanup=False)


_________Clean up existing parquet files_________
[cleanup] Listing files BEFORE cleanup:
Found 2 items
-rw-r--r--   1 dew59 supergroup          0 2025-10-08 08:29 wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59-A2/msd_directory_tree.parquet/_SUCCESS
-rw-r--r--   1 dew59 supergroup        727 2025-10-08 08:29 wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59-A2/msd_directory_tree.parquet/part-00000-7069e22c-78cd-449b-967d-81d921730363-c000.snappy.parquet

[info] To actually delete files, call: cleanup_parquet_files(cleanup=True)


In [7]:
# overall time metric: record when the notebook run started
start_notebook = time.time() 
start_time = datetime.fromtimestamp(start_notebook).strftime("%Y.%m.%d %H:%M")

hprint(f"started at: {start_time}")
# Use the hdfs command to explore the data in Azure Blob Storage:
# list each container path (-ls -h) and report its total size (-du -s -h)
# (earlier form with the full path spelled out, kept for reference)
#!hdfs dfs -ls wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/msd/
!hdfs dfs -ls    -h {WASBS_DATA} 
!hdfs dfs -du -s -h {WASBS_DATA} 
!hdfs dfs -ls    -h {WASBS_USER} 
!hdfs dfs -du -s -h {WASBS_USER} 


___________started at: 2025.10.08 08:43___________
Found 4 items
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/genre
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/main
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/tasteprofile
12.9 G  12.9 G  wasbs://campus-data@madsstorage002.blob.core.windows.net/msd
Found 1 items
drwxr-xr-x   - dew59 supergroup          0 2025-10-08 08:29 wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59-A2/msd_directory_tree.parquet
727  727  wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59-A2


In [9]:
# Measure the total size of the dataset with `hdfs dfs -du -s`
cell_time = time.time()
result = get_ipython().getoutput(f"hdfs dfs -du -s {WASBS_DATA}")

print("Raw result:", result)
print()

# The first whitespace-separated field of the first output line is the size
# in bytes. Fail with a readable message when the command printed nothing,
# or printed something other than a number in that position.
try:
    data_size_bytes = int(result[0].split()[0])
except (IndexError, ValueError) as err:
    raise RuntimeError(
        f"could not parse `hdfs dfs -du -s` output: {result!r}"
    ) from err

print("firstpass size (bytes):", data_size_bytes)
print(f"firstpass size (MB)   : {data_size_bytes / (1024**2):.3f}")
print()

cell_time = time.time() - cell_time
print(f"[time]   Cell time (sec)   : {cell_time:5.2f}")
print(f"[time]   Cell time (min)   : {cell_time/60:5.2f}")

Raw result: ['13854359805  13854359805  wasbs://campus-data@madsstorage002.blob.core.windows.net/msd']

firstpass size (bytes): 13854359805
firstpass size (MB)   : 13212.547

[time]   Cell time (sec)   :  4.83
[time]   Cell time (min)   :  0.08


### Q1 - Directory Tree Structure

In [None]:
# Q1(a) - Get the file structure of the whole MSD dataset
# Dump the recursive listing (hdfs dfs -ls -R) into a local text file
# so it can be inspected and parsed afterwards
get_ipython().system(f'hdfs dfs -ls -R {WASBS_DATA} > data_structure.txt')
print("[info] Saved directory structure to: data_structure.txt")



_______________Q1 - Directory Tree_______________
[status] _SUCCESS check at: wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59-A2/msd_directory_tree.parquet/_SUCCESS -> True
[info] directory tree parquet exists in WASBS_USER
[info] path: wasbs://campus-user@madsstorage002.blob.core.windows.net/dew59-A2/msd_directory_tree.parquet/
[info] reading directory tree from parquet...


                                                                                

[info] loaded 0 items from directory tree

DIRECTORY TREE STRUCTURE


                                                                                



__________________________________________________
name :  MSD Directory Structure
root
 |-- level: integer (nullable = true)
 |-- path: string (nullable = true)
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- size: long (nullable = true)
 |-- parent_path: string (nullable = true)

[check] sample:
[INFO] Converting Spark → pandas for HTML display (rows: 0 )
[INFO] right_align (numeric columns): True
[INFO] Right-aligning numeric columns: ['level', 'size']


level,path,name,type,size,parent_path



_________________Example Queries_________________

[query] all .csv.gz files:
+----+----+----+
|name|size|path|
+----+----+----+
+----+----+----+


[query] audio folder contents:
+-----+----+----+----+
|level|name|type|size|
+-----+----+----+----+
+-----+----+----+----+

[time] cell time (sec): 11.81
[time] cell time (min):  0.20


### Q1(a) - Directory Tree Structure

The MSD dataset has the following structure:

```
└── msd/
    ├── audio/
    │   ├── attributes/
    │   │   ├── msd-jmir-area-of-moments-all-v1.0.attributes.csv
    │   │   ├── msd-jmir-lpc-all-v1.0.attributes.csv
    │   │   ├── msd-jmir-methods-of-moments-all-v1.0.attributes.csv
    │   │   ├── msd-jmir-mfcc-all-v1.0.attributes.csv
    │   │   ├── msd-jmir-spectral-all-v1.0.attributes.csv
    │   │   ├── msd-jmir-spectral-derivatives-all-v1.0.attributes.csv
    │   │   ├── msd-marsyas-timbral-v1.0.attributes.csv
    │   │   ├── msd-mvd-v1.0.attributes.csv
    │   │   ├── msd-rh-v1.0.attributes.csv
    │   │   ├── msd-rp-v1.0.attributes.csv
    │   │   ├── msd-ssd-v1.0.attributes.csv
    │   │   ├── msd-trh-v1.0.attributes.csv
    │   │   └── msd-tssd-v1.0.attributes.csv
    │   ├── features/
    │   │   ├── msd-jmir-area-of-moments-all-v1.0.csv/
    │   │   ├── msd-jmir-lpc-all-v1.0.csv/
    │   │   ├── msd-jmir-methods-of-moments-all-v1.0.csv/
    │   │   ├── msd-jmir-mfcc-all-v1.0.csv/
    │   │   ├── msd-jmir-spectral-all-v1.0.csv/
    │   │   ├── msd-jmir-spectral-derivatives-all-v1.0.csv/
    │   │   ├── msd-marsyas-timbral-v1.0.csv/
    │   │   ├── msd-mvd-v1.0.csv/
    │   │   ├── msd-rh-v1.0.csv/
    │   │   ├── msd-rp-v1.0.csv/
    │   │   ├── msd-ssd-v1.0.csv/
    │   │   ├── msd-trh-v1.0.csv/
    │   │   └── msd-tssd-v1.0.csv/
    │   └── statistics/
    │       └── sample_properties.csv.gz
    ├── genre/
    │   ├── msd-MAGD-genreAssignment.tsv
    │   ├── msd-MASD-styleAssignment.tsv
    │   └── msd-topMAGD-genreAssignment.tsv
    ├── main/
    │   └── summary/
    │       ├── analysis.csv.gz
    │       └── metadata.csv.gz
    └── tasteprofile/
        ├── mismatches/
        │   ├── sid_matches_manually_accepted.txt
        │   └── sid_mismatches.txt
        └── triplets.tsv
```

In [None]:
# Q1(a) continued - Generate and display directory tree PNG
import os
import sys
from IPython.display import Image, display

# Add code directory to path so we can import the module.
# The membership test uses the same absolute path that gets inserted;
# testing the relative string '../code' never matches an absolute entry,
# which would add a duplicate to sys.path every time the cell is re-run.
code_dir = os.path.abspath('../code')
if code_dir not in sys.path:
    sys.path.insert(0, code_dir)

from generate_tree_png import create_tree_png

png_path = 'report/supplementary/msd_directory_tree.png'

# Test if PNG exists, generate if missing
if not os.path.exists(png_path):
    print(f"[info] PNG not found at {png_path}, generating it...")
    create_tree_png()
else:
    print(f"[info] PNG already exists at: {png_path}")

# Display the PNG
print(f"\n[info] Displaying: {png_path}")
display(Image(filename=png_path))

In [None]:
# Q1(b) - Parse the structure file and calculate summary statistics
# start the timer for this cell; the elapsed time is printed at the end of the cell
cell_time = time.time()

hprint("Q1(b) - Summary Statistics")

import re

# Parse hdfs ls -R output to extract size and path
# Every listing line ends with "<size> <date> <time> <path>". The owner and
# group columns are optional: the wasbs listings of the data container omit
# them, so the columns cannot be addressed by a fixed index.
_LS_LINE_RE = re.compile(
    r"^(?P<permissions>\S+)\s+\S+\s+"        # permissions, replication
    r"(?:\S+\s+\S+\s+)?"                      # optional owner and group
    r"(?P<size>\d+)\s+"                       # size in bytes
    r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+"     # date, time
    r"(?P<path>.+)$"                          # path (may contain spaces)
)


def parse_ls_line(line):
    """Parse a single line from hdfs ls -R output.

    Accepted formats (owner and group are optional):
        permissions replication user group size date time path
        permissions replication size date time path
    Example: -rw-r--r--   3 hdfs supergroup   1051 2024-01-15 10:30 /path/to/file

    Args:
        line (str): one line of listing output.

    Returns:
        dict | None: {'size': int, 'path': str, 'is_dir': bool}, with size 0
        for directories, or None when the line is not a listing entry
        (for example a "Found N items" header or a blank line).
    """
    match = _LS_LINE_RE.match(line.strip())
    if match is None:
        return None

    is_dir = match.group("permissions").startswith('d')
    return {
        'size': 0 if is_dir else int(match.group("size")),
        'path': match.group("path"),
        'is_dir': is_dir,
    }

# Load the saved listing and tally directories, files and total file size
try:
    with open("data_structure.txt", 'r') as listing:
        lines = listing.readlines()

    file_count = 0
    dir_count = 0
    total_size = 0

    for raw_line in lines:
        entry = parse_ls_line(raw_line.strip())
        # lines that are not listing entries are ignored
        if not entry:
            continue
        if entry['is_dir']:
            dir_count += 1
            continue
        # only files contribute to the size total
        file_count += 1
        total_size += entry['size']

    size_gb = total_size / (1024**3)
    print(f"\n[summary] directories: {dir_count}")
    print(f"[summary] files: {file_count}")
    print(f"[summary] total size: {total_size:,} bytes ({size_gb:.2f} GB)")

except FileNotFoundError:
    print("[error] data_structure.txt not found. Please run the previous cell first.")
except Exception as e:
    print(f"[error] Failed to parse structure file: {e}")

# report how long the whole cell took
cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")

In [None]:
# Run this cell before closing the notebook; otherwise kill your Spark application by hand using the link in the Spark UI
 
stop_spark()