### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pyspark

from pyspark import SparkContext
from pyspark.sql import SparkSession


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())

global azure_account_name
global azure_data_container_name
global azure_user_container_name
global azure_user_token

azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"
azure_user_token = r"sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """
    html = []
    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')
    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():
        name = sc.getConf().get("spark.app.name")
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else: 
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
    
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """
    global spark
    global sc
    if 'spark' in globals() and 'sc' in globals():
        spark.stop()
        del spark
        del sc
    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

NameError: name 're' is not defined

### -  –––––––––––––––––––– Assignment 2 BEGINS HERE ––––––––––––––––––––- - ###

- MSD containers:
  - `wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/` 

- MY containers:
  - `wasbs://campus-user@madsstorage002.blob.core.windows.net/`

In [None]:
# My Imports

# group 1: from imports (alphabetical by module)
from collections         import Counter
from datetime            import datetime
from IPython.display     import display, HTML, Image
from math                import acos, atan2, cos, radians, sin, sqrt
from matplotlib.ticker   import FuncFormatter, MaxNLocator
from pathlib             import Path
from pyspark.sql         import DataFrame, DataFrame as SparkDF
from pyspark.sql         import functions as F, types as T
from pyspark.sql.types   import *
from pyspark.sql.utils   import AnalysisException
from pyspark.sql.window  import Window
from rich.console        import Console
from rich.tree           import Tree
from time                import perf_counter
from typing              import List, Optional, Tuple

# group 2: import ... as ... (alphabetical)
import itertools         as it
import matplotlib.dates  as mdates
import matplotlib.pyplot as plt
import numpy             as np
import pandas            as pd

# group 3: import statements (alphabetical)
import json
import math
import os
import platform
import random
import re
import subprocess
import sys
import time
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
console = Console()

#The following shows the data structure

In [None]:
# overall time metric
notebook_run_time = time.time()

# Use the hdfs command to explore the data in Azure Blob Storage
#USERNAME    = "dew59"
WASBS_DATA  = "wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/"
WASBS_USER  = f"wasbs://campus-user@madsstorage002.blob.core.windows.net/{username}-A2/"

#WASBS_USER          = "wasbs://campus-user@madsstorage002.blob.core.windows.net/{}".format(USERNAME)
#WASBS_YEAR_SIZE     = "{}/years_size_metrics.parquet/".format(WASBS_USER)

 
#stations_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/{stations_write_path}'
#common_data_path    = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/'
#stations_read_name  =  inventory_read_name = ""
#stations_read_name  =  inventory_read_name = ""
 

print("Spark:", spark.version)
print("_" * 35 + "PATHS" + "_" * 35)
print("WASBS_DATA          :", WASBS_DATA)
print("WASBS_USER          :", WASBS_USER) 
print()

In [None]:
# HELPER AND DIAGNOSTIC FUNCTIONS

notebook_run_time = time.time()
print("_" * 35 + "HELPER / DIAGNOSTIC FUNCTIONS" + "_" * 35)

def hprint(text: str="", l=50):
    """Print formatted section header"""
    if len(text) > 0: 
        text = f" {text} "
    n = len(text)
    n = abs(n - l) // 2
    print("\n" + "–" * n + text + "–" * n)

def cleanup_parquet_files(cleanup=False):
    """Clean up existing parquet files in user directory.
    
    Args:
        cleanup (bool): When True, actually DELETES FILES. 
                        When False, only LISTS files.
    """
    hprint("Clean up existing parquet files")

    print("[cleanup] Listing files BEFORE cleanup:")
    get_ipython().system(f'hdfs dfs -ls {WASBS_USER}/*.parquet')
    
    if cleanup:
        print("\n[cleanup] Deleting all parquet folders...")
        get_ipython().system(f'hdfs dfs -rm -r -f {WASBS_USER}/*.parquet')
        
        print("\n[info] Listing files AFTER cleanup:")
        get_ipython().system(f'hdfs dfs -ls {WASBS_USER}/*.parquet')
        print("\n[cleanup] Parquet file cleanup complete - ready to restart Processing run with clean schema")

    else:
        print("\n[info] To actually delete files, call: cleanup_parquet_files(cleanup=True)")

def normalise_ids(df: DataFrame, col: str = "ID") -> DataFrame:
    """
    # Single source of truth for ID normalisation 
    Upper + trim + distinct on the given ID column.
    """
    print(f"[INFO] normalise_ids() on column: {col}")
    df.printSchema()
    df.show(20)
    return df.select(F.upper(F.trim(F.col(col))).alias("ID")).distinct()
    df.printSchema()
    df.show(20)

def show_df(df, n=10, name="DataFrame", max_width=None, right_align=False, left_trim=True, total_info=True):
    """
    Enhanced DataFrame display function with multiple formatting options.
    
    Args:
        df: Spark DataFrame to display
        n (int): Number of rows to show (default: 10)
        name (str): Name/title for the DataFrame
        max_width (int): Maximum column width for truncation
        right_align (bool): Whether to right-align numeric columns
        left_trim (bool): Whether to trim whitespace from string columns
        total_info (bool): Whether to show total row count info
    """
    if df is None:
        print(f"[show_df] {name}: DataFrame is None")
        return
    
    try:
        # Get total count if requested
        if total_info:
            total_rows = df.count()
            print(f"[show_df] {name}: Showing {min(n, total_rows)} of {total_rows} rows")
        else:
            print(f"[show_df] {name}: Showing {n} rows")
        
        # Configure display options
        if max_width:
            df.show(n, truncate=max_width)
        else:
            df.show(n, truncate=False)
            
        # Show schema info
        print(f"[schema] {len(df.columns)} columns: {', '.join(df.columns[:5])}{' ...' if len(df.columns) > 5 else ''}")
        
    except Exception as e:
        print(f"[show_df] Error displaying {name}: {str(e)}")

def benchmark_function(func, *args, **kwargs):
    """
    Benchmark a function's execution time.
    
    Args:
        func: Function to benchmark
        *args: Arguments to pass to the function
        **kwargs: Keyword arguments to pass to the function
    
    Returns:
        tuple: (result, execution_time_seconds)
    """
    start_time = perf_counter()
    try:
        result = func(*args, **kwargs)
        end_time = perf_counter()
        execution_time = end_time - start_time
        
        print(f"[benchmark] {func.__name__}: {execution_time:.3f} seconds")
        return result, execution_time
        
    except Exception as e:
        end_time = perf_counter()
        execution_time = end_time - start_time
        print(f"[benchmark] {func.__name__} FAILED after {execution_time:.3f} seconds: {str(e)}")
        raise

def save_to_parquet(df, path: str, mode: str = "overwrite", check_exists: bool = True) -> bool:
    """
    Save DataFrame to parquet with enhanced error handling and timing.
    
    Args:
        df: Spark DataFrame to save
        path (str): HDFS path to save to
        mode (str): Write mode ('overwrite', 'append', etc.)
        check_exists (bool): Whether to check if path already exists
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        start_time = perf_counter()
        
        if check_exists and has_parquet(path):
            print(f"[save_parquet] Path already exists: {path}")
            return False
        
        print(f"[save_parquet] Writing to: {path}")
        df.write.mode(mode).parquet(path)
        
        end_time = perf_counter()
        elapsed = end_time - start_time
        
        print(f"[save_parquet] Completed in {elapsed:.2f} seconds")
        return True
        
    except Exception as e:
        print(f"[save_parquet] Error: {str(e)}")
        return False

def has_parquet(dir_as_path: str) -> bool:
    path   = _normalise_dir( dir_as_path)
    marker = path + '_SUCCESS'
    #print("\n[check] dir_path:", dir_path)
    #print("\n[check] path    :", path)
    print("\n[check] marker  :", marker)
    rc = os.system(f'hdfs dfs -test -e "{marker}"')
    print("[check] rc:", rc, "->", ("exists" if rc == 0 else "missing"))
    return (rc == 0)

def _to_spark(df_like, schema=None):
    """
    Return a Spark DataFrame  .
    """
    if isinstance(df_like, SparkDF):
        return df_like
    return spark.createDataFrame(df_like, schema=schema) if schema else spark.createDataFrame(df_like)

def ensure_dir(path: str) -> str:
    """
    ensures that path is a path 
    and not representing a file;
    add trailing slash if needed
    """
    if path is None:
        raise ValueError("Path is None")
    path = _normalise_dir(path)
#   print("ensure_dir -> ",path)
    return path

def _normalise_dir(s: str) -> str:
    """
    Ensure trailing slash so we point to
    the directory rather than the prefix.
    """
    if not s.endswith("/"):
        s += "/"
    return s

def write_parquet(df, path: str, mode: str = "overwrite") -> None:
    funct_time = time.time()
    print(f"Writing to parquet: {path}")
    df.write.mode(mode).parquet(path)
    funct_time = time.time() - funct_time
    print(f"[time] write_parquet (min)   : {funct_time/60:5.2f}")
    print(f"[time] write_parquet (sec)   : {funct_time:5.2f}")

In [None]:

# USE SPARINGLY - these are for diagnostics only
# Set cleanup=True to actually delete files, or False to just list them 
# LEAVE cleanup=False after running this cell once! 
# if they have been created and are correct, change cleanup=False for quicker runs. 
cleanup_parquet_files(cleanup=False)

In [None]:
# overall time metric
start_notebook = time.time() 
start_time = datetime.fromtimestamp(start_notebook).strftime("%Y.%m.%d %H:%M")

hprint(f"started at: {start_time}")
# Use the hdfs command to explore the data in Azure Blob Storage
#!hdfs dfs -ls wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/msd/
!hdfs dfs -ls    -h {WASBS_DATA} 
!hdfs dfs -du -s -h {WASBS_DATA} 
!hdfs dfs -ls    -h {WASBS_USER} 
!hdfs dfs -du -s -h {WASBS_USER} 

In [None]:
cell_time = time.time() 
result = get_ipython().getoutput(f"hdfs dfs -du -s {WASBS_DATA}") 

print("Raw result:", result)
print()
data_size_bytes = int(result[0].split()[0])
print("firstpass size (bytes):", data_size_bytes)
print(f"firstpass size (MB)   : {data_size_bytes / (1024**2):.3f}")
 
lines = get_ipython().getoutput(f"hdfs dfs -ls {WASBS_DATA}")
print()
#other_size_bytes = 0
#for line in lines:
#    parts = line.split()
#    if len(parts) >= 6 and parts[0].startswith('-'):   # file, not directory
#        size = int(parts[2])                           # file size is parts[2] in your env
#        other_size_bytes += size
#print()
#print("_____________________________________________________") 
#print(f"[result] daily size (bytes): {daily_size_bytes:,d}")
#print(f"[result] daily size (MB)   : {daily_size_bytes / (1024**2):.2f}")
#print(f"[result] meta-data (bytes) : {other_size_bytes:,d}")
#print(f"[result] meta-data (MB)    : {other_size_bytes / (1024**2):.2f}")


cell_time = time.time() - cell_time
print(f"[time]   Cell time (sec)   : {cell_time:5.2f}") 
print(f"[time]   Cell time (min)   : {cell_time/60:5.2f}") 

### Q1 - Directory Tree Structure

In [None]:
# Q1(a) - Get the file structure and display directory tree

png_path = '../report/supplementary/msd_directory_tree.png'

# ensure directory exists
os.makedirs(os.path.dirname(png_path), exist_ok=True)

# check if png already exists
if os.path.exists(png_path):
    print(f"[info] directory tree image exists, skipping generation")
    print(f"[display] reading from disk: {png_path}")
else:
    print(f"[info] directory tree image not found, generating...")
    
    # build directory tree dataframe
    tree_df = build_directory_tree_df(WASBS_DATA, max_depth=3)
    
    # save to parquet in wasbs_user
    tree_parquet_path = f"{WASBS_USER}msd_directory_tree.parquet/"
    tree_df.write.mode("overwrite").parquet(tree_parquet_path)
    print(f"[saved] tree dataframe: {tree_parquet_path}")
    
    # create rich console visualisation and save as png
    try:
        # create rich tree visualisation
        console_tree = Console(record=True, width=120)
        tree = Tree(f"[green]{WASBS_DATA}[/green]")
        
        # build tree from dataframe
        tree_data = tree_df.orderBy("level", "path").collect()
        path_to_node = {None: tree}
        
        for row in tree_data:
            parent_node = path_to_node.get(row.parent_path, tree)
            if row.type == "dir":
                node = parent_node.add(f"[bold cyan]{row.name}/[/bold cyan]")
                path_to_node[row.path] = node
            else:
                size_mb = row.size / (1024**2) if row.size > 0 else 0
                parent_node.add(f"{row.name} ({size_mb:.2f} MB)")
        
        # export to svg then convert to png
        console_tree.print(tree)
        svg_output = console_tree.export_svg(title="MSD Directory Tree")
        
        # save svg temporarily
        svg_path = png_path.replace('.png', '.svg')
        with open(svg_path, 'w') as f:
            f.write(svg_output)
        
        # convert svg to png using cairosvg or imagemagick
        try:
            import cairosvg
            cairosvg.svg2png(url=svg_path, write_to=png_path, dpi=150)
        except ImportError:
            # fallback: use matplotlib to create simple tree visualisation
            fig, ax = plt.subplots(figsize=(14, 10))
            ax.axis('off')
            tree_text = "\n".join([f"{'  ' * row.level}{row.name}{'/' if row.type == 'dir' else ''}" 
                                   for row in tree_data[:50]])  # limit to 50 items
            ax.text(0.05, 0.95, tree_text, fontsize=8, family='monospace', va='top')
            plt.savefig(png_path, bbox_inches='tight', dpi=150)
            plt.close()
        
        print(f"[saved] directory tree image: {png_path}")
        
    except Exception as e:
        print(f"[error] failed to generate tree image: {e}")
        print(f"[fallback] creating simple text-based image...")
        
        # simple fallback: create text-based visualisation
        fig, ax = plt.subplots(figsize=(14, 10))
        ax.axis('off')
        ax.text(0.5, 0.5, "Directory tree visualisation\n(see parquet file for details)", 
                ha='center', va='center', fontsize=12)
        plt.savefig(png_path, bbox_inches='tight', dpi=150)
        plt.close()
        print(f"[saved] fallback image: {png_path}")

# always display from disk (whether just created or already existed)
if os.path.exists(png_path):
    print(f"[display] showing directory tree from: {png_path}")
    display(Image(filename=png_path))
else:
    print(f"[warning] directory tree image not available")

In [None]:
# Q1(b) - Parse the structure file and calculate summary statistics
cell_time = time.time()

hprint("Q1(b) - Summary Statistics")

import re

# Parse hdfs ls -R output to extract size and path
def parse_ls_line(line):
    """Parse a single line from hdfs ls -R output"""
    # Format: permissions replication user group size date time path
    # Example: -rw-r--r--   3 hdfs supergroup   1051 2024-01-15 10:30 /path/to/file
    parts = line.split()
    if len(parts) < 8:
        return None
    
    permissions = parts[0]
    size_str = parts[4]
    path = parts[-1]
    
    # Only process files (not directories)
    if not permissions.startswith('d'):
        try:
            size = int(size_str)
            return {'size': size, 'path': path, 'is_dir': False}
        except ValueError:
            return None
    else:
        return {'size': 0, 'path': path, 'is_dir': True}
    
    return None

# Read and parse the data structure file
try:
    with open("data_structure.txt", 'r') as f:
        lines = f.readlines()
    
    file_count = 0
    dir_count = 0
    total_size = 0
    
    for line in lines:
        parsed = parse_ls_line(line.strip())
        if parsed:
            if parsed['is_dir']:
                dir_count += 1
            else:
                file_count += 1
                total_size += parsed['size']
    
    print(f"\n[summary] directories: {dir_count}")
    print(f"[summary] files: {file_count}")
    print(f"[summary] total size: {total_size:,} bytes ({total_size/(1024**3):.2f} GB)")
    
except FileNotFoundError:
    print("[error] data_structure.txt not found. Please run the previous cell first.")
except Exception as e:
    print(f"[error] Failed to parse structure file: {e}")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")

In [None]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI
 
#stop_spark()

---

## Q2 - Exploring the Audio Dataset

In this section, we will examine the audio feature datasets, understand their schemas, and develop a systematic approach to working with their column names.


### Q2(a) - Load Audio Feature Attribute Names and Types

The audio feature datasets are stored in two locations:
- **Attributes directory**: Contains CSV files defining column names and data types
- **Features directory**: Contains the actual feature data (partitioned CSV directories)

Each attribute file follows the format: `attribute_name,type`

We'll examine these attribute files to understand how they can be used to define schemas for loading the feature datasets.


In [None]:
# Q2(a) - Examine attribute files to understand column naming
cell_time = time.time()

hprint("Q2(a) - Audio Feature Attributes Analysis")

from pyspark.sql.types import *

# List all audio feature datasets
audio_datasets = [
    'msd-jmir-area-of-moments-all-v1.0',
    'msd-jmir-lpc-all-v1.0',
    'msd-jmir-methods-of-moments-all-v1.0',
    'msd-jmir-mfcc-all-v1.0',
    'msd-jmir-spectral-all-all-v1.0',
    'msd-jmir-spectral-derivatives-all-all-v1.0',
    'msd-marsyas-timbral-v1.0',
    'msd-mvd-v1.0',
    'msd-rh-v1.0',
    'msd-rp-v1.0',
    'msd-ssd-v1.0',
    'msd-trh-v1.0',
    'msd-tssd-v1.0'
]

print(f"[info] Found {len(audio_datasets)} audio feature datasets\n")

# Function to read and parse attribute file
def load_attribute_file(dataset_prefix):
    """
    Load attribute names and types from an attribute CSV file.
    
    Args:
        dataset_prefix: Name of the dataset (e.g., 'msd-jmir-lpc-all-v1.0')
    
    Returns:
        List of tuples: [(attribute_name, attribute_type), ...]
    """
    attr_path = f"{WASBS_DATA}/audio/attributes/{dataset_prefix}.attributes.csv"
    
    try:
        # Read attribute file as text
        attr_rdd = spark.sparkContext.textFile(attr_path)
        attributes = []
        
        for line in attr_rdd.collect():
            parts = line.strip().split(',')
            if len(parts) >= 2:
                attr_name = parts[0]
                attr_type = parts[1].lower()
                attributes.append((attr_name, attr_type))
            elif len(parts) == 1:
                # Some files might have just the attribute name
                attributes.append((parts[0], 'string'))
        
        return attributes
    
    except Exception as e:
        print(f"[error] Failed to load {dataset_prefix}: {e}")
        return []

# Load attributes for all datasets
print("[info] Loading attribute files...")
all_attributes = {}

for dataset in audio_datasets:
    attrs = load_attribute_file(dataset)
    all_attributes[dataset] = attrs
    print(f"[loaded] {dataset}: {len(attrs)} attributes")

print(f"\n[summary] Loaded attribute information for {len(all_attributes)} datasets")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(a) continued - Display sample column names from each dataset
cell_time = time.time()

hprint("Sample Column Names from Each Dataset")

print("Examining actual column names to understand naming patterns:\n")
print("="*80)

for dataset, attributes in all_attributes.items():
    if attributes:
        print(f"\n[dataset] {dataset}")
        print(f"[count]   {len(attributes)} columns")
        print(f"[sample]  First 5 columns:")
        
        # Show first 5 column names
        for i, (name, dtype) in enumerate(attributes[:5], 1):
            # Truncate long names for display
            display_name = name if len(name) <= 60 else name[:57] + "..."
            print(f"  {i}. {display_name:60s} ({dtype})")
        
        if len(attributes) > 5:
            print(f"  ... ({len(attributes) - 5} more columns)")

print("\n" + "="*80)

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(a) continued - Analyze column name characteristics
cell_time = time.time()

hprint("Column Name Characteristics Analysis")

print("Analyzing column name patterns for Q2(c) discussion:\n")

# Collect statistics about column names
all_column_names = []
column_lengths = []
data_types_used = Counter()

for dataset, attributes in all_attributes.items():
    for name, dtype in attributes:
        all_column_names.append(name)
        column_lengths.append(len(name))
        data_types_used[dtype] += 1

# Calculate statistics
avg_length = sum(column_lengths) / len(column_lengths) if column_lengths else 0
max_length = max(column_lengths) if column_lengths else 0
min_length = min(column_lengths) if column_lengths else 0

print(f"[total columns] {len(all_column_names)} across all datasets")
print(f"\n[column name length statistics]")
print(f"  Average: {avg_length:.1f} characters")
print(f"  Maximum: {max_length} characters")
print(f"  Minimum: {min_length} characters")

print(f"\n[data types used]")
for dtype, count in sorted(data_types_used.items()):
    print(f"  {dtype:10s}: {count:4d} columns ({count/len(all_column_names)*100:.1f}%)")

# Find longest column names
print(f"\n[longest column names (top 10)]")
name_length_pairs = [(name, len(name)) for name in all_column_names]
name_length_pairs.sort(key=lambda x: x[1], reverse=True)

for i, (name, length) in enumerate(name_length_pairs[:10], 1):
    display_name = name if length <= 70 else name[:67] + "..."
    print(f"  {i:2d}. {display_name:70s} ({length} chars)")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(a) continued - Check for column name collisions
cell_time = time.time()

hprint("Column Name Collision Detection")

print("Checking if any column names appear in multiple datasets...")
print("(CRITICAL for Q2(c) discussion and Q2(d) renaming strategy)\n")

from collections import defaultdict

# Build a dictionary: column_name -> [list of datasets that have it]
column_to_datasets = defaultdict(list)

for dataset, attributes in all_attributes.items():
    for name, dtype in attributes:
        column_to_datasets[name].append(dataset)

# Find columns that appear in multiple datasets
collisions = {name: datasets for name, datasets in column_to_datasets.items() 
              if len(datasets) > 1}

if collisions:
    print(f"[ALERT] Found {len(collisions)} column names that appear in multiple datasets!\n")
    print("="*80)
    
    # Sort by number of occurrences (most common first)
    sorted_collisions = sorted(collisions.items(), key=lambda x: len(x[1]), reverse=True)
    
    # Show top 20 most common collisions
    print(f"\nTop {min(20, len(sorted_collisions))} most common colliding column names:\n")
    
    for i, (col_name, datasets) in enumerate(sorted_collisions[:20], 1):
        print(f"{i:2d}. '{col_name}' appears in {len(datasets)} datasets:")
        for ds in datasets:
            # Abbreviate dataset name for display
            ds_abbr = ds.replace('msd-', '').replace('-all-v1.0', '')
            print(f"    - {ds_abbr}")
        print()
        
    if len(sorted_collisions) > 20:
        print(f"... and {len(sorted_collisions) - 20} more colliding column names")
    
    print("="*80)
    print(f"\n[CONCLUSION] WITHOUT renaming, merging datasets would cause column name conflicts!")
    print(f"[CONCLUSION] This demonstrates the NEED for systematic column renaming (Q2d)")
    
else:
    print(f"[OK] No column name collisions detected")
    print(f"[OK] However, descriptive prefixes would still improve clarity when merging")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(a) continued - Focus on the 4 required datasets for Audio Similarity
cell_time = time.time()

hprint("Required Datasets Analysis (for Audio Similarity)")

# These are the 4 datasets required for Audio Similarity Q1
required_datasets = [
    'msd-jmir-area-of-moments-all-v1.0',
    'msd-jmir-lpc-all-v1.0',
    'msd-jmir-spectral-all-all-v1.0',
    'msd-marsyas-timbral-v1.0'
]

print("The Audio Similarity section requires merging these 4 specific datasets:\n")

total_columns = 0
for dataset in required_datasets:
    if dataset in all_attributes:
        col_count = len(all_attributes[dataset])
        total_columns += col_count
        print(f"[{dataset}]")
        print(f"  Columns: {col_count}")
        print()

print(f"[TOTAL] {total_columns} columns after merging (excluding MSD_TRACKID)")
print(f"[NOTE]  Plus 1 MSD_TRACKID column = {total_columns + 1} total columns\n")

# Check for collisions among just these 4 datasets
print("Checking for collisions among these 4 required datasets...")
required_column_to_datasets = defaultdict(list)

for dataset in required_datasets:
    if dataset in all_attributes:
        for name, dtype in all_attributes[dataset]:
            required_column_to_datasets[name].append(dataset)

required_collisions = {name: datasets for name, datasets in required_column_to_datasets.items() 
                       if len(datasets) > 1}

if required_collisions:
    print(f"\n[ALERT] Found {len(required_collisions)} collisions among the 4 required datasets:")
    for col_name, datasets in sorted(required_collisions.items()):
        print(f"  - '{col_name}' in: {', '.join([d.replace('msd-', '').replace('-all-v1.0', '').replace('-all-all-v1.0', '') for d in datasets])}")
else:
    print(f"\n[OK] No collisions among the 4 required datasets")
    print(f"[INFO] However, renaming is still recommended for clarity and consistency")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(a) continued - Create summary table for report
cell_time = time.time()

hprint("Summary Table - Audio Feature Datasets")

# Create a summary DataFrame
summary_data = []
for dataset in audio_datasets:
    if dataset in all_attributes:
        attrs = all_attributes[dataset]
        col_count = len(attrs)
        
        # Calculate average column name length for this dataset
        lengths = [len(name) for name, _ in attrs]
        avg_len = sum(lengths) / len(lengths) if lengths else 0
        max_len = max(lengths) if lengths else 0
        
        # Count data types
        types = [dtype for _, dtype in attrs]
        type_counts = Counter(types)
        
        summary_data.append({
            'Dataset': dataset.replace('msd-', '').replace('-v1.0', ''),
            'Columns': col_count,
            'Avg Name Length': f"{avg_len:.0f}",
            'Max Name Length': max_len,
            'String': type_counts.get('string', 0),
            'Real': type_counts.get('real', 0),
            'Numeric': type_counts.get('numeric', 0)
        })

summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('Columns', ascending=False)

print("\nAudio Feature Dataset Summary:\n")
print(summary_df.to_string(index=False))

print(f"\n[KEY OBSERVATIONS for Q2(c)]")
print(f"  1. Column names average {avg_length:.0f} characters - quite long for practical use")
print(f"  2. Longest column name is {max_length} characters - very cumbersome")
print(f"  3. Total of {len(all_column_names)} columns across 13 datasets")
if collisions:
    print(f"  4. {len(collisions)} column names appear in multiple datasets - collision risk!")
print(f"  5. Dominant data types: {', '.join([f'{k}({v})' for k,v in data_types_used.most_common(3)])}")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


## Q2(b) - Automatic StructType Creation

In this section, we implement a function to automatically generate Spark `StructType` schemas from attribute files. This eliminates manual schema definition and ensures consistency across all 13 audio feature datasets.

In [None]:
# Q2(b) - Implement automatic StructType generation from attributes
cell_time = time.time()

hprint("Q2(b) - Schema Inference with inferSchema=True")

print("[info] using inferSchema=True approach for automatic schema detection")
print("[info] spark will analyze data to determine column types automatically")

# test loading one dataset with inferSchema
print("\n[test] loading msd-jmir-area-of-moments-all-v1.0 with inferSchema=True:")
aom_path = f"{WASBS_DATA}audio/features/msd-jmir-area-of-moments-all-v1.0/"
aom_df = spark.read.csv(aom_path, header=False, inferSchema=True)
print(f"[result] dataset loaded with {len(aom_df.columns)} columns and inferred schema")
print("\n[sample] inferred schema (first 5 fields):")
for i, field in enumerate(aom_df.schema.fields[:5], 1):
    print(f"  {i}. {field.name} ({field.dataType})")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")

In [None]:
# Q2(b) continued - Generate schemas for all 4 required datasets
cell_time = time.time()

print("\n____Generating Schemas for 4 Required Datasets____\n")

# dataset names for audio similarity section
required_datasets = [
    'msd-jmir-area-of-moments-all-v1.0',
    'msd-jmir-lpc-all-v1.0',
    'msd-jmir-spectral-all-all-v1.0',
    'msd-marsyas-timbral-v1.0'
]

# load datasets with inferSchema to get actual schemas
schemas = {}
dataframes = {}
for dataset_name in required_datasets:
    file_path = f"{WASBS_DATA}audio/features/{dataset_name}/"
    df = spark.read.csv(file_path, header=False, inferSchema=True)
    schemas[dataset_name] = df.schema
    dataframes[dataset_name] = df
    short_name = dataset_name.replace('msd-jmir-', '').replace('msd-marsyas-', '')
    print(f"[loaded] {short_name}: {len(df.schema.fields)} fields, {df.count()} rows")

print(f"\n[summary] successfully loaded all 4 required datasets with inferred schemas")
print(f"[total fields] {sum(len(s.fields) for s in schemas.values())} across all datasets")

# display example schema structure
print("\n[example] msd-marsyas-timbral-v1.0 schema (first 10 fields):")
timb_schema = schemas['msd-marsyas-timbral-v1.0']
for i, field in enumerate(timb_schema.fields[:10], 1):
    print(f"  {i:2d}. StructField('{field.name}', {field.dataType}, True)")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


## Q2(c) - Discussion: Column Naming Advantages and Disadvantages

### Current Column Naming Characteristics

Based on our analysis in Q2(a), the audio feature datasets use descriptive, self-documenting column names that follow a consistent pattern. For example, names like `Method_of_Moments_Overall_Standard_Deviation_1` and `Spectral_Centroid_Overall_Average` clearly indicate the feature type, statistical measure, and variant. Our analysis revealed:

- **Average column name length:** 18 characters
- **Maximum column name length:** 108 characters
- **Total columns across 13 datasets:** 3,929 columns
- **Columns in 4 required datasets:** 184 columns (plus 1 MSD_TRACKID)
- **Column name collision detected:** `MSD_TRACKID` appears in 3 different datasets

### Advantages of Current Column Names

The existing descriptive naming convention offers several important benefits:

1. **Self-Documenting:** Names like `Zero_Crossings_Overall_Standard_Deviation` immediately convey the feature's meaning without requiring external documentation. This transparency helps researchers understand what each feature represents and how it was calculated.

2. **Traceability:** The naming convention maintains clear links to the original research papers and feature extraction methods. For example, the `MoM_` prefix directly references "Method of Moments" calculations, allowing researchers to trace features back to specific audio analysis techniques.

3. **Prevents Ambiguity:** The detailed names eliminate confusion when working with multiple datasets. Different statistical measures (mean, standard deviation, minimum, maximum) and different feature types are clearly distinguished, reducing the risk of accidentally using the wrong feature in analysis.

4. **Dataset Integrity:** The descriptive names preserve the original research context, ensuring that feature interpretations remain consistent with the published methodologies used to create the Million Song Dataset.

### Disadvantages of Current Column Names

Despite these advantages, the descriptive naming convention presents significant practical challenges for machine learning workflows:

1. **Excessive Length:** With an average of 18 characters and maximum of 108 characters, column names become unwieldy. The longest names like `Mean_Acc5_Mean_Mem20_PeakRatio_Average_Chroma_A_Power_powerFFT_WinH...` (truncated at 108 chars) are impractical for typing, displaying in tables, and referencing in code. This verbosity slows development and makes code harder to read.

2. **Not ML-Friendly:** Machine learning libraries often work more efficiently with shorter, consistent column identifiers. Long names increase memory overhead in model metadata, complicate feature importance visualizations, and make serialized models larger. Many ML tools expect compact feature names for optimal performance.

3. **Collision Risk When Merging:** Our analysis identified that `MSD_TRACKID` appears in multiple datasets (area-of-moments, lpc, and spectral-all). When merging these datasets for the Audio Similarity section, we must handle this collision explicitly. While this is the only collision detected among the 4 required datasets, it demonstrates the risk of assuming uniqueness with descriptive names.

4. **Inconsistent Patterns Across Datasets:** While individual datasets follow internal conventions, the 13 datasets use varying naming patterns. The `marsyas-timbral` dataset uses different conventions than the `jmir` datasets, making it difficult to write generic processing code that works uniformly across all datasets.

### Conclusion

While the descriptive column names preserve valuable semantic information and research context, their excessive length and collision potential make them impractical for machine learning workflows. A systematic renaming strategy is essential to balance interpretability with usability. The ideal solution should: (1) create unique, short identifiers suitable for ML algorithms, (2) eliminate collision risks when merging datasets, (3) maintain traceability through a mapping table that preserves the original descriptive names, and (4) apply consistently across all datasets. This approach allows us to work efficiently with compact column names while preserving the ability to interpret results using the original descriptive terminology.

## Q2(d) - Systematic Column Renaming Implementation

### Renaming Strategy: 2-Letter + 3-Digit Convention

To address the limitations identified in Q2(c), we implement a systematic renaming convention using **2-letter dataset codes + 3-digit zero-padded numbers** (format: `{AA}{NNN}`). This approach provides:

- **Fixed length:** All feature names are exactly 5 characters
- **Uniqueness:** Each dataset receives a distinct prefix (AO, LP, SP, TI)
- **Collision elimination:** The prefix system ensures no overlapping names across datasets
- **ML-friendly:** Short, consistent identifiers optimize performance
- **Scalability:** 3 digits support up to 999 features per dataset (current max is 125)

**Dataset Code Mapping:**
- `AO` = Area-Of-moments (21 features)
- `LP` = LPC (21 features)  
- `SP` = SPectral-all (17 features)
- `TI` = TImbral/marsyas (125 features)

**Special handling:** `MSD_TRACKID` remains unchanged as it serves as the common join key across all datasets.

In [None]:
# Q2(d) - Column Renaming Implementation (uses helper function from Cell 8)
cell_time = time.time()

hprint("Q2(d) - Column Renaming Implementation")

# rename_audio_columns() already defined in Cell 8 helper functions section
# function signature: rename_audio_columns(df, dataset_code, keep_msd_trackid=True)
# returns: (renamed_df, mapping_dict)

print("[info] using helper function: rename_audio_columns()")
print("[info] naming convention: {AA}{NNN} (2 letters + 3 zero-padded digits)")
print("[info] dataset codes: AO=area-of-moments, LP=lpc, SP=spectral-all, TI=timbral")
print("[info] MSD_TRACKID preserved as common join key")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(d) continued - Load and rename the 4 required datasets
cell_time = time.time()

print("\n____Loading and Renaming 4 Required Datasets____\n")

# config for each dataset
datasets_config = [
    {
        'name': 'msd-jmir-area-of-moments-all-v1.0',
        'code': 'AO',
        'path': f"{WASBS_DATA}/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/"
    },
    {
        'name': 'msd-jmir-lpc-all-v1.0',
        'code': 'LP',
        'path': f"{WASBS_DATA}/audio/features/msd-jmir-lpc-all-v1.0.csv/"
    },
    {
        'name': 'msd-jmir-spectral-all-all-v1.0',
        'code': 'SP',
        'path': f"{WASBS_DATA}/audio/features/msd-jmir-spectral-all-all-v1.0.csv/"
    },
    {
        'name': 'msd-marsyas-timbral-v1.0',
        'code': 'TI',
        'path': f"{WASBS_DATA}/audio/features/msd-marsyas-timbral-v1.0.csv/"
    }
]

# storage for renamed dataframes and mappings
renamed_dfs = {}
all_mappings = {}

for config in datasets_config:
    print(f"[loading] {config['name']}...")
    
    # load with schema
    schema = schemas[config['name']]
    df = spark.read.csv(config['path'], header=True, schema=schema)
    
    # count before
    col_count_before = len(df.columns)
    
    # rename columns
    renamed_df, mapping = rename_audio_columns(df, config['code'])
    
    # count after
    col_count_after = len(renamed_df.columns)
    
    # store results
    renamed_dfs[config['code']] = renamed_df
    all_mappings[config['code']] = mapping
    
    print(f"[renamed] {config['code']}: {col_count_before} columns → {col_count_after} columns")
    print(f"[sample] {list(renamed_df.columns)[:5]}...\n")

print(f"[summary] successfully renamed all 4 datasets")
print(f"[total mappings] {sum(len(m) for m in all_mappings.values())} column name mappings created")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(d) continued - Display before/after examples
cell_time = time.time()

print("\n____Before/After Column Naming Examples____\n")

# show examples from each dataset
examples = [
    ('AO', 'msd-jmir-area-of-moments-all-v1.0'),
    ('LP', 'msd-jmir-lpc-all-v1.0'),
    ('SP', 'msd-jmir-spectral-all-all-v1.0'),
    ('TI', 'msd-marsyas-timbral-v1.0')
]

for code, dataset_name in examples:
    print(f"[{code}] {dataset_name}:")
    mapping = all_mappings[code]
    
    # get first 5 feature columns (skip MSD_TRACKID if present)
    feature_mappings = [(old, new) for old, new in mapping.items() if old != 'MSD_TRACKID'][:5]
    
    for old_name, new_name in feature_mappings:
        # truncate long names for display
        display_old = old_name if len(old_name) <= 60 else old_name[:57] + '...'
        print(f"  {display_old:60s} → {new_name}")
    print()

# show length comparison
print("[length comparison]")
old_lengths = []
new_lengths = []
for mapping in all_mappings.values():
    for old, new in mapping.items():
        if old != 'MSD_TRACKID':  # exclude join key
            old_lengths.append(len(old))
            new_lengths.append(len(new))

print(f"  original names: avg={sum(old_lengths)/len(old_lengths):.1f} chars, max={max(old_lengths)} chars")
print(f"  new names:      avg={sum(new_lengths)/len(new_lengths):.1f} chars, max={max(new_lengths)} chars")
print(f"  reduction:      {100*(1 - sum(new_lengths)/sum(old_lengths)):.1f}% fewer characters overall")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(d) continued - Create comprehensive mapping table (translation table)
cell_time = time.time()

print("\n____Creating Column Name Mapping Table____\n")

# build comprehensive mapping dataframe
mapping_rows = []
for dataset_code, mapping in all_mappings.items():
    # get full dataset name
    dataset_full_names = {
        'AO': 'msd-jmir-area-of-moments-all-v1.0',
        'LP': 'msd-jmir-lpc-all-v1.0',
        'SP': 'msd-jmir-spectral-all-all-v1.0',
        'TI': 'msd-marsyas-timbral-v1.0'
    }
    full_name = dataset_full_names[dataset_code]
    
    # add each mapping
    for original, new in mapping.items():
        mapping_rows.append({
            'dataset_code': dataset_code,
            'dataset_name': full_name,
            'original_column_name': original,
            'new_column_name': new,
            'original_length': len(original),
            'new_length': len(new),
            'is_join_key': 'Yes' if original == 'MSD_TRACKID' else 'No'
        })

# create pandas dataframe
mapping_df = pd.DataFrame(mapping_rows)

print(f"[created] mapping table with {len(mapping_df)} rows")
print(f"[datasets] {mapping_df['dataset_code'].nunique()} datasets")
print(f"[columns] {mapping_df.groupby('dataset_code')['new_column_name'].count().to_dict()}")

# display sample
print("\n[sample] first 10 rows of mapping table:")
display(mapping_df.head(10))

# save to csv for supplementary materials
csv_output_path = '../report/supplementary/audio_column_name_mapping.csv'
mapping_df.to_csv(csv_output_path, index=False)
print(f"\n[saved] mapping table to: {csv_output_path}")
print("[info] this csv file can be used to translate between original and new column names")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


In [None]:
# Q2(d) continued - Verify renaming success and show final statistics
cell_time = time.time()

print("\n____Renaming Verification and Final Statistics____\n")

# verify all renamed dataframes
print("[verification] checking renamed dataframes:")
for code, df in renamed_dfs.items():
    print(f"  {code}: {len(df.columns)} columns, {df.count()} rows")

# check for any column name collisions after renaming
print("\n[collision check] verifying uniqueness across all datasets:")
all_new_columns = []
for code, mapping in all_mappings.items():
    all_new_columns.extend([new for old, new in mapping.items() if old != 'MSD_TRACKID'])

unique_columns = set(all_new_columns)
if len(all_new_columns) == len(unique_columns):
    print(f"  ✓ all {len(all_new_columns)} feature columns are unique across datasets")
    print(f"  ✓ no collisions detected")
else:
    print(f"  ✗ warning: {len(all_new_columns) - len(unique_columns)} collision(s) found")

# verify MSD_TRACKID preserved
print("\n[join key check] verifying MSD_TRACKID preservation:")
for code, df in renamed_dfs.items():
    has_trackid = 'MSD_TRACKID' in df.columns
    status = '✓' if has_trackid else '✗'
    print(f"  {status} {code}: MSD_TRACKID {'present' if has_trackid else 'MISSING'}")

# final summary
print("\n[summary] Q2(d) systematic column renaming complete:")
print(f"  • renamed {sum(len(m) for m in all_mappings.values())} columns across 4 datasets")
print(f"  • naming convention: 2-letter code + 3-digit number (e.g., AO001, LP001)")
print(f"  • MSD_TRACKID preserved as common join key")
print(f"  • mapping table saved to: report/supplementary/audio_column_name_mapping.csv")
print(f"  • average name length reduced from {sum(old_lengths)/len(old_lengths):.1f} to {sum(new_lengths)/len(new_lengths):.1f} characters")
print(f"  • ready for audio similarity section (binary and multiclass classification)")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")


## Processing Section Completion

The following cells complete the Processing section requirements by generating AI-readable artifacts and validation outputs.

In [None]:
hprint("Infrastructure: Local Paths")
# supports: all processing completion cells
# does: defines local supplementary path and ensures directory exists for ai-readable outputs

# local supplementary folder (relative from code/ directory)
LOCAL_SUPPLEMENTARY = '../report/supplementary/'

# ensure local directory exists
os.makedirs(LOCAL_SUPPLEMENTARY, exist_ok=True)

print(f"[paths] WASBS_DATA: {WASBS_DATA}")
print(f"[paths] WASBS_USER: {WASBS_USER}")
print(f"[paths] LOCAL_SUPPLEMENTARY: {LOCAL_SUPPLEMENTARY}")
print(f"[check] supplementary folder exists: {os.path.exists(LOCAL_SUPPLEMENTARY)}")

In [None]:
hprint("Processing: Q1(b)01")
# supports: Q1(b) — compute dataset statistics (names, sizes, formats, row counts)
# does: extracts hdfs directory sizes, creates comprehensive statistics dataframe, saves as csv/json/png

cell_time = time.time()

print("\n[Q1(b)] extracting dataset statistics from WASBS_DATA...")

# extract directory sizes using hdfs dfs -du
def get_directory_size(path):
    """get size of hdfs directory in bytes"""
    try:
        result = get_ipython().getoutput(f'hdfs dfs -du -s {path}')
        if result:
            size_bytes = int(result[0].split()[0])
            return size_bytes
    except Exception as e:
        print(f"[warning] failed to get size for {path}: {e}")
        return 0
    return 0

# define dataset paths and extract sizes
datasets = [
    ('audio-features', f"{WASBS_DATA}audio/features/"),
    ('audio-attributes', f"{WASBS_DATA}audio/attributes/"),
    ('genre', f"{WASBS_DATA}genre/"),
    ('main', f"{WASBS_DATA}main/"),
    ('tasteprofile', f"{WASBS_DATA}tasteprofile/"),
    ('tasteprofile-triplets', f"{WASBS_DATA}tasteprofile/triplets.tsv/"),
    ('tasteprofile-mismatches', f"{WASBS_DATA}tasteprofile/mismatches/")
]

print("[processing] extracting sizes for each dataset directory...")
stats_data = []
for name, path in datasets:
    size_bytes = get_directory_size(path)
    size_mb = size_bytes / (1024**2)
    stats_data.append({
        'dataset': name,
        'path': path,
        'size_bytes': size_bytes,
        'size_mb': round(size_mb, 2)
    })

# Display report-formatted results with MB units and 3-column tabbed format
print("\n" + "="*80)
print("MILLION SONG DATASET - STORAGE SUMMARY")
print("="*80)

total_size_mb = sum(item['size_mb'] for item in stats_data)
total_size_bytes = sum(item['size_bytes'] for item in stats_data)

print("[result] Dataset sizes (MB format with 3-column tabbed output):")
print("MB\t\tBytes\t\t\tName")
print("-"*80)

for item in stats_data:
    name = item['dataset']
    size_mb = item['size_mb']
    size_bytes = item['size_bytes']
    print(f"[result] {size_mb:.2f}\t\t{size_bytes:,}\t\t\t{name}")

print("-"*80)
print(f"[result] TOTAL:\t{total_size_mb:.2f}\t\t{total_size_bytes:,}\t\t\tALL DATASETS")
print("="*80)

# create statistics dataframe
stats_df = pd.DataFrame(stats_data)

# add row counts (will be populated after loading data)
stats_df['row_count'] = 0  # placeholder, will update in next cell
stats_df['column_count'] = 0  # placeholder, will update in next cell

print(f"\n[DEEBUG] dataset statistics summary (internal):")
print(stats_df.to_string(index=False))

# save as csv
csv_path = f"{LOCAL_SUPPLEMENTARY}dataset_statistics.csv"
stats_df.to_csv(csv_path, index=False)
print(f"\n[save] csv: {csv_path}")

# save as json  
json_path = f"{LOCAL_SUPPLEMENTARY}dataset_statistics.json"
stats_dict = {
    'total_size_mb': round(stats_df['size_mb'].sum(), 2),
    'total_size_bytes': int(stats_df['size_bytes'].sum()),
    'dataset_count': len(stats_df),
    'datasets': stats_df.to_dict('records')
}
with open(json_path, 'w') as f:
    json.dump(stats_dict, f, indent=2)
print(f"[save] json: {json_path}")

# save as png table image
fig, ax = plt.subplots(figsize=(14, 6))
ax.axis('off')

# create table with formatted data
table_data = []
for _, row in stats_df.iterrows():
    table_data.append([
        row['dataset'],
        f"{row['size_mb']:.2f} MB",
        f"{row['size_bytes']:,}",
        str(row['row_count']) if row['row_count'] > 0 else 'TBD',
        str(row['column_count']) if row['column_count'] > 0 else 'TBD'
    ])

table = ax.table(
    cellText=table_data,
    colLabels=['Dataset', 'Size (MB)', 'Size (Bytes)', 'Rows', 'Columns'],
    cellLoc='left',
    loc='center',
    colWidths=[0.25, 0.15, 0.25, 0.15, 0.15]
)
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1, 2)

# style header row
for i in range(5):
    table[(0, i)].set_facecolor('#4472C4')
    table[(0, i)].set_text_props(weight='bold', color='white')

plt.title('Dataset Statistics Summary', fontsize=14, weight='bold', pad=20)
png_path = f"{LOCAL_SUPPLEMENTARY}dataset_statistics.png"
plt.savefig(png_path, bbox_inches='tight', dpi=150)
plt.close()
print(f"[result] png: {png_path}")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")

In [None]:
hprint("Processing: Q2(validation)01")
# supports: Q2(b) — validate generated schemas against actual data with inferschema
# does: loads 4 audio feature files with inferschema=true, compares to generated schemas, handles track_id column

cell_time = time.time()

print("\n[Q2(validation)] validating generated schemas against actual data...")

# define audio feature files
audio_files = {
    'AO': ('msd-jmir-area-of-moments-all-v1.0.csv', 'area-of-moments'),
    'LP': ('msd-jmir-lpc-all-v1.0.csv', 'lpc'),
    'SP': ('msd-jmir-spectral-all-derivatives-all-v1.0.csv', 'spectral'),
    'TI': ('msd-marsyas-timbral-v1.0.csv', 'timbral')
}

validation_results = {}

for code, (filename, attr_key) in audio_files.items():
    print(f"\n[validate] {code}: {filename}")
    
    try:
        # load with inferschema
        file_path = f"{WASBS_DATA}audio/features/{filename}"
        df_inferred = spark.read.csv(file_path, header=False, inferSchema=True)
        inferred_schema = df_inferred.schema
        
        print(f"  • inferred columns: {len(inferred_schema.fields)}")
        print(f"  • inferred types: {[str(f.dataType) for f in inferred_schema.fields[:3]]} ...")
        
        # get generated schema
        if 'generated_schemas' in dir() and code in generated_schemas:
            generated_schema = generated_schemas[code]
            generated_cols = len(generated_schema.fields)
            
            print(f"  • generated columns: {generated_cols}")
            
            # check for extra track_id column
            inferred_cols = len(inferred_schema.fields)
            extra_track_id = inferred_cols == generated_cols + 1
            
            # determine validation status
            if inferred_cols == generated_cols:
                status = "PASS"
            elif extra_track_id:
                status = "PASS_WITH_TRACK_ID"
            else:
                status = "MISMATCH"
                
            validation_results[code] = {
                'filename': filename,
                'inferred_columns': inferred_cols,
                'generated_columns': generated_cols,
                'extra_track_id': extra_track_id,
                'status': status
            }
            
            print(f"  • status: {status}")
            
        else:
            print(f"  • warning: no generated schema found for {code.lower()}")
            validation_results[code] = {
                'filename': filename,
                'inferred_columns': len(inferred_schema.fields),
                'generated_columns': 0,
                'extra_track_id': False,
                'status': 'MISSING_SCHEMA'
            }
            
    except Exception as e:
        print(f"  • error: {str(e)}")
        validation_results[code] = {
            'filename': filename,
            'inferred_columns': 0,
            'generated_columns': 0,
            'extra_track_id': False,
            'status': 'ERROR'
        }

print("\n[DEEBUG] schema validation summary:")
for code, result in validation_results.items():
    print(f"  • {code}: {result['status']}")

# save validation results as json
json_path = f"{LOCAL_SUPPLEMENTARY}schema_validation.json"
with open(json_path, 'w') as f:
    json.dump(validation_results, f, indent=2)
print(f"\n[save] json: {json_path}")

# create validation comparison table as png
fig, ax = plt.subplots(figsize=(12, 5))
ax.axis('off')

table_data = []
for code, result in validation_results.items():
    if result['status'] != 'MISSING_SCHEMA':
        table_data.append([
            code,
            result['filename'],
            result['inferred_columns'],
            result['generated_columns'],
            'Yes' if result['extra_track_id'] else 'No',
            result['status']
        ])

# check if we have data for the table
if len(table_data) > 0:
    table = ax.table(
        cellText=table_data,
        colLabels=['Dataset', 'Filename', 'Inferred Cols', 'Generated Cols', 'Extra track_id', 'Status'],
        cellLoc='left',
        loc='center',
        colWidths=[0.08, 0.35, 0.12, 0.12, 0.13, 0.10]
    )
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 2)

    # style header
    for i in range(6):
        table[(0, i)].set_facecolor('#4472C4')
        table[(0, i)].set_text_props(weight='bold', color='white')

    # colour status cells
    for i, (code, result) in enumerate(validation_results.items(), 1):
        if result['status'] != 'MISSING_SCHEMA':
            if result['status'] == 'PASS' or result['status'] == 'PASS_WITH_TRACK_ID':
                table[(i, 5)].set_facecolor('#C6EFCE')
            elif result['status'] == 'MISMATCH':
                table[(i, 5)].set_facecolor('#FFC7CE')

    plt.title('Schema Validation Results', fontsize=14, weight='bold', pad=20)
else:
    # create a message when no validation data is available
    ax.text(0.5, 0.5, 'No schema validation data available\n(All schemas missing)', 
            ha='center', va='center', fontsize=14, 
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray"))
    plt.title('Schema Validation Results - No Data', fontsize=14, weight='bold', pad=20)

png_path = f"{LOCAL_SUPPLEMENTARY}schema_comparison.png"
plt.savefig(png_path, bbox_inches='tight', dpi=150)
plt.close()
print(f"[result] png: {png_path}")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")

In [None]:
hprint("Processing: Q2(counts)01")
# supports: Q2 — document row counts for all datasets
# does: counts rows in all dataframes (4 audio + genre + main + tasteprofile), saves as json/png

cell_time = time.time()

print("\n[Q2(counts)] counting rows in all datasets...")

row_counts = {}

# count rows in audio feature datasets (using renamed dataframes if available)
print("\n[DEEBUG] audio feature datasets:")
if 'renamed_dfs' in dir() and renamed_dfs:
    for code, df in renamed_dfs.items():
        count = df.count()
        row_counts[f'audio_{code.lower()}'] = count
        print(f"  • {code}: {count:,} rows")
else:
    print("  • warning: renamed_dfs not found, loading from original files...")
    for code, (filename, _) in audio_files.items():
        file_path = f"{WASBS_DATA}audio/features/{filename}"
        df = spark.read.csv(file_path, header=False, inferSchema=True)
        count = df.count()
        row_counts[f'audio_{code.lower()}'] = count
        print(f"  • {code}: {count:,} rows")

# count rows in other datasets (load if not already in memory)
print("\n[info] other datasets:")

# genre dataset
try:
    genre_path = f"{WASBS_DATA}genre/"
    df_genre = spark.read.parquet(genre_path)
    count = df_genre.count()
    row_counts['genre'] = count
    print(f"  • genre: {count:,} rows")
except Exception as e:
    print(f"  • genre: unable to load ({e})")
    row_counts['genre'] = 0

# main dataset
try:
    main_path = f"{WASBS_DATA}main/"
    df_main = spark.read.parquet(main_path)
    count = df_main.count()
    row_counts['main'] = count
    print(f"  • main: {count:,} rows")
except Exception as e:
    print(f"  • main: unable to load ({e})")
    row_counts['main'] = 0

# tasteprofile dataset
try:
    taste_path = f"{WASBS_DATA}tasteprofile/"
    df_taste = spark.read.parquet(taste_path)
    count = df_taste.count()
    row_counts['tasteprofile'] = count
    print(f"  • tasteprofile: {count:,} rows")
except Exception as e:
    print(f"  • tasteprofile: unable to load ({e})")
    row_counts['tasteprofile'] = 0

print(f"\n[info] total datasets counted: {len(row_counts)}")
print(f"[info] total rows across all datasets: {sum(row_counts.values()):,}")

# save as json
json_path = f"{LOCAL_SUPPLEMENTARY}row_counts.json"
with open(json_path, 'w') as f:
    json.dump(row_counts, f, indent=2)
print(f"\n[save] json: {json_path}")

# create row counts table as png
fig, ax = plt.subplots(figsize=(10, 6))
ax.axis('off')

table_data = [[k.replace('_', ' ').title(), f"{v:,}"] for k, v in sorted(row_counts.items())]
table_data.append(['TOTAL', f"{sum(row_counts.values()):,}"])

table = ax.table(
    cellText=table_data,
    colLabels=['Dataset', 'Row Count'],
    cellLoc='left',
    loc='center',
    colWidths=[0.6, 0.4]
)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2.5)

# style header
for i in range(2):
    table[(0, i)].set_facecolor('#4472C4')
    table[(0, i)].set_text_props(weight='bold', color='white')

# style total row
last_row = len(table_data)
for i in range(2):
    table[(last_row, i)].set_facecolor('#E7E6E6')
    table[(last_row, i)].set_text_props(weight='bold')

plt.title('Row Counts by Dataset', fontsize=14, weight='bold', pad=20)
png_path = f"{LOCAL_SUPPLEMENTARY}row_counts.png"
plt.savefig(png_path, bbox_inches='tight', dpi=150)
plt.close()
print(f"[result] png: {png_path}")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")

In [None]:
hprint("Processing: Q2(d)visualise01")
# supports: Q2(d) — visualise renamed schemas for documentation
# does: generates 4 png schema diagrams with column mappings, checks existence first to avoid regeneration

cell_time = time.time()

print("\n[Q2(d)] generating schema visualisation images...")

# define schema visualisations to create
schema_vis = {
    'AO': ('aom_schema.png', 'Area of Moments Schema'),
    'LP': ('lpc_schema.png', 'Linear Predictive Coding Schema'),
    'SP': ('spectral_schema.png', 'Spectral Features Schema'),
    'TI': ('timbral_schema.png', 'Timbral Features Schema')
}

images_created = 0
images_skipped = 0

for code, (filename, title) in schema_vis.items():
    png_path = f"{LOCAL_SUPPLEMENTARY}{filename}"
    
    # check if image already exists
    if os.path.exists(png_path):
        print(f"  • {code}: skipped (already exists) - {png_path}")
        images_skipped += 1
        continue
    
    print(f"  • {code}: generating {filename}...")
    
    # get renamed dataframe
    if 'renamed_dfs' in dir() and code in renamed_dfs:
        df = renamed_dfs[code]
        
        # extract schema information
        schema_info = []
        for i, field in enumerate(df.schema.fields, 1):
            type_str = str(field.dataType).replace('Type()', '')
            schema_info.append(f"{field.name:<15} {type_str:<10}")
        
        # create figure
        fig, ax = plt.subplots(figsize=(10, max(8, len(schema_info) * 0.3)))
        ax.axis('off')
        
        # add title
        ax.text(0.5, 0.98, title, fontsize=14, weight='bold', ha='center', va='top')
        ax.text(0.5, 0.95, f"Total Columns: {len(schema_info)}", fontsize=10, ha='center', va='top')
        
        # add schema text in columns (split if too many)
        if len(schema_info) <= 30:
            # single column
            schema_text = '\n'.join(schema_info)
            ax.text(0.1, 0.90, schema_text, fontsize=8, family='monospace', va='top')
        else:
            # two columns
            mid = len(schema_info) // 2
            col1 = '\n'.join(schema_info[:mid])
            col2 = '\n'.join(schema_info[mid:])
            ax.text(0.05, 0.90, col1, fontsize=8, family='monospace', va='top')
            ax.text(0.52, 0.90, col2, fontsize=8, family='monospace', va='top')
        
        plt.savefig(png_path, bbox_inches='tight', dpi=150)
        plt.close()
        print(f"    ✓ saved: {png_path}")
        images_created += 1
    else:
        print(f"    ✗ warning: renamed dataframe for {code} not found")

print(f"\n[summary] images created: {images_created}, skipped: {images_skipped}")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")

In [None]:
hprint("Processing: Q2(schemas)save01")
# supports: Q2(b) — persist schemas and data samples for ai analysis
# does: converts structtype schemas to json format, saves 10-row samples of each dataset as csv, creates metadata

cell_time = time.time()

print("\n[Q2(schemas)] saving schemas and data samples for ai analysis...")

# collect all schemas in json format
all_schemas_json = {}

for code, (filename, attr_key) in audio_files.items():
    print(f"\n[save] {code}: {filename}")
    
    # get renamed dataframe
    if 'renamed_dfs' in dir() and code in renamed_dfs:
        df = renamed_dfs[code]
        
        # convert schema to json-serialisable format
        schema_json = {
            'fields': [
                {
                    'name': f.name,
                    'type': str(f.dataType),
                    'nullable': f.nullable
                }
                for f in df.schema.fields
            ],
            'column_count': len(df.schema.fields)
        }
        all_schemas_json[code] = schema_json
        print(f"  • schema: {len(df.schema.fields)} fields")
        
        # save sample data as csv (10 rows)
        sample_csv_path = f"{LOCAL_SUPPLEMENTARY}{code.lower()}_sample.csv"
        df.limit(10).toPandas().to_csv(sample_csv_path, index=False)
        print(f"  • sample csv: {sample_csv_path}")
        
        # save individual statistics as json
        stats = {
            'dataset_code': code,
            'filename': filename,
            'columns': len(df.schema.fields),
            'sample_rows': 10,
            'column_names': [f.name for f in df.schema.fields],
            'column_types': [str(f.dataType) for f in df.schema.fields]
        }
        stats_json_path = f"{LOCAL_SUPPLEMENTARY}{code.lower()}_stats.json"
        with open(stats_json_path, 'w') as f:
            json.dump(stats, f, indent=2)
        print(f"  • stats json: {stats_json_path}")
    else:
        print(f"  • warning: renamed dataframe for {code} not found")

# save combined schemas json
schemas_json_path = f"{LOCAL_SUPPLEMENTARY}audio_schemas.json"
with open(schemas_json_path, 'w') as f:
    json.dump(all_schemas_json, f, indent=2)
print(f"\n[save] combined schemas json: {schemas_json_path}")
print(f"[info] schemas saved for {len(all_schemas_json)} datasets")

cell_time = time.time() - cell_time
print(f"\n[time] cell time (sec): {cell_time:5.2f}")
print(f"[time] cell time (min): {cell_time/60:5.2f}")

## Processing Section Complete

All required artifacts have been generated:

**Dataset Documentation:**
- `dataset_statistics.csv/json/png` - Complete dataset inventory with sizes
- `row_counts.json/png` - Row counts for all datasets

**Schema Validation:**
- `schema_validation.json` - Validation results comparing generated vs inferred schemas
- `schema_comparison.png` - Visual comparison table

**Schema Documentation:**
- `audio_schemas.json` - All 4 audio feature schemas in JSON format
- `aom_schema.png` - Area of Moments schema diagram
- `lpc_schema.png` - Linear Predictive Coding schema diagram
- `spectral_schema.png` - Spectral features schema diagram
- `timbral_schema.png` - Timbral features schema diagram

**Data Samples (AI-readable):**
- `ao_sample.csv`, `lp_sample.csv`, `sp_sample.csv`, `ti_sample.csv` - 10 rows each
- `ao_stats.json`, `lp_stats.json`, `sp_stats.json`, `ti_stats.json` - Individual metadata

All files saved to: `../report/supplementary/`

**Ready for Audio Similarity Section**