**Spark notebook**

This notebook will only work in a Jupyter session running on `mathmadslinux2p`.

You can start your own Jupyter session on `mathmadslinux2p` and open this notebook in Chrome on the MADS Windows server by following these steps:

1. Login to the MADS Windows server using https://mathportal.canterbury.ac.nz/.
2. Download or copy this notebook to your home directory.
3. Open PowerShell and run `ssh mathmadslinux2p`.
4. Run `start_pyspark_notebook` or `/opt/anaconda3/bin/jupyter-notebook --ip 132.181.129.68 --port $((8000 + $((RANDOM % 999))))`.
5. Copy and paste the URL printed in the shell window into Chrome on the MADS Windows server.
6. Open the notebook from the Jupyter root directory (which is your home directory).
7. Run `start_spark()` to start a Spark session in the notebook.
8. Run `stop_spark()` before closing the notebook, or kill your Spark application by hand using the link in the Spark UI.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name with any trailing ``@domain`` part stripped.
    """

    login = getpass.getuser()

    # Remove the "@" and whatever follows it on the same line (e.g. "user@domain" -> "user")
    return re.sub('@.*', '', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with ``str()`` and HTML-escaped, so characters such as ``<``, ``>`` and ``&``
    are shown literally instead of being interpreted as markup.

    Args:
        d (dict): mapping whose items become the table rows, in iteration order

    Returns:
        str: the complete table as a single HTML string
    """

    # Imported locally under an alias because this notebook also uses the name `html` for its own variables
    from html import escape as escape_html

    rows = [
        f'<tr><td style="text-align:left;">{escape_html(str(k))}</td><td>{escape_html(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return '<table width="100%" style="width:100%; font-family: monospace;">' + ''.join(rows) + '</table>'


def show_as_html(df, n=20):
    """Preview a spark dataframe as html by reusing the pandas jupyter integration.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    # Truncate on the Spark side first so that at most n rows are converted to pandas
    preview = df.limit(n)

    display(preview.toPandas())

    
def display_spark():
    """Render an HTML status panel for the notebook's Spark session.

    If both the `spark` and `sc` globals are defined the session is reported as active, with links to the Spark UIs,
    a table of the current configuration and usage notes. Otherwise the session is reported as stopped.
    """

    # Link to the cluster-wide Spark UI, shown in both the active and the stopped panel
    spark_ui_item = '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'

    if not ('spark' in globals() and 'sc' in globals()):

        # No session is defined: tell the user which application name to look for under completed applications
        expected_name = username() + " (jupyter)"
        stopped_panel = (
            '<p><b>Spark</b></p>'
            + f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{expected_name}</code> is under the completed applications section in the Spark UI.</p>'
            + '<ul>'
            + spark_ui_item
            + '</ul>'
        )
        display(HTML(stopped_panel))
        return

    name = sc.getConf().get("spark.app.name")
    app_ui_item = f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>'
    config_table = dict_to_html(dict(sc.getConf().getAll()))

    sections = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>' + spark_ui_item + app_ui_item + '</ul>',
        '<p><b>Config</b></p>',
        config_table,
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(sections)))


# Functions to start and stop spark

def _memory_setting(gigabytes):
    """Format a memory amount given in gigabytes as a Spark size string.

    Whole amounts are written in gigabytes (e.g. 4 -> "4g"). Fractional amounts are converted to a whole number of
    megabytes (e.g. 1.5 -> "1536m") because Spark does not accept fractional size strings such as "1.5g".
    """

    amount = float(gigabytes)
    if amount.is_integer():
        return f"{int(amount)}g"
    return f"{int(round(amount * 1024))}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The session status is displayed once the session has been created.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): memory per executor in gigabytes, fractions are allowed (default: 1)
        master_memory (float): driver memory in gigabytes, fractions are allowed (default: 1)
    """

    global spark
    global sc

    user = username()

    cores = executor_instances * executor_cores
    partitions = cores * 4  # four shuffle partitions per requested core
    port = 4000 + random.randint(1, 999)  # random UI port in 4001-4999

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", _memory_setting(worker_memory))
        .config("spark.driver.memory", _memory_setting(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Shut down the running Spark session, if any, and remove the `spark` and `sc` globals.

    The session status is displayed afterwards whether or not a session was running.
    """

    global spark, sc

    # Only act when both globals exist
    session_defined = all(name in globals() for name in ('spark', 'sc'))

    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    # Preserve whitespace in preformatted output exactly as printed (no line wrapping)
    'pre { white-space: pre !important; }',
    # Keep each cell of a rendered dataframe table on a single line
    'table.dataframe td { white-space: nowrap !important; }',
    # Hide the first header cell and the row header cells of rendered dataframe tables
    # (presumably the pandas index column - verify against the rendered output)
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
# Emit the stylesheet as an HTML output so it applies to this notebook page
display(HTML(''.join(html)))

In [2]:
# Run this cell to start a spark session in this notebook

# Request 4 executors with 2 cores and 4 GB of memory each, plus 4 GB for the driver
start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.ui.port,4494
spark.sql.warehouse.dir,file:/users/home/dcp31/assignment_2/spark-warehouse
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.app.name,dcp31 (jupyter)
spark.executor.id,driver
spark.executor.cores,2


In [3]:
# Write your imports and code here or insert cells below

from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

# Data Processing

### Question 1 (a)

In [231]:
!hdfs dfs -count /data/msd/
!hdfs dfs -ls /data/msd/

          24          133        13896584474 /data/msd
Found 4 items
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/genre
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:28 /data/msd/main
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/tasteprofile


In [4]:
!hdfs fsck /data/msd/ -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=dcp31&files=1&blocks=1&path=%2Fdata%2Fmsd
FSCK started by dcp31 (auth:SIMPLE) from /192.168.40.11 for path /data/msd at Wed Jun 05 15:44:34 NZST 2024

/data/msd <dir>
/data/msd/audio <dir>
/data/msd/audio/attributes <dir>
/data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv 1051 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761895_21073 len=1051 Live_repl=8

/data/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv 671 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761904_21082 len=671 Live_repl=8

/data/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv 484 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761900_21078 len=484 Live_repl=8

/data/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv 898 bytes

#### Audio

In [232]:
!hdfs dfs -count /data/msd/audio/
!hdfs dfs -ls /data/msd/audio/

          17          118        13167872421 /data/msd/audio
Found 3 items
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio/attributes
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio/features
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:28 /data/msd/audio/statistics


In [233]:
!hdfs dfs -count /data/msd/audio/attributes
!hdfs dfs -ls /data/msd/audio/attributes

           1           13             105513 /data/msd/audio/attributes
Found 13 items
-rwxr-xr-x   8 jsw93 supergroup       1051 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        671 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        484 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        898 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        777 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        777 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-spectral-derivatives-all-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup      12317 2021-09-29 10:35 /data/msd/audio/attributes/msd-marsyas-timbral-v1.0

In [15]:
!hdfs dfs -cat /data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv | head

Area_Method_of_Moments_Overall_Standard_Deviation_1,real
Area_Method_of_Moments_Overall_Standard_Deviation_2,real
Area_Method_of_Moments_Overall_Standard_Deviation_3,real
Area_Method_of_Moments_Overall_Standard_Deviation_4,real
Area_Method_of_Moments_Overall_Standard_Deviation_5,real
Area_Method_of_Moments_Overall_Standard_Deviation_6,real
Area_Method_of_Moments_Overall_Standard_Deviation_7,real
Area_Method_of_Moments_Overall_Standard_Deviation_8,real
Area_Method_of_Moments_Overall_Standard_Deviation_9,real
Area_Method_of_Moments_Overall_Standard_Deviation_10,real


In [16]:
!hdfs dfs -cat /data/msd/audio/attributes/msd-trh-v1.0.attributes.csv | head

"component_0",NUMERIC
"component_1",NUMERIC
"component_2",NUMERIC
"component_3",NUMERIC
"component_4",NUMERIC
"component_5",NUMERIC
"component_6",NUMERIC
"component_7",NUMERIC
"component_8",NUMERIC
"component_9",NUMERIC


In [234]:
!hdfs dfs -count /data/msd/audio/features
!hdfs dfs -ls /data/msd/audio/features # these are directories of part files

          14          104        13125542239 /data/msd/audio/features
Found 13 items
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:31 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:33 /data/msd/audio/features/msd-jmir-lpc-all-v1.0.csv
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio/features/msd-jmir-methods-of-moments-all-v1.0.csv
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio/features/msd-jmir-mfcc-all-v1.0.csv
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:33 /data/msd/audio/features/msd-jmir-spectral-all-all-v1.0.csv
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio/features/msd-jmir-spectral-derivatives-all-all-v1.0.csv
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio/features/msd-marsyas-timbral-v1.0.csv
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:29 /data/msd/audio/f

In [36]:
!hdfs dfs -du -h /data/msd/audio/features/ # checking the sizes of the datasets

65.5 M   524.2 M  /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv
53.1 M   424.6 M  /data/msd/audio/features/msd-jmir-lpc-all-v1.0.csv
35.8 M   286.5 M  /data/msd/audio/features/msd-jmir-methods-of-moments-all-v1.0.csv
70.8 M   566.1 M  /data/msd/audio/features/msd-jmir-mfcc-all-v1.0.csv
51.1 M   408.9 M  /data/msd/audio/features/msd-jmir-spectral-all-all-v1.0.csv
51.1 M   408.9 M  /data/msd/audio/features/msd-jmir-spectral-derivatives-all-all-v1.0.csv
412.2 M  3.2 G    /data/msd/audio/features/msd-marsyas-timbral-v1.0.csv
1.3 G    10.3 G   /data/msd/audio/features/msd-mvd-v1.0.csv
240.3 M  1.9 G    /data/msd/audio/features/msd-rh-v1.0.csv
4.0 G    32.3 G   /data/msd/audio/features/msd-rp-v1.0.csv
640.6 M  5.0 G    /data/msd/audio/features/msd-ssd-v1.0.csv
1.4 G    11.5 G   /data/msd/audio/features/msd-trh-v1.0.csv
3.9 G    31.0 G   /data/msd/audio/features/msd-tssd-v1.0.csv


In [17]:
!hdfs dfs -ls /data/msd/audio/features/msd-tssd-v1.0.csv

Found 8 items
-rwxr-xr-x   8 jsw93 supergroup  523653885 2021-09-29 10:30 /data/msd/audio/features/msd-tssd-v1.0.csv/part-00000.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  523973513 2021-09-29 10:31 /data/msd/audio/features/msd-tssd-v1.0.csv/part-00001.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  523846402 2021-09-29 10:29 /data/msd/audio/features/msd-tssd-v1.0.csv/part-00002.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  523674195 2021-09-29 10:29 /data/msd/audio/features/msd-tssd-v1.0.csv/part-00003.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  523778813 2021-09-29 10:30 /data/msd/audio/features/msd-tssd-v1.0.csv/part-00004.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  524004691 2021-09-29 10:30 /data/msd/audio/features/msd-tssd-v1.0.csv/part-00005.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  523959080 2021-09-29 10:30 /data/msd/audio/features/msd-tssd-v1.0.csv/part-00006.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  499578908 2021-09-29 10:29 /data/msd/audio/features/msd-tssd-v1.0.csv/part-00007.csv.gz


In [19]:
!hdfs dfs -cat /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00000.csv.gz | gunzip | head

1.431,6713.0,52600.0,160600000.0,1264000000.0,9943000000.0,7.086e+12,11400000000.0,89730000000.0,3.465e+15,5.252,11580.0,90080.0,-179100000.0,-1396000000.0,-10870000000.0,6.236e+12,12580000000.0,98020000000.0,2.97e+15,'TRMMMYQ128F932D901'
0.9864,3361.0,24270.0,40110000.0,287800000.0,2064000000.0,8.837e+11,2596000000.0,18630000000.0,3.232e+14,2.773,5774.0,41490.0,-44600000.0,-320900000.0,-2307000000.0,7.756e+11,2885000000.0,20760000000.0,2.883e+14,'TRMMMKD128F425225D'
1.791,6717.0,57790.0,160900000.0,1385000000.0,11910000000.0,7.105e+12,12520000000.0,1.077e+11,4.52e+15,6.43,11600.0,99690.0,-179500000.0,-1544000000.0,-13270000000.0,6.255e+12,13950000000.0,1.2e+11,3.976e+15,'TRMMMRX128F93187D9'
2.209,3371.0,34750.0,40350000.0,412300000.0,4210000000.0,8.912e+11,3710000000.0,37900000000.0,9.415e+14,5.734,5792.0,58320.0,-44870000.0,-454600000.0,-4603000000.0,7.828e+11,4083000000.0,41370000000.0,8.199e+14,'TRMMMCH128F425532C'
0.6846,6708.0,30690.0,160400000.0,748900000.0,3492000000.0,7.07

In [18]:
!hdfs dfs -ls /data/msd/audio/statistics

Found 1 items
-rwxr-xr-x   8 jsw93 supergroup   42224669 2021-09-29 10:28 /data/msd/audio/statistics/sample_properties.csv.gz


In [21]:
!hdfs dfs -cat /data/msd/audio/statistics/sample_properties.csv.gz | gunzip | head

track_id,title,artist_name,duration,7digita_Id,sample_bitrate,sample_length,sample_rate,sample_mode,sample_version,filesize
TRMMMYQ128F932D901,"Silent Night","Faster Pussy cat",252.05506,7032331,128,60.1935770567,22050,1,2,960887
TRMMMKD128F425225D,"Tanssi vaan",Karkkiautomaatti,156.55138,1514808,64,30.2244270016,22050,1,2,242038
TRMMMRX128F93187D9,"No One Could Ever","Hudson Mohawke",138.97098,6945353,128,60.1935770567,22050,1,2,960887
TRMMMCH128F425532C,"Si Vos Querés","Yerba Brava",145.05751,2168257,64,30.2083516484,22050,1,2,240534
TRMMMWA128F426B589,"Tangle Of Aspens","Der Mystic",514.29832,2264873,64,60.3382103611,22050,1,2,480443
TRMMMXN128F42936A5,"Symphony No. 1 G minor ""Sinfonie Serieuse""/Allegro con energia","David Montgomery",816.53506,3360982,128,30.1360348456,44100,0,1,481070
TRMMMLR128F1494097,"We Have Got Love","Sasha / Turbulence",212.37506,552626,64,60.3542857143,22050,1,2,480686
TRMMMBB12903CB7D21,"2 Da Beat Ch'yall","Kris Kross",221.20444,6435649,128,30.13

In [14]:
!hdfs fsck /data/msd/audio/statistics/ -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=dcp31&files=1&blocks=1&path=%2Fdata%2Fmsd%2Faudio%2Fstatistics
FSCK started by dcp31 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/audio/statistics at Tue Jun 04 08:19:18 NZST 2024

/data/msd/audio/statistics <dir>
/data/msd/audio/statistics/sample_properties.csv.gz 42224669 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761717_20895 len=42224669 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			1
 Total symlinks:		0

Replicated Blocks:
 Total size:	42224669 B
 Total files:	1
 Total blocks (validated):	1 (avg. block size 42224669 B)
 Minimally replicated blocks:	1 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 Mis-replicated blocks:		0 (0.0 %)
 Default replication factor:	4
 Average block replication:	8.0
 Missing blocks:		0
 Corrupt blocks:		0
 Missing replicas:	

#### Genre

In [235]:
!hdfs dfs -count /data/msd/genre/
!hdfs dfs -ls /data/msd/genre/

           1            3           31585889 /data/msd/genre
Found 3 items
-rwxr-xr-x   8 jsw93 supergroup   11625230 2021-09-29 10:35 /data/msd/genre/msd-MAGD-genreAssignment.tsv
-rwxr-xr-x   8 jsw93 supergroup    8820054 2021-09-29 10:35 /data/msd/genre/msd-MASD-styleAssignment.tsv
-rwxr-xr-x   8 jsw93 supergroup   11140605 2021-09-29 10:35 /data/msd/genre/msd-topMAGD-genreAssignment.tsv


In [101]:
!hdfs dfs -cat /data/msd/genre/msd-MASD-styleAssignment.tsv | head

TRAAAAK128F9318786	Metal_Alternative
TRAAAAV128F421A322	Punk
TRAAAAW128F429D538	Hip_Hop_Rap
TRAAACV128F423E09E	Rock_Neo_Psychedelia
TRAAAEF128F4273421	Pop_Indie
TRAAAFP128F931B4E3	Hip_Hop_Rap
TRAAAGR128F425B14B	Pop_Contemporary
TRAAAHD128F42635A5	Rock_Hard
TRAAAHJ128F931194C	Pop_Indie
TRAAAHZ128E0799171	Hip_Hop_Rap
cat: Unable to write to output stream.


In [10]:
!hdfs fsck /data/msd/genre/ -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=dcp31&files=1&blocks=1&path=%2Fdata%2Fmsd%2Fgenre
FSCK started by dcp31 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/genre at Tue Jun 04 08:17:11 NZST 2024

/data/msd/genre <dir>
/data/msd/genre/msd-MAGD-genreAssignment.tsv 11625230 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761918_21096 len=11625230 Live_repl=8

/data/msd/genre/msd-MASD-styleAssignment.tsv 8820054 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761917_21095 len=8820054 Live_repl=8

/data/msd/genre/msd-topMAGD-genreAssignment.tsv 11140605 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761916_21094 len=11140605 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			1
 Total symlinks:		0

Replicated Blocks:
 Total size:	31585889 

#### Main

In [236]:
!hdfs dfs -count /data/msd/main/
!hdfs dfs -ls /data/msd/main/

           2            2          182869445 /data/msd/main
Found 1 items
drwxr-xr-x   - jsw93 supergroup          0 2022-05-23 12:11 /data/msd/main/summary


In [237]:
!hdfs dfs -count /data/msd/main/summary/
!hdfs dfs -ls /data/msd/main/summary/

           1            2          182869445 /data/msd/main/summary
Found 2 items
-rwxr-xr-x   8 jsw93 supergroup   58658141 2021-09-29 10:28 /data/msd/main/summary/analysis.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  124211304 2021-09-29 10:28 /data/msd/main/summary/metadata.csv.gz


In [18]:
!hdfs dfs -cat /data/msd/main/summary/analysis.csv.gz | gunzip | head

analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,idx_bars_confidence,idx_bars_start,idx_beats_confidence,idx_beats_start,idx_sections_confidence,idx_sections_start,idx_segments_confidence,idx_segments_loudness_max,idx_segments_loudness_max_time,idx_segments_loudness_start,idx_segments_pitches,idx_segments_start,idx_segments_timbre,idx_tatums_confidence,idx_tatums_start,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id
22050,aee9820911781c734e7694c5432990ca,0.0,252.05506,2.049,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0.777,-4.829,0,0.688,236.635,87.002,4,0.94,TRMMMYQ128F932D901
22050,ed222d07c83bac7689d52753610a513a,0.0,156.55138,0.258,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0.808,-10.555,1,0.355,148.66,150.778,1,0.0,TRMMMKD128F425225D
22050,96c7104889a128fef84fa469d60e380c,0.0,138.97098,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0.418,-2.06,1,0.566,138.971,177.768,4,0.446,TRMMMRX128F93187D9
2205

In [4]:
!hdfs dfs -cat /data/msd/main/summary/metadata.csv.gz | gunzip | head

analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,artist_playmeid,genre,idx_artist_terms,idx_similar_artists,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid
,4069,0.6498221002008776,0.3940318927141434,ARYZTJS1187B98C555,,,,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,44895,,0,0,Monster Ballads X-Mas,633681,0.5428987432910862,SOQMMHC12AB0180CB8,Silent Night,7032331
,113480,0.4396039666767154,0.3569921077564064,ARMVN3U1187FB3A1EB,,,,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Karkkiautomaatti,-1,,0,0,Karkuteillä,145266,0.2998774882739778,SOVFVAK12A8C1350D9,Tanssi vaan,1514808
,63531,0.6436805720579895,0.4375038365946544,ARGEKB01187FB50750,55.8578,"Glasgow, Scotland",-4.24251,3d403d44-36ce-465c-ad43-ae877e65adc4,Hudson Mohawke,-1,,0,0,Butter,625706,0.6178709693948196,SOGTUKN12AB017F4F1,No One Could Ever,6945353
,65051,0.44850115965646636,0.37234906851712

In [9]:
!hdfs fsck /data/msd/main/summary/ -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=dcp31&files=1&blocks=1&path=%2Fdata%2Fmsd%2Fmain%2Fsummary
FSCK started by dcp31 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/main/summary at Tue Jun 04 08:16:46 NZST 2024

/data/msd/main/summary <dir>
/data/msd/main/summary/analysis.csv.gz 58658141 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761716_20894 len=58658141 Live_repl=8

/data/msd/main/summary/metadata.csv.gz 124211304 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761715_20893 len=124211304 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			1
 Total symlinks:		0

Replicated Blocks:
 Total size:	182869445 B
 Total files:	2
 Total blocks (validated):	2 (avg. block size 91434722 B)
 Minimally replicated blocks:	2 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 

#### Taste Profile

In [238]:
!hdfs dfs -count /data/msd/tasteprofile/
!hdfs dfs -ls /data/msd/tasteprofile/

           3           10          514256719 /data/msd/tasteprofile
Found 2 items
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/tasteprofile/mismatches
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv


In [239]:
!hdfs dfs -count /data/msd/tasteprofile/mismatches/
!hdfs dfs -ls /data/msd/tasteprofile/mismatches/

           1            2            2117524 /data/msd/tasteprofile/mismatches
Found 2 items
-rwxr-xr-x   8 jsw93 supergroup      91342 2021-09-29 10:35 /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt
-rwxr-xr-x   8 jsw93 supergroup    2026182 2021-09-29 10:35 /data/msd/tasteprofile/mismatches/sid_mismatches.txt


In [23]:
!hdfs dfs -cat /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt | head

9d8
< ERROR: <SOFQHZM12A8C142342 TRMWMFG128F92FFEF2> Josipa Lisac  -  razloga  !=  Lisac Josipa  -  1000 razloga
19d17
< ERROR: <SODXUTF12AB018A3DA TRMWPCD12903CCE5ED> Lutan Fyah  -  Nuh Matter the Crisis Feat. Midnite  !=  Midnite  -  Nah Matter the Crisis
29d26
< ERROR: <SOASCRF12A8C1372E6 TRMHIPJ128F426A2E2> Gaetano Donizetti  -  L'Elisir d'Amore: Act Two: Come sen va contento!  !=  Gianandrea Gavazzeni_ Orchestra E Coro Del Maggio Musicale Fiorentino_ Carlo Bergonzi_ Renata Scotto  -  L'Elisir D'Amore_ Act 2: Come Sen Va Contento (Adina) (Donizetti)
33d29
< ERROR: <SOITDUN12A58A7AACA TRMHXGK128F42446AB> C.J. Chenier  -  Ay, Ai Ai  !=  Clifton Chenier  -  Ay_ Ai Ai
52d47
< ERROR: <SOLZXUM12AB018BE39 TRMRSOF12903CCF516> 許志安  -  男人最痛  !=  Andy Hui  -  Nan Ren Zui Tong
cat: Unable to write to output stream.


In [6]:
!hdfs fsck /data/msd/tasteprofile/mismatches/ -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=dcp31&files=1&blocks=1&path=%2Fdata%2Fmsd%2Ftasteprofile%2Fmismatches
FSCK started by dcp31 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/tasteprofile/mismatches at Tue Jun 04 08:15:20 NZST 2024

/data/msd/tasteprofile/mismatches <dir>
/data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt 91342 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761915_21093 len=91342 Live_repl=8

/data/msd/tasteprofile/mismatches/sid_mismatches.txt 2026182 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761914_21092 len=2026182 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			1
 Total symlinks:		0

Replicated Blocks:
 Total size:	2117524 B
 Total files:	2
 Total blocks (validated):	2 (avg. block size 1058762 B)
 Minimally replicated blocks:	2 (100.0 %)
 Over-replicated blocks

In [240]:
!hdfs dfs -count /data/msd/tasteprofile/triplets.tsv/
!hdfs dfs -ls /data/msd/tasteprofile/triplets.tsv/

           1            8          512139195 /data/msd/tasteprofile/triplets.tsv
Found 8 items
-rwxr-xr-x   8 jsw93 supergroup   64020759 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00000.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   64038083 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00001.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   64077499 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00002.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   64102442 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00003.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   63998697 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00004.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   64049032 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00005.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   64064101 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00006.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   63788582 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00007.

In [22]:
!hdfs dfs -cat /data/msd/tasteprofile/triplets.tsv/part-00000.tsv.gz | gunzip | head

b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOAKIMP12A8C130995	1
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOAPDEY12A81C210A9	1
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOBBMDR12A8C13253B	2
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOBFNSP12AF72A0E22	1
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOBFOVM12A58A7D494	1
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOBNZDC12A6D4FC103	1
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOBSUJE12A6D4F8CF5	2
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOBVFZR12A6D4F8AE3	1
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOBXALG12A8C13C108	1
b80344d063b5ccb3212f76538f3d9e43d87dca9e	SOBXHDL12A81C204C0	1

gzip: stdout: Broken pipe
cat: Unable to write to output stream.


In [5]:
# How many blocks?
!hdfs fsck /data/msd/tasteprofile/triplets.tsv/ -files -blocks

Connecting to namenode via http://masternode2:9870/fsck?ugi=dcp31&files=1&blocks=1&path=%2Fdata%2Fmsd%2Ftasteprofile%2Ftriplets.tsv
FSCK started by dcp31 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/tasteprofile/triplets.tsv at Tue Jun 04 08:13:16 NZST 2024

/data/msd/tasteprofile/triplets.tsv <dir>
/data/msd/tasteprofile/triplets.tsv/part-00000.tsv.gz 64020759 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761910_21088 len=64020759 Live_repl=8

/data/msd/tasteprofile/triplets.tsv/part-00001.tsv.gz 64038083 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761912_21090 len=64038083 Live_repl=8

/data/msd/tasteprofile/triplets.tsv/part-00002.tsv.gz 64077499 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761911_21089 len=64077499 Live_repl=8

/data/msd/tasteprofile/triplets.tsv/part-00003.tsv.gz 64102442 

### Question 1 (b)

Loading datasets and counting rows.
Counting unique songs.

In [5]:
# Row count of the metadata summary. Schema inference is not requested because it needs an extra pass over the
# file and the row count does not depend on the column types.
df = spark.read.csv("hdfs:///data/msd/main/summary/metadata.csv.gz", header=True)

row_count = df.count()

print(row_count)

1000000


In [6]:
# Number of distinct song_id values in the dataframe loaded in the previous cell
df.select("song_id").distinct().count()

998963

In [7]:
# Row count of the analysis summary. Schema inference is not requested because it needs an extra pass over the
# file and the row count does not depend on the column types.
df = spark.read.csv("hdfs:///data/msd/main/summary/analysis.csv.gz", header=True)

row_count = df.count()

print(row_count)

1000000


In [8]:
# Number of distinct track_id values in the dataframe loaded in the previous cell
df.select("track_id").distinct().count() # analysis refers to tracks not songs

1000000

In [13]:
# Combined row count over every attribute file matched by the glob (no schema inference is needed for a count)
df = spark.read.csv("hdfs:///data/msd/audio/attributes/*.csv", header=False)

row_count = df.count()

print(row_count)

3916


In [21]:
# Row count of a single attribute file (no schema inference is needed for a count)
df = spark.read.csv("hdfs:///data/msd/audio/attributes/msd-rh-v1.0.attributes.csv", header=False)

row_count = df.count()

print(row_count)

61


In [20]:
# Combined row count over every feature dataset matched by the glob. Schema inference is deliberately skipped:
# it would add an extra pass over all of the feature files and the row count does not depend on column types.
df = spark.read.csv("hdfs:///data/msd/audio/features/*.csv", header=False)

row_count = df.count()

print(row_count)

12927867


In [19]:
# Count the rows of a single features file (msd-jmir-lpc-all-v1.0).
# Another file name noted by the author for this check:
#   msd-jmir-area-of-moments-all-v1.0.csv
# Schema inference is not requested: it needs an extra pass over the data and
# has no effect on a row count.

df = spark.read.csv("hdfs:////data/msd/audio/features/msd-jmir-lpc-all-v1.0.csv", header=False)

row_count = df.count()

print(row_count)

994623


In [25]:
# Count the rows of the audio sample-properties statistics file.
# header=True stops the header line from being counted as data; schema
# inference is not requested because it needs an extra pass over the data and
# has no effect on a row count.
df = spark.read.csv("hdfs:////data/msd/audio/statistics/sample_properties.csv.gz", header=True)

row_count = df.count()

print(row_count)

992865


In [30]:
# Count the rows across all tab-separated taste-profile triplet parts.
# Schema inference is not requested: it would decompress and scan every gzip
# part a second time and has no effect on a row count.
df = spark.read.csv("hdfs:////data/msd/tasteprofile/triplets.tsv/*.tsv.gz", sep='\t', header=False)

row_count = df.count()

print(row_count)

48373586


In [31]:
# Count the lines across the taste-profile mismatch text files; each line of
# the matched files becomes one row of the resulting frame.
mismatch_glob = "hdfs:////data/msd/tasteprofile/mismatches/*.txt"
df = spark.read.text(mismatch_glob)
row_count = df.count()
print(row_count)

20032


In [34]:
# Count the rows across every tab-separated genre assignment file.
# Schema inference is not requested: it needs an extra pass over the data and
# has no effect on a row count.
df = spark.read.csv("hdfs:////data/msd/genre/*.tsv", sep='\t', header=False)

row_count = df.count()

print(row_count)

1103077


In [None]:
# Count the rows of the MAGD genre assignment file only.
# Schema inference is not requested: it needs an extra pass over the data and
# has no effect on a row count.
df = spark.read.csv("hdfs:////data/msd/genre/msd-MAGD-genreAssignment.tsv", sep='\t', header=False)

row_count = df.count()

print(row_count)

### Question 2 (b)

In [10]:
# load attributes

# Read one attributes file (no header row) with inferred column types and
# preview it. The built-in CSV reader is used, as in the other cells of this
# notebook, instead of the legacy "com.databricks.spark.csv" format alias.
attributes = spark.read.csv(
    "hdfs:////data/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv",
    header=False,
    inferSchema=True,
)

show_as_html(attributes)

Unnamed: 0,_c0,_c1
0,Spectral_Centroid_Overall_Standard_Deviation_1,real
1,Spectral_Rolloff_Point_Overall_Standard_Deviat...,real
2,Spectral_Flux_Overall_Standard_Deviation_1,real
3,Compactness_Overall_Standard_Deviation_1,real
4,Spectral_Variability_Overall_Standard_Deviation_1,real
5,Root_Mean_Square_Overall_Standard_Deviation_1,real
6,Fraction_Of_Low_Energy_Windows_Overall_Standar...,real
7,Zero_Crossings_Overall_Standard_Deviation_1,real
8,Spectral_Centroid_Overall_Average_1,real
9,Spectral_Rolloff_Point_Overall_Average_1,real


In [11]:
# load features

# Read one features file (no header row) with inferred column types, keep the
# first 10 rows and preview them. The built-in CSV reader is used, as in the
# other cells of this notebook, instead of the legacy
# "com.databricks.spark.csv" format alias.
# NOTE(review): this is the spectral-derivatives dataset, whereas the
# attributes cell loads msd-jmir-spectral-all-all-v1.0 — confirm that this
# pairing is intended.
features = (
    spark.read.csv(
        "hdfs:////data/msd/audio/features/msd-jmir-spectral-derivatives-all-all-v1.0.csv",
        header=False,
        inferSchema=True,
    )
    .limit(10)
)

show_as_html(features)

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15,_c16
0,7.928,0.07893,0.001245,222.2,0.001429,0.05438,0.05324,22.35,12.81,0.09207,0.000914,1682.0,0.003026,0.1199,0.5313,38.15,'TRHFHQZ12903C9E2D5'
1,8.501,0.07007,0.005855,200.6,0.003042,0.09163,0.05096,21.18,7.432,0.05245,0.003384,1570.0,0.004289,0.1532,0.5988,25.07,'TRHFHYX12903CAF953'
2,5.101,0.04946,0.007952,241.3,0.002879,0.08716,0.03366,13.13,9.995,0.07575,0.01031,1455.0,0.008896,0.3404,0.5227,34.82,'TRHFHAU128F9341A0E'
3,8.101,0.06402,0.002458,238.5,0.002335,0.08902,0.06764,18.71,15.35,0.102,0.001901,1712.0,0.004152,0.1649,0.5467,41.47,'TRHFHLP128F14947A7'
4,7.226,0.05985,0.005215,194.7,0.002057,0.05784,0.04056,15.88,12.98,0.1094,0.008331,1595.0,0.008042,0.3087,0.5067,39.75,'TRHFHFF128F930AC11'
5,4.304,0.03282,0.001262,279.3,0.002383,0.08844,0.07417,10.88,7.721,0.04463,0.001093,1778.0,0.004259,0.1622,0.5364,18.54,'TRHFHYJ128F4234782'
6,2.724,0.02075,0.001779,203.1,0.001305,0.0453,0.05082,9.718,5.263,0.02806,0.000893,1748.0,0.002865,0.1078,0.5391,19.3,'TRHFHHR128F9339010'
7,15.66,0.09097,0.000516,178.1,0.001069,0.03922,0.08063,33.94,9.158,0.05251,0.000295,1621.0,0.001542,0.05869,0.5568,23.59,'TRHFHMB128F4213BC9'
8,2.161,0.01658,0.003491,239.0,0.0018,0.05721,0.04387,7.207,3.613,0.02298,0.002387,1676.0,0.004306,0.1585,0.5359,10.82,'TRHFHWT128F429032D'
9,8.862,0.07809,0.005187,218.2,0.003705,0.1118,0.05035,23.79,7.212,0.05154,0.003541,1547.0,0.006084,0.212,0.554,24.34,'TRHFHKO12903CBAF09'


In [4]:
# Taken from help session 3.
# Maps a type name (presumably as written in the attribute files — verify) to
# the Spark SQL data type used for that column.

lookup = dict(
    real=DoubleType(),
    NUMERIC=DoubleType(),
    float=DoubleType(),
    string=StringType(),
    STRING=StringType(),
)

In [5]:
# Taken from help session 3.
# Pick the dataset to work with, then read its attribute listing as two string
# columns (attribute name and attribute type) and print it untruncated.

name = 'msd-jmir-spectral-all-all-v1.0'

metadata_schema = StructType(
    [StructField(column, StringType()) for column in ("name", "type")]
)
metadata = spark.read.schema(metadata_schema).csv(f'/data/msd/audio/attributes/{name}.attributes.csv')

metadata.show(truncate=False)

+-----------------------------------------------------------+------+
|name                                                       |type  |
+-----------------------------------------------------------+------+
|Spectral_Centroid_Overall_Standard_Deviation_1             |real  |
|Spectral_Rolloff_Point_Overall_Standard_Deviation_1        |real  |
|Spectral_Flux_Overall_Standard_Deviation_1                 |real  |
|Compactness_Overall_Standard_Deviation_1                   |real  |
|Spectral_Variability_Overall_Standard_Deviation_1          |real  |
|Root_Mean_Square_Overall_Standard_Deviation_1              |real  |
|Fraction_Of_Low_Energy_Windows_Overall_Standard_Deviation_1|real  |
|Zero_Crossings_Overall_Standard_Deviation_1                |real  |
|Spectral_Centroid_Overall_Average_1                        |real  |
|Spectral_Rolloff_Point_Overall_Average_1                   |real  |
|Spectral_Flux_Overall_Average_1                            |real  |
|Compactness_Overall_Average_1    

In [6]:
# Taken from help session 3.
# Build the "simple" schema, chosen for readability: short generated names
# F000, F001, ... for the double-valued feature columns, followed by a string
# `id` column.

# One feature column per attribute row except the last (presumably the last
# attribute row describes the id column — verify against the attributes file).
feature_count = metadata.count() - 1
feature_fields = [
    StructField(f"F{index:03d}", DoubleType(), True)
    for index in range(feature_count)
]
id_field = StructField("id", StringType(), True)

schema_simple = StructType(feature_fields + [id_field])

# quote="'" strips the single quotes that surround the id values in the file.
audio_features = spark.read.csv(
    f'/data/msd/audio/features/{name}.csv',
    schema=schema_simple,
    quote="'",
)

show_as_html(audio_features)

Unnamed: 0,F000,F001,F002,F003,F004,F005,F006,F007,F008,F009,F010,F011,F012,F013,F014,F015,id
0,7.928,0.07893,0.001245,222.2,0.001429,0.05438,0.05324,22.35,12.81,0.09207,0.000914,1682.0,0.003026,0.1199,0.5313,38.15,TRHFHQZ12903C9E2D5
1,8.501,0.07007,0.005855,200.6,0.003042,0.09163,0.05096,21.18,7.432,0.05245,0.003384,1570.0,0.004289,0.1532,0.5988,25.07,TRHFHYX12903CAF953
2,5.101,0.04946,0.007952,241.3,0.002879,0.08716,0.03366,13.13,9.995,0.07575,0.01031,1455.0,0.008896,0.3404,0.5227,34.82,TRHFHAU128F9341A0E
3,8.101,0.06402,0.002458,238.5,0.002335,0.08902,0.06764,18.71,15.35,0.102,0.001901,1712.0,0.004152,0.1649,0.5467,41.47,TRHFHLP128F14947A7
4,7.226,0.05985,0.005215,194.7,0.002057,0.05784,0.04056,15.88,12.98,0.1094,0.008331,1595.0,0.008042,0.3087,0.5067,39.75,TRHFHFF128F930AC11
5,4.304,0.03282,0.001262,279.3,0.002383,0.08844,0.07417,10.88,7.721,0.04463,0.001093,1778.0,0.004259,0.1622,0.5364,18.54,TRHFHYJ128F4234782
6,2.724,0.02075,0.001779,203.1,0.001305,0.0453,0.05082,9.718,5.263,0.02806,0.000893,1748.0,0.002865,0.1078,0.5391,19.3,TRHFHHR128F9339010
7,15.66,0.09097,0.000516,178.1,0.001069,0.03922,0.08063,33.94,9.158,0.05251,0.000295,1621.0,0.001542,0.05869,0.5568,23.59,TRHFHMB128F4213BC9
8,2.161,0.01658,0.003491,239.0,0.0018,0.05721,0.04387,7.207,3.613,0.02298,0.002387,1676.0,0.004306,0.1585,0.5359,10.82,TRHFHWT128F429032D
9,8.862,0.07809,0.005187,218.2,0.003705,0.1118,0.05035,23.79,7.212,0.05154,0.003541,1547.0,0.006084,0.212,0.554,24.34,TRHFHKO12903CBAF09


In [9]:
# Calls stop_spark(), which is defined in the first cell of this notebook.
# Run this cell before closing the notebook; otherwise kill your Spark
# application by hand using the link in the Spark UI.

stop_spark()