In [1]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col

import h5py
import numpy as np
import pandas as pd
import glob

In [None]:
# Build (or reuse) the Spark session for this notebook. The builder chain is
# wrapped in parentheses so no line-continuation backslashes are needed.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.130:7077")
    .appName("de16_sparky_olle")
    # Dynamic allocation with shuffle tracking; the external shuffle
    # service is explicitly switched off.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.executor.cores", 2)
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    # Default Hadoop filesystem used by this session.
    .config("spark.hadoop.fs.defaultFS", "hdfs://192.168.2.130:9000")
    .getOrCreate()
)

# SparkContext of the session; used later for JVM / Hadoop access.
sc = spark_session.sparkContext

In [None]:
# Schema of one song record. Each (column name, Spark type) pair below becomes
# a nullable StructField, in exactly this order.
columns = StructType([
    StructField(field_name, field_type(), nullable=True)
    for field_name, field_type in (
        ('artist_name', StringType),
        ('danceability', DoubleType),
        ('duration', DoubleType),
        ('end_of_fade_in', DoubleType),
        ('energy', DoubleType),
        ('key', IntegerType),
        ('key_confidence', DoubleType),
        ('loudness', DoubleType),
        ('mode', IntegerType),
        ('mode_confidence', DoubleType),
        ('release', StringType),
        ('song_hotttnesss', DoubleType),
        ('song_id', StringType),
        ('start_of_fade_out', DoubleType),
        ('tempo', DoubleType),
        ('time_signature', IntegerType),
        ('time_signature_confidence', DoubleType),
        ('title', StringType),
        ('year', IntegerType),
    )
])

def init_df():
    """Create an empty Spark DataFrame that uses the ``columns`` schema.

    Returns:
        A DataFrame with zero rows, built by ``spark_session.createDataFrame``.
    """
    return spark_session.createDataFrame([], columns)

In [None]:
import io

def _decode_text(value):
    """Return an HDF5 string field as ``str``, decoding raw bytes as UTF-8."""
    # Fixed-length HDF5 strings arrive as bytes; calling str() on them would
    # store the literal repr "b'...'" instead of the text itself.
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='replace')
    return str(value)


def add_song(df, path):
    """Append the song stored in one HDF5 file to ``df``.

    The file is loaded through Spark's ``binaryFile`` reader, opened in memory
    with h5py, and the first row of its ``analysis``, ``metadata`` and
    ``musicbrainz`` ``songs`` tables is converted into a single-row DataFrame
    with the ``columns`` schema.

    Args:
        df: DataFrame (with the ``columns`` schema) to append to.
        path: Location of the ``.h5`` file, as accepted by the Spark reader.

    Returns:
        ``df`` unioned with the new song, repartitioned into 2 partitions.

    Raises:
        ValueError: If the reader returns no file for ``path``.
    """
    binary = spark_session.read.format("binaryFile").load(path)
    first_file = binary.first()
    if first_file is None:
        raise ValueError("No file content found at {!r}".format(path))

    content = io.BytesIO(first_file['content'])
    with h5py.File(content, 'r') as file:
        # Read each 'songs' row once instead of once per extracted field.
        analysis = file['analysis']['songs'][0]
        metadata = file['metadata']['songs'][0]
        musicbrainz = file['musicbrainz']['songs'][0]

        # Fields are addressed by position inside each row.
        # NOTE(review): the positions presumably follow the Million Song
        # Dataset field order - confirm against the files' dtype names.
        # Keyword order matches the column order of the ``columns`` schema.
        new_song_values = Row(
            artist_name=_decode_text(metadata[9]),
            danceability=float(analysis[2]),
            duration=float(analysis[3]),
            end_of_fade_in=float(analysis[4]),
            energy=float(analysis[5]),
            key=int(analysis[21]),
            key_confidence=float(analysis[22]),
            loudness=float(analysis[23]),
            mode=int(analysis[24]),
            mode_confidence=float(analysis[25]),
            release=_decode_text(metadata[14]),
            song_hotttnesss=float(metadata[16]),
            song_id=_decode_text(metadata[17]),
            start_of_fade_out=float(analysis[26]),
            tempo=float(analysis[27]),
            time_signature=int(analysis[28]),
            time_signature_confidence=float(analysis[29]),
            title=_decode_text(metadata[18]),
            year=int(musicbrainz[1]),
        )

    new_song = spark_session.createDataFrame([new_song_values], columns)
    # NOTE(review): every call adds a union and a repartition on top of ``df``,
    # so the plan grows with each appended song.
    return df.union(new_song).repartition(2)

In [None]:
# Start from an empty DataFrame with the song schema; songs are appended to it.
df = init_df()

def list_h5_files(path):
    """Recursively collect the paths of all ``.h5`` files below ``path``.

    The listing goes through the Hadoop ``FileSystem`` API reached via the
    SparkContext's JVM gateway.

    Args:
        path: Directory to scan, as a string.

    Returns:
        A list of path strings ending in ``.h5``, in listing order, with the
        contents of each subdirectory inserted where that directory appears.
    """
    hadoop_fs = sc._jvm.org.apache.hadoop.fs
    filesystem = hadoop_fs.FileSystem.get(sc._jsc.hadoopConfiguration())

    h5_paths = []
    for entry in filesystem.listStatus(hadoop_fs.Path(path)):
        entry_uri = entry.getPath().toString()
        if entry.isDirectory():
            # Descend into the subdirectory and keep its matches in place.
            h5_paths += list_h5_files(entry_uri)
        elif entry_uri.endswith(".h5"):
            h5_paths.append(entry_uri)
    return h5_paths

# Root directory that is scanned recursively for .h5 song files.
base_directory = "hdfs://192.168.2.130:9000/user/MillionSongSubset"
song_paths = list_h5_files(base_directory)

# Number of songs to ingest in this run (the original note says: max 10000).
max_songs = 10

# Iterate over a slice instead of indexing with range(): if fewer than
# ``max_songs`` files were found, this ingests what exists rather than
# failing with an IndexError.
for i, song_path in enumerate(song_paths[:max_songs]):
    print(i)  # progress: index of the song being added
    df = add_song(df, song_path)

# Inspect the result: sample rows, schema, row count and partition count.
df.show()
df.printSchema()
print(df.count())
print(df.rdd.getNumPartitions())

In [None]:
# Uncomment to stop the Spark session when you are done with the notebook.
#spark_session.stop()