In [None]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col

import h5py
import numpy as np
import pandas as pd
import glob

In [None]:
# Build (or reuse) the SparkSession for this notebook.
# Dynamic allocation is enabled together with shuffle tracking, while the
# external shuffle service is explicitly disabled; idle executors are
# released after 300s. Driver and block-manager ports are pinned.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.130:7077")
    .appName("de16_sparky_ludde2")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .config("spark.hadoop.fs.defaultFS", "hdfs://192.168.2.130:9000")
    .getOrCreate()
)

# SparkContext handle, used for JVM/Hadoop access elsewhere in the notebook.
sc = spark_session.sparkContext

In [None]:
# Schema of the per-song feature table. Field order matters: rows are
# created against this schema and DataFrames are unioned by position.
# Every field is nullable.
columns = (
    StructType()
    .add('danceability', DoubleType(), nullable=True)
    .add('song_hotttnesss', DoubleType(), nullable=True)
    .add('energy', DoubleType(), nullable=True)
    .add('duration', DoubleType(), nullable=True)
    .add('key', IntegerType(), nullable=True)
    .add('loudness', DoubleType(), nullable=True)
    .add('tempo', DoubleType(), nullable=True)
    .add('time_signature', DoubleType(), nullable=True)
    .add('year', IntegerType(), nullable=True)
)

def init_df():
    """Return an empty DataFrame that carries the ``columns`` schema."""
    return spark_session.createDataFrame([], columns)

In [None]:
import io

def add_song(df, path):
    """Append the features of one HDF5 song file to ``df``.

    The file at ``path`` is loaded through Spark's ``binaryFile`` source and
    its bytes are opened in memory with h5py. Nine values are then taken, by
    position, from the first record of the ``analysis/songs``,
    ``metadata/songs`` and ``musicbrainz/songs`` datasets.

    Args:
        df: DataFrame to extend; the new row is unioned onto it by position.
        path: Location of a single ``.h5`` file readable by Spark.

    Returns:
        A new DataFrame containing the rows of ``df`` plus one row built
        against the ``columns`` schema.

    Raises:
        FileNotFoundError: If Spark finds no file under ``path``.
    """
    binary = spark_session.read.format("binaryFile").load(path)
    first_file = binary.first()
    if first_file is None:
        # first() yields None for an empty result; fail with a clear message
        # instead of "'NoneType' object is not subscriptable".
        raise FileNotFoundError(f"No file found at {path}")

    content = io.BytesIO(first_file['content'])
    with h5py.File(content, 'r') as file:
        # Read each compound record exactly once; indexing the dataset again
        # for every field would repeat the same HDF5 read nine times.
        analysis = file['analysis']['songs'][0]
        metadata = file['metadata']['songs'][0]
        musicbrainz = file['musicbrainz']['songs'][0]

    # Fields are addressed by position within each record.
    # NOTE(review): the position -> meaning mapping is taken from the keyword
    # names below; confirm it against the datasets' dtype field order.
    new_song_values = Row(
        danceability=float(analysis[2]),  # DoubleType
        song_hotttnesss=float(metadata[16]),  # DoubleType
        energy=float(analysis[5]),  # DoubleType
        duration=float(analysis[3]),  # DoubleType
        key=int(analysis[21]),  # IntegerType
        loudness=float(analysis[23]),  # DoubleType
        tempo=float(analysis[27]),  # DoubleType
        time_signature=float(analysis[28]),  # DoubleType
        year=int(musicbrainz[1])  # IntegerType
    )
    new_song = spark_session.createDataFrame([new_song_values], columns)
    return df.union(new_song)

In [None]:
# Start from an empty DataFrame with the song schema; the loading loop
# further down in this cell extends it one song at a time.
df = init_df()

def list_h5_files(path):
    """Recursively collect the paths of all ``.h5`` files below ``path``.

    The listing is done on the driver through the Hadoop FileSystem API
    exposed by the SparkContext's JVM gateway.

    Args:
        path: Directory (or file) URI/path to start from, as a string.

    Returns:
        list[str]: Fully qualified path strings ending in ``.h5``, in the
        order the filesystem listing returns them (depth-first).
    """
    hadoop_fs = sc._jvm.org.apache.hadoop.fs
    root = hadoop_fs.Path(path)
    # Resolve the filesystem from the path's own scheme/authority instead of
    # FileSystem.get(conf), which always returns the *default* filesystem and
    # rejects paths that live elsewhere. It is looked up once, not per level.
    fs = root.getFileSystem(sc._jsc.hadoopConfiguration())

    def _walk(directory):
        """Return the ``.h5`` path strings found under one Hadoop Path."""
        found = []
        for file_status in fs.listStatus(directory):
            file_path = file_status.getPath()
            if file_status.isDirectory():
                # Recurse with the Path object itself; no string round-trip.
                found.extend(_walk(file_path))
            elif file_path.toString().endswith(".h5"):
                found.append(file_path.toString())
        return found

    return _walk(root)

# Root of the dataset on HDFS and the list of every .h5 file found below it.
base_directory = "hdfs://192.168.2.130:9000/user/MillionSongSubset"
song_paths = list_h5_files(base_directory)

# Number of files to load in this cell.
max_songs = 100  # Max 10000

# Slice instead of indexing with range(max_songs): if fewer files were found
# than requested, the loop simply ends early rather than raising IndexError.
# Each add_song call runs a Spark read and adds one more union to df.
for i, song_path in enumerate(song_paths[:max_songs]):
    if i % 10 == 0:
        print(i)  # progress marker every 10 files
    df = add_song(df, song_path)

# Single example file path; not used by the statements in this cell.
path = "hdfs://192.168.2.130:9000/user/MillionSongSubset/A/A/A/TRAAAAW128F429D538.h5"

df.show()
df.printSchema()
print(df.count())
print(df.rdd.getNumPartitions())

In [None]:
import io

# ... (other code definitions)

# Parallelized file processing with mapPartitions
def process_partition(iterator):
    """Parse the HDF5 song files of one partition into schema-ordered tuples.

    This function runs on the executors, so it must not use ``spark_session``
    (a SparkSession only exists on the driver). It works purely on the file
    bytes handed to it.

    Args:
        iterator: Records of a ``binaryFile`` DataFrame; each record must
            expose the raw file bytes under the ``'content'`` key.

    Returns:
        list[tuple]: One tuple per file, with values in the same order as the
        fields of the ``columns`` schema.
    """
    song_data = []
    for record in iterator:
        with h5py.File(io.BytesIO(record['content']), 'r') as file:
            # Read each compound record once and pick fields by position.
            # NOTE(review): positions mirror those used in add_song; confirm
            # them against the datasets' dtype field order.
            analysis = file['analysis']['songs'][0]
            metadata = file['metadata']['songs'][0]
            musicbrainz = file['musicbrainz']['songs'][0]
        song_data.append((
            float(analysis[2]),    # danceability
            float(metadata[16]),   # song_hotttnesss
            float(analysis[5]),    # energy
            float(analysis[3]),    # duration
            int(analysis[21]),     # key
            float(analysis[23]),   # loudness
            float(analysis[27]),   # tempo
            float(analysis[28]),   # time_signature
            int(musicbrainz[1]),   # year
        ))
    return song_data

# Let Spark discover and read the files itself: every *.h5 file below
# base_directory (searched recursively) becomes one record whose 'content'
# column holds the file bytes, spread across the cluster.
song_files = (
    spark_session.read.format("binaryFile")
    .option("pathGlobFilter", "*.h5")
    .option("recursiveFileLookup", "true")
    .load(base_directory)
    .select("content")
)

# Parse the files where they were read; mapPartitions (unlike
# foreachPartition) returns the parsed rows as a distributed dataset.
song_data = song_files.rdd.mapPartitions(process_partition)

# Combine processed song data. Cached so the actions below do not each
# re-read and re-parse the files.
df = spark_session.createDataFrame(song_data, columns).cache()

df.show()
df.printSchema()
print(df.count())
print(df.rdd.getNumPartitions())


In [None]:
# Actually call stop(); the bare attribute reference only evaluated the bound
# method and left the session (and its executors) running.
spark_session.stop()

In [None]:
# Prints a fixed marker string.
print("HEJ")