In [1]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col
import h5py

In [2]:
# Spark session build
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.130:7077") \
        .appName("de16_sparky_read_h5_to_csv")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","300s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .config("spark.hadoop.fs.defaultFS", "hdfs://192.168.2.130:9000")\
        .getOrCreate()

sc = spark_session.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/07 19:37:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Structure of the final DataFrame
columns = StructType([
    StructField('artist_name', StringType(), nullable=True),
    StructField('danceability', DoubleType(), nullable=True),
    StructField('duration', DoubleType(), nullable=True),
    StructField('end_of_fade_in', DoubleType(), nullable=True),
    StructField('energy', DoubleType(), nullable=True),
    StructField('key', IntegerType(), nullable=True),
    StructField('key_confidence', DoubleType(), nullable=True),
    StructField('loudness', DoubleType(), nullable=True),
    StructField('mode', IntegerType(), nullable=True),
    StructField('mode_confidence', DoubleType(), nullable=True),
    StructField('release', StringType(), nullable=True),
    StructField('song_hotttnesss', DoubleType(), nullable=True),
    StructField('song_id', StringType(), nullable=True),
    StructField('start_of_fade_out', DoubleType(), nullable=True),
    StructField('tempo', DoubleType(), nullable=True),
    StructField('time_signature', IntegerType(), nullable=True),
    StructField('time_signature_confidence', DoubleType(), nullable=True),
    StructField('title', StringType(), nullable=True),
    StructField('year', IntegerType(), nullable=True)
])

In [4]:
import io

# Read relevant content of h5 file
def read_h5_content(path):
    binary = spark_session.read.format("binaryFile").load(path)
    content = io.BytesIO(binary.first()['content'])
    with h5py.File(content, 'r') as file:
        data = {
            'artist_name': str(file['metadata']['songs'][0][9])[2:-1],
            'danceability': float(file['analysis']['songs'][0][2]),
            'duration': float(file['analysis']['songs'][0][3]),
            'end_of_fade_in': float(file['analysis']['songs'][0][4]),
            'energy': float(file['analysis']['songs'][0][5]),
            'key': int(file['analysis']['songs'][0][21]),
            'key_confidence': float(file['analysis']['songs'][0][22]),
            'loudness': float(file['analysis']['songs'][0][23]),
            'mode': int(file['analysis']['songs'][0][24]),
            'mode_confidence': float(file['analysis']['songs'][0][25]),
            'release': str(file['metadata']['songs'][0][14])[2:-1],
            'song_hotttnesss': float(file['metadata']['songs'][0][16]),
            'song_id': str(file['metadata']['songs'][0][17])[2:-1],
            'start_of_fade_out': float(file['analysis']['songs'][0][26]),
            'tempo': float(file['analysis']['songs'][0][27]),
            'time_signature': int(file['analysis']['songs'][0][28]),
            'time_signature_confidence': float(file['analysis']['songs'][0][29]),
            'title': str(file['metadata']['songs'][0][18])[2:-1],
            'year': int(file['musicbrainz']['songs'][0][1])
        }
    return data

In [6]:
import time

# Get paths to all h5 files in the data set
def list_h5_files(path):
    hadoop_conf = sc._jsc.hadoopConfiguration()
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    files = []
    status = fs.listStatus(sc._jvm.org.apache.hadoop.fs.Path(path))
    for file_status in status:
        file_path = file_status.getPath()
        if file_status.isDirectory():
            files.extend(list_h5_files(file_path.toString()))
        elif file_path.toString().endswith(".h5"):
            files.append(file_path.toString())
    return files

base_directory = "hdfs://192.168.2.130:9000/user/MillionSongSubset"
start_time = time.time()

print('Getting paths...')
song_paths = list_h5_files(base_directory)

print('Extracting data...')
extracted_data = [read_h5_content(path) for path in song_paths]

print('Converting to rows...')
rows = [Row(**data) for data in extracted_data]

print('Converting to DataFrame')
df = spark_session.createDataFrame(rows, schema=columns)
print(f'Time spent reading data: {time.time()-start_time}')

# Display DataFrame to verify
df.show()
df.printSchema()
print(df.count())
print(df.rdd.getNumPartitions())

# Write DataFrame to csv on HDFS
df.write.mode("overwrite").csv("hdfs://192.168.2.130:9000/user/MillionSongSubset.csv")

Getting paths...
Extracting data...


                                                                                

Converting to rows...
Converting to DataFrame
Time spent reading data: 1030.6085057258606
+--------------------+------------+---------+--------------+------+---+--------------+--------+----+---------------+--------------------+-------------------+------------------+-----------------+-------+--------------+-------------------------+--------------------+----+
|         artist_name|danceability| duration|end_of_fade_in|energy|key|key_confidence|loudness|mode|mode_confidence|             release|    song_hotttnesss|           song_id|start_of_fade_out|  tempo|time_signature|time_signature_confidence|               title|year|
+--------------------+------------+---------+--------------+------+---+--------------+--------+----+---------------+--------------------+-------------------+------------------+-----------------+-------+--------------+-------------------------+--------------------+----+
|              Casual|         0.0|218.93179|         0.247|   0.0|  1|         0.736| -11.197|   0|

24/03/07 20:37:58 ERROR TransportResponseHandler: Still have 1 requests outstanding when connection from /192.168.2.142:51656 is closed
24/03/07 20:37:58 WARN BlockManagerMasterEndpoint: Error trying to remove broadcast 39365 from block manager BlockManagerId(0, 192.168.2.142, 10005, None)
java.io.IOException: Connection from /192.168.2.142:51656 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:147)
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:305)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:281)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:274)
	at io.netty.channel.ChannelInboundHandlerAdapter.channelInactive(ChannelInboundHandl

In [7]:
# Read DataFrame from csv file in HDFS to verify
df = spark_session.read.csv("hdfs://192.168.2.130:9000/user/MillionSongSubset.csv")
df.show()
df.printSchema()
print(df.count())
print(df.rdd.getNumPartitions())

                                                                                

+--------------------+---+---------+-----+---+---+-----+-------+---+-----+--------------------+-------------------+------------------+-------+-------+----+-----+--------------------+----+
|                 _c0|_c1|      _c2|  _c3|_c4|_c5|  _c6|    _c7|_c8|  _c9|                _c10|               _c11|              _c12|   _c13|   _c14|_c15| _c16|                _c17|_c18|
+--------------------+---+---------+-----+---+---+-----+-------+---+-----+--------------------+-------------------+------------------+-------+-------+----+-----+--------------------+----+
|              Casual|0.0|218.93179|0.247|0.0|  1|0.736|-11.197|  0|0.636|         Fear Itself| 0.6021199899057548|SOMZWCG12A8C13C480|218.932| 92.198|   4|0.778|    I Didn't Mean To|   0|
|        The Box Tops|0.0|148.03546|0.148|0.0|  6|0.169| -9.843|  0| 0.43|          Dimensions|                NaN|SOCIWDW12A8C13D406|137.915|121.274|   4|0.384|           Soul Deep|1969|
|    Sonora Santanera|0.0|177.47546|0.282|0.0|  8|0.643| -9.

In [8]:
spark_session.stop()