In [263]:
import pyspark
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt
import numpy as np
import h5py
import pandas as pd
import glob
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

In [264]:
cores = 4
partitions = 4
spark_session = SparkSession.builder.appName("John").master(f"local[{cores}]").getOrCreate()


In [265]:
start_time = time.time()

In [266]:
def read_h5_to_row(filename):
    """
    Read a single HDF5 file and return a tuple representing a row of data.
    """
    try:
        with h5py.File(filename, 'r') as file:
            return (

                float(file['analysis']['songs'][0][23]),

                int(file['musicbrainz']['songs'][0][1]),
            )
    except Exception as e:
        print(f"Error processing file {filename}: {e}")
        return None




song_paths = glob.glob('./MillionSongSubset/*/*/*/*.h5')

# Use Spark to parallelize the file processing and read the data
song_data_rdd = spark_session.sparkContext.parallelize(song_paths[:8000]).map(read_h5_to_row)

print(f'Partitions: {song_data_rdd.getNumPartitions()}')
song_data_rdd = song_data_rdd.repartition(partitions)
print(f'Partitions: {song_data_rdd.getNumPartitions()}')

# Convert RDD to Spark DataFrame
schema = StructType([

    StructField("loudness", FloatType(), True),

    StructField("year", IntegerType(), True)
])

columns = ['loudness', 'year']
df = spark_session.createDataFrame(song_data_rdd, schema=schema)
print(f'Partitions: {df.rdd.getNumPartitions()}')

Partitions: 4
Partitions: 4
Partitions: 4


In [267]:
# song_data_rdd = song_data_rdd.map(lambda x: tuple(float(y) if isinstance(y, np.float64) else y for y in x))

# for item in song_data_rdd.take(5):
#     print(item)


In [268]:
# print(type(song_data_rdd))

In [269]:
# df = spark_session.createDataFrame(song_data_rdd, schema=schema)
# df.show()

In [270]:
# df = spark_session.read.csv("test.csv", header=True, inferSchema=True)#.cache()
# df.show(5)
filtered_df = df.filter(col("year") != 0)
# filtered_df = filtered_df.repartition(10)
print(f'Partitions: {filtered_df.rdd.getNumPartitions()}')
# df.show()

Partitions: 4


In [271]:

assembler = VectorAssembler(inputCols=["year"], outputCol="features")
assembled_df = assembler.transform(filtered_df)
# assembled_df.show(10, truncate= False)

In [272]:
lr = LinearRegression(featuresCol="features", labelCol="loudness")
model = lr.fit(assembled_df)
coefficients = [model.coefficients[0], model.intercept]

end_time = time.time()
duration = end_time - start_time


print("Coefficients:", coefficients)
print(f'execution time: {duration}')
print(f'Partitions: {assembled_df.rdd.getNumPartitions()}')
print(f'number of cores: {cores}')

Coefficients: [0.11658413690774709, -242.5665448140215]
execution time: 17.690048217773438
Partitions: 4
number of cores: 4


In [273]:


# # Assuming df is your original DataFrame with the feature and target columns
# # Assuming lr_model is your trained linear regression model

# # Extract feature and target data from DataFrame
# feature_data = filtered_df.select("year").collect()  # Replace "feature_column" with your actual feature column name
# target_data = filtered_df.select("loudness").collect()    # Replace "target_column" with your actual target column name

# # Convert to NumPy arrays
# feature_data = np.array([row[0] for row in feature_data])
# target_data = np.array([row[0] for row in target_data])

# # Plot scatter plot of feature vs. target
# plt.scatter(feature_data, target_data, 5, 'k')

# # Generate the line corresponding to linear regression
# x_values = np.linspace(min(feature_data), max(feature_data), 100)
# y_values = coefficients[0] * x_values + coefficients[1]

# # Plot the linear regression line
# plt.plot(x_values, y_values, color='red', label="Linear Regression")

# # Add labels and legend
# plt.xlabel("Year")
# plt.ylabel("Loudness")
# plt.legend()

# # Show plot
# plt.show()

In [274]:
spark_session.stop()