In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
import time

In [None]:
num_cores = 1  # 1-4

# Spark session build.
# All tunables are collected in one mapping and applied in insertion order,
# which is the same order the options were originally chained in.
spark_options = {
    "spark.dynamicAllocation.enabled": False,
    "spark.dynamicAllocation.shuffleTracking.enabled": True,
    "spark.shuffle.service.enabled": False,
    "spark.dynamicAllocation.executorIdleTimeout": "300s",
    # One single-core executor per requested core, capped at num_cores in total
    "spark.executor.instances": num_cores,
    "spark.executor.cores": 1,
    "spark.cores.max": num_cores,
    "spark.default.parallelism": num_cores,
    # Fixed driver / block-manager ports
    "spark.driver.port": 9999,
    "spark.blockManager.port": 10005,
    "spark.hadoop.fs.defaultFS": "hdfs://192.168.2.130:9000",
}

session_builder = (
    SparkSession.builder
    .master("spark://192.168.2.130:7077")
    .appName("de16_sparky_loudness_linreg")
)
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark_session = session_builder.getOrCreate()

In [None]:
# Structure of the DataFrame.
# Each entry is (column name, Spark type class); every column is nullable.
# The order of the entries is the order of the fields in the resulting schema.
column_spec = [
    ('artist_name', StringType),
    ('danceability', DoubleType),
    ('duration', DoubleType),
    ('end_of_fade_in', DoubleType),
    ('energy', DoubleType),
    ('key', IntegerType),
    ('key_confidence', DoubleType),
    ('loudness', DoubleType),
    ('mode', IntegerType),
    ('mode_confidence', DoubleType),
    ('release', StringType),
    ('song_hotttnesss', DoubleType),
    ('song_id', StringType),
    ('start_of_fade_out', DoubleType),
    ('tempo', DoubleType),
    ('time_signature', IntegerType),
    ('time_signature_confidence', DoubleType),
    ('title', StringType),
    ('year', IntegerType),
]

columns = StructType([
    StructField(field_name, field_type(), nullable=True)
    for field_name, field_type in column_spec
])


# Read data from csv file on HDFS
start_time1 = time.time()

csv_path = "hdfs://192.168.2.130:9000/user/MillionSongSubset.csv"
# Number of copies of the subset in the final DataFrame; the file is unioned
# with itself to replicate the subset for different tests.
replication_factor = 100

df = spark_session.read.csv(csv_path, header=False, schema=columns)
for _ in range(replication_factor - 1):
    df = df.union(spark_session.read.csv(csv_path, header=False, schema=columns))

# Repartition and filter (keep only rows whose year is not 0)
df = df.repartition(num_cores)
filtered_df = df.filter(col("year") != 0)

# Spark transformations are lazy: read/union/repartition/filter above only
# build a query plan. Run an action *inside* the timed region so the reported
# time reflects actual loading and filtering rather than plan construction.
# The result is reused below so the count job is not executed a second time.
row_count = filtered_df.count()

print(f"Data loading/filtering time: {time.time() - start_time1}")

# Check that the loaded data looks correct
print(f'Partitions: {df.rdd.getNumPartitions()}')
print(f'Count: {row_count}')
filtered_df.show()

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Start the clock for feature assembly + model fitting
start_time2 = time.time()

# Wrap the single predictor column ("year") into the "features" vector column
assembler = VectorAssembler(outputCol="features", inputCols=["year"])
assembled_df = assembler.transform(filtered_df)

# Fit a linear regression of "loudness" on the assembled features
lr = LinearRegression(labelCol="loudness", featuresCol="features")
model = lr.fit(assembled_df)

training_seconds = time.time() - start_time2
print(f"Model training time: {training_seconds}")

# Keep the fitted line as [slope, intercept] for later use
slope = model.coefficients[0]
intercept = model.intercept
coefficients = [slope, intercept]
print("Coefficients of linear fit:", coefficients)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

start_time3 = time.time()

# Collect data for scatter plot.
# Both columns are fetched with a single take() so that each (year, loudness)
# pair is guaranteed to come from the same row; two independent take() calls
# give no such pairing guarantee and launch two Spark jobs instead of one.
sample_rows = filtered_df.select("year", "loudness").take(10000)
feature_data = np.array([row["year"] for row in sample_rows])
target_data = np.array([row["loudness"] for row in sample_rows])
plt.scatter(feature_data, target_data, 5, 'k')

# Linear fit guideline: evaluate slope * x + intercept over the sampled year range
x_values = np.linspace(feature_data.min(), feature_data.max(), 100)
y_values = coefficients[0] * x_values + coefficients[1]
plt.plot(x_values, y_values, color='red', label="Linear Regression")

# Add labels and legend
plt.xlabel("Year")
plt.ylabel("Loudness")
plt.legend()  # without this call the label passed to plt.plot is never shown
plt.show()

print(f"Plotting time: {time.time() - start_time3}")

In [None]:
# Stop the Spark session now that the analysis is finished
spark_session.stop()