In [1]:
import pandas as pd
import plotly.express as px
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook; "BD" is the application name.
spark = (
    SparkSession.builder
    .appName("BD")
    .getOrCreate()
)

# Pipeline imports
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler, MinMaxScaler, PCA
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Load the training CSV (semicolon-separated, header row, inferred column types,
# empty fields read as null), replace null `tempo` values with 0, then drop
# every row that still contains a null.
data = (
    spark.read
    .csv("../data/training.csv", header=True, inferSchema=True, nullValue="", sep=";")
    .fillna(0, subset=["tempo"])
    .dropna()
)

# 90/10 development/test split with a fixed seed.
dev, test = data.randomSplit([0.9, 0.1], seed=12345)

# Column roles used by the modelling cells below.
label = ["music_genre"]
categorical_features = ['key', 'mode']
numerical_features = [
    'popularity',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

In [29]:
# Preprocessing + classifier pipeline.
# Map the categorical features and the label to numeric category indices ("<col>_index").
indexed_cols = categorical_features + label
indexer = StringIndexer(inputCols=indexed_cols, outputCols=[col + "_index" for col in indexed_cols])
# The category indices go straight into the feature vector next to the numerical
# columns (no one-hot encoding stage is used).
assembler = VectorAssembler(inputCols=[col + "_index" for col in categorical_features] + numerical_features, outputCol="features")
standard = StandardScaler(inputCol="features", outputCol="scaled_features")
# Fixed seed so the network's weight initialisation is repeatable between runs
# (same value as the seed used for the dev/test split).
estimator = MultilayerPerceptronClassifier(featuresCol="scaled_features", labelCol="music_genre_index", seed=12345)
pipeline = Pipeline(stages=[indexer, assembler, standard, estimator])

# Hyper-parameter grid, evaluator and cross-validator.
# The input layer width is read from the assembler instead of being hard-coded,
# so it follows the feature lists automatically. This assumes every assembler
# input is a single scalar column (true for the indexed and numerical columns
# configured in this notebook).
n_inputs = len(assembler.getInputCols())
# 10 output units: presumably one per music_genre class; verify against the data.
params = (ParamGridBuilder()
    .addGrid(estimator.layers, [[n_inputs, 20, 20, 20, 10]])
    .addGrid(estimator.maxIter, [2000])
    .addGrid(estimator.blockSize, [512])
    .build())
evaluator = MulticlassClassificationEvaluator(labelCol="music_genre_index", predictionCol="prediction", metricName="f1")
# Seed the fold assignment so cross-validation results are reproducible.
cross = CrossValidator(estimator=pipeline, estimatorParamMaps=params, evaluator=evaluator, numFolds=5, seed=12345)

# Run the cross-validated fit on the development split.
model = cross.fit(dataset=dev)

# Score the fitted model on the held-out test split and report its f1.
predictions = model.transform(dataset=test)
f1 = evaluator.evaluate(dataset=predictions)
print(f"f1: {f1}")

f1: 0.5741793905773369


In [23]:
# Report the hyper-parameters of the winning pipeline's final stage (the classifier).
best_model = model.bestModel
best_estimator = best_model.stages[-1]
for param_name, getter in (
    ("layers", best_estimator.getLayers),
    ("maxIter", best_estimator.getMaxIter),
    ("blockSize", best_estimator.getBlockSize),
):
    print(f"best {param_name}: {getter()}")

best layers: [13, 20, 10]
best maxIter: 100
best blockSize: 512


In [21]:
# Print the average cross-validation metric for each parameter map.
# `avgMetrics` holds evaluator scores averaged over the folds (f1 here), one
# entry per param map; it is not a timing measurement.
print(f"avg cross-validation f1 per param map: {model.avgMetrics}")


training time: [0.5708128838259379]
