In [39]:
import pandas as pd
import plotly.express as px
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (an existing one is reused if present);
# the application is registered under the name "BD".
spark = (
    SparkSession.builder
    .appName("BD")
    .getOrCreate()
)

# Pipeline imports
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler, MinMaxScaler, PCA
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LinearSVC, RandomForestClassifier, LogisticRegression, GBTClassifier

# Load the semicolon-separated training CSV (header row, inferred schema,
# empty strings treated as nulls), then clean it in a single chain.
data = (
    spark.read
    .csv("../data/training.csv", header=True, inferSchema=True, nullValue="", sep=";")
    .fillna(0, subset=["tempo"])  # null tempo values are replaced with 0 ...
    .dropna()                     # ... and rows with any other null are dropped
)

# 90/10 development/test split; the fixed seed keeps the split repeatable.
dev, test = data.randomSplit([0.9, 0.1], seed=12345)

# Column roles used when building the ML pipeline.
label = ["music_genre"]
categorical_features = ["key", "mode"]
numerical_features = [
    "popularity",
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
]

In [48]:
# --- Pipeline stages ---------------------------------------------------------
# Index the categorical features and the label; every output column is named
# "<input column>_index".
indexer = StringIndexer(
    inputCols=categorical_features + label,
    outputCols=[col + "_index" for col in categorical_features + label],
)
# Combine the indexed categorical columns with the raw numerical columns into
# one "features" vector.
assembler = VectorAssembler(
    inputCols=[col + "_index" for col in categorical_features] + numerical_features,
    outputCol="features",
)
standard = StandardScaler(inputCol="features", outputCol="scaled_features")
estimator = LogisticRegression(labelCol="music_genre_index", featuresCol="scaled_features")

# Only stages defined in this cell are listed. One-hot encoding and PCA are
# disabled experiments: the assembler reads the "*_index" columns and the
# estimator reads "scaled_features", so neither stage's output was consumed.
# Listing the undefined `ohe` / `pca` names here raised NameError on a fresh
# kernel (Restart & Run All).
pipeline = Pipeline(stages=[indexer, assembler, standard, estimator])

# --- Hyper-parameter grid ----------------------------------------------------
params = (ParamGridBuilder()
    .addGrid(estimator.regParam, [0.001])  # other candidates: 0.1, 0.01, 0.001, 0.0001
    .addGrid(estimator.elasticNetParam, [1.0])
    .addGrid(estimator.maxIter, [100])
    .build())
evaluator = MulticlassClassificationEvaluator(labelCol="music_genre_index", predictionCol="prediction", metricName="f1")
# Seed the fold assignment (same seed as the dev/test split) so that the
# cross-validation metrics are reproducible between runs.
cross = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
    seed=12345,
)

# build and train the pipeline
model = cross.fit(dev)

# evaluate the model on the held-out test split
predictions = model.transform(test)
f1 = evaluator.evaluate(predictions)
print(f"f1: {f1}")

f1: 0.5039532169952553


In [37]:
# Report the hyper-parameters of the winning model.
# `model.bestModel` is the fitted pipeline selected by cross-validation; its
# last stage is the fitted classifier.
best_model = model.bestModel
best_estimator = best_model.stages[-1]

for param_name, getter in (
    ("regParam", best_estimator.getRegParam),
    ("elasticNetParam", best_estimator.getElasticNetParam),
    ("maxIter", best_estimator.getMaxIter),
):
    print(f"{param_name}: {getter()}")

regParam: 0.001
elasticNetParam: 1.0
maxIter: 100
