In [11]:
import pandas as pd
import plotly.express as px
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName("BD").getOrCreate()

# Pipeline imports
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler, MinMaxScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# read data
data = spark.read.csv("../data/training.csv", header=True, inferSchema=True, nullValue="", sep=";")
data = data.fillna(0, subset=["tempo"])
data = data.dropna()
dev, test = data.randomSplit([0.9, 0.1], seed=12345)

# columns
label = ["music_genre"]
categorical_features = ['key', 'mode']
numerical_features = ['popularity','acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

In [20]:


# preprocessing
indexer = StringIndexer(inputCols=categorical_features + label, outputCols=[col + "_index" for col in categorical_features + label])
# ohe = OneHotEncoder(inputCols=[col + "_index" for col in categorical_features], outputCols=[col + "_ohe" for col in categorical_features])
assembler = VectorAssembler(inputCols=[col + "_index" for col in categorical_features] + numerical_features, outputCol="features")
standard = StandardScaler(inputCol="features", outputCol="scaled_features")
# standard = MinMaxScaler(inputCol="features", outputCol="scaled_features")

# cross validation
estimator = RandomForestClassifier(labelCol="music_genre_index", featuresCol="scaled_features")
estimator_params = (ParamGridBuilder()
    .addGrid(RandomForestClassifier.numTrees, [10, 20, 30, 40, 50])
    .addGrid(RandomForestClassifier.maxDepth, [5, 10, 15, 20])
    .addGrid(RandomForestClassifier.maxBins, [32, 64, 100])
    .addGrid(RandomForestClassifier.featureSubsetStrategy, ['auto', 'sqrt', 'log2'])
    .addGrid(RandomForestClassifier.impurity, ['entropy', 'gini'])
    .addGrid(RandomForestClassifier.minInstancesPerNode, [1, 2, 4])
    .addGrid(RandomForestClassifier.minInfoGain, [0.0, 0.1, 0.2])
    .build())
evaluator = MulticlassClassificationEvaluator(labelCol="music_genre_index", predictionCol="prediction", metricName="f1")
cross = CrossValidator(estimator=estimator, estimatorParamMaps=estimator_params, evaluator=evaluator, numFolds=10)

# build and train the pipeline
pipeline = Pipeline(stages=[indexer, assembler, standard, cross])
model = pipeline.fit(dev)

# print the best model
best_model = model.stages[-1].bestModel
print(best_model)

RandomForestClassificationModel: uid=RandomForestClassifier_4d7378c430ef, numTrees=20, numClasses=10, numFeatures=13


In [22]:

# evaluate the model
predictions = model.transform(test)
f1 = evaluator.evaluate(predictions)
print("F1 score: ", f1)


F1 score:  0.48845460974074956


In [26]:
model.stages[-1].

CrossValidatorModel_2d4c9b7a97e5