In [31]:
# read data using pyspark
import pandas as pd
import plotly.express as px
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName("BD").getOrCreate()

In [32]:
# read data
app_df = spark.read.csv("../data/training.csv", header=True, inferSchema=True, nullValue="", sep=";")
app_df.limit(10).toPandas()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,9.0,0.757,0.0816,169627.0,0.0554,0.312,A#,0.108,-19.406,Minor,0.0391,74.907,0.0703,Classical
1,43.0,0.996,0.585,230307.0,0.014,0.929,F,0.126,-29.092,Major,0.102,127.593,0.266,Classical
2,40.0,0.914,0.24,246733.0,0.325,0.000398,G,0.115,-13.26,Major,0.0354,119.141,0.109,Classical
3,52.0,0.7,0.462,225067.0,0.741,0.0,A#,0.34,-8.008,Minor,0.162,,0.589,Hip-Hop
4,16.0,8e-06,0.315,207987.0,0.919,0.772,C,0.342,-2.588,Major,0.0467,175.054,0.574,Anime
5,32.0,0.0411,0.719,190312.0,0.917,0.000645,A,0.226,-3.4,Minor,0.0441,,0.491,Electronic
6,40.0,0.721,0.678,323120.0,0.781,0.477,F,0.106,-8.721,Minor,0.0496,82.762,0.53,Jazz
7,37.0,0.0282,0.515,221378.0,0.875,0.418,A,0.218,-5.708,Minor,0.0364,165.991,0.497,Anime
8,24.0,0.0163,0.678,255480.0,0.859,0.0,G#,0.33,-4.464,Major,0.0415,131.027,0.66,Anime
9,43.0,0.223,0.539,230107.0,0.82,1e-06,A,0.154,-3.913,Major,0.0363,104.033,0.56,Country


In [33]:
from pyspark.sql.functions import count, when, isnan, col

# Analyze data
print("Number of rows: ", app_df.count())
numerical_features = [t[0] for t in app_df.dtypes if t[1] == 'int' or t[1] == 'double']
categorical_features = [t[0] for t in app_df.dtypes if t[1] == 'string']
print("Number of numerical features: ", len(numerical_features))
print("Number of categorical features: ", len(categorical_features))

# print statistical summary of numerical features
descriptors = app_df.select(numerical_features).describe().toPandas()
# add missing values count
missing_values = app_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in numerical_features]).toPandas()
missing_values["summary"] = "missing"
descriptors = pd.concat([descriptors, missing_values], ignore_index=True)
descriptors = descriptors.transpose()
descriptors = descriptors.rename(columns=descriptors.iloc[0]).drop(descriptors.index[0])
descriptors = descriptors.astype(float)

print("Statistical summary of numerical features:")
display(descriptors.round(2))

# print statistical summary of categorical features
print("Statistical summary of categorical features:")
for col in categorical_features:
    tmp_df = app_df.groupBy(col).count().toPandas().sort_values('count', ascending=False)
    tmp_df['percentage'] = tmp_df['count'] / tmp_df['count'].sum() * 100
    # use plotlyexpress to plot the data
    fig = px.bar(tmp_df, x=col, y='count', title=f"{col} ({app_df.select(col).distinct().count()})")
    fig.show()
    tmp_df = tmp_df.transpose().round(2)
    display(tmp_df.rename(columns=tmp_df.iloc[0]).drop(tmp_df.index[0]))

Number of rows:  30003
Number of numerical features:  11
Number of categorical features:  3
Statistical summary of numerical features:


Unnamed: 0,count,mean,stddev,min,max,missing
popularity,30002.0,44.22,15.56,0.0,97.0,1.0
acousticness,30002.0,0.31,0.34,0.0,1.0,1.0
danceability,30002.0,0.56,0.18,0.06,0.98,1.0
duration_ms,30002.0,220503.48,126619.91,-1.0,4276000.0,1.0
energy,30002.0,0.6,0.27,0.0,1.0,1.0
instrumentalness,30002.0,0.18,0.33,0.0,0.99,1.0
liveness,30002.0,0.19,0.16,0.01,0.99,1.0
loudness,30002.0,-9.18,6.23,-47.05,3.74,1.0
speechiness,30002.0,0.09,0.1,0.02,0.94,1.0
tempo,27016.0,120.0,30.68,34.47,220.28,2987.0


Statistical summary of categorical features:


Unnamed: 0,G,C,C#,D,A,F,B,E,G#,A#,F#,D#,None
count,3486.0,3270.0,3208.0,3192.0,2934.0,2599.0,2255.0,2226.0,2020.0,1999.0,1876.0,937.0,1.0
percentage,11.618838,10.89891,10.692264,10.638936,9.779022,8.662467,7.515915,7.419258,6.73266,6.662667,6.252708,3.123021,0.003333


Unnamed: 0,Major,Minor,None
count,19212.0,10790.0,1.0
percentage,64.033597,35.96307,0.003333


Unnamed: 0,Jazz,Hip-Hop,Classical,Country,Blues,Rap,Alternative,Electronic,Rock,Anime,None
count,3048.0,3025.0,3023.0,3021.0,3010.0,3001.0,3000.0,2971.0,2958.0,2945.0,1.0
percentage,10.158984,10.082325,10.075659,10.068993,10.03233,10.002333,9.999,9.902343,9.859014,9.815685,0.003333


In [34]:
### Data cleaning
# rellenamos todos los tempo vacios con 0
app_df = app_df.fillna(0, subset=["tempo"])
# Hay una fila (29617) que todos los valores son nulos, nos la cargamos
app_df = app_df.dropna()
print("Number of rows after dropping rows with null values: ", app_df.count())
# como se puede ver solo hemos perdido una fila

### Split data in dev and test
dev_df, test_df = app_df.randomSplit([0.9, 0.1], seed=1245)
print("Number of rows in dev data: ", dev_df.count())
print("Number of rows in test data: ", test_df.count())
print("Intersecting rows: ", dev_df.intersect(test_df).count())

# Guardamos los datos TODO
# dev_df.write.mode("overwrite").csv("../data/dev.csv", header=True)
# test_df.write.mode("overwrite").csv("../data/test.csv", header=True)


Number of rows after dropping rows with null values:  30002
Number of rows in dev data:  27083
Number of rows in test data:  2919
Intersecting rows:  0


In [35]:
# naive approach to clasification (baseline)
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

# preprocessing
indexer = StringIndexer(inputCols=categorical_features, outputCols=[col + "_index" for col in categorical_features])
ohe = OneHotEncoder(inputCols=[col + "_index" for col in categorical_features], outputCols=[col + "_ohe" for col in categorical_features])
assembler = VectorAssembler(inputCols=[col + "_ohe" for col in categorical_features] + numerical_features, outputCol="features")

In [36]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# cross validation
estimator = RandomForestClassifier(labelCol="music_genre_index", featuresCol="features")
estimator_params = ParamGridBuilder().addGrid(RandomForestClassifier.numTrees, [10, 20, 30]).build()
evaluator = MulticlassClassificationEvaluator(labelCol="music_genre_index", predictionCol="prediction", metricName="f1")
cross = CrossValidator(estimator=estimator, estimatorParamMaps=estimator_params, evaluator=evaluator, numFolds=10)

# build and train the pipeline
pipeline = Pipeline(stages=[indexer, ohe, assembler, cross])
model = pipeline.fit(dev_df)




Exception ignored in: <function JavaModelWrapper.__del__ at 0x000001F728615A80>
Traceback (most recent call last):
  File "c:\Users\edoelas\git\MUIARFID 2023-2024\BD\.venv\Lib\site-packages\pyspark\mllib\common.py", line 152, in __del__
    assert self._sc._gateway is not None
           ^^^^^^^^
AttributeError: 'MulticlassMetrics' object has no attribute '_sc'
Exception ignored in: <function JavaModelWrapper.__del__ at 0x000001F728615A80>
Traceback (most recent call last):
  File "c:\Users\edoelas\git\MUIARFID 2023-2024\BD\.venv\Lib\site-packages\pyspark\mllib\common.py", line 152, in __del__
    assert self._sc._gateway is not None
           ^^^^^^^^
AttributeError: 'MulticlassMetrics' object has no attribute '_sc'
Exception ignored in: <function JavaModelWrapper.__del__ at 0x000001F728615A80>
Traceback (most recent call last):
  File "c:\Users\edoelas\git\MUIARFID 2023-2024\BD\.venv\Lib\site-packages\pyspark\mllib\common.py", line 152, in __del__
    assert self._sc._gateway is not

In [44]:

# evaluate the model
predictions = model.transform(test_df)
f1 = evaluator.evaluate(predictions)
print("F1 score: ", f1)


F1 score:  0.9910770997303311


In [48]:
preds_df = predictions.select("music_genre_index", "prediction").toPandas()
preds_df["correct"] = preds_df.music_genre_index == preds_df.prediction
display(preds_df)
display(preds_df.prediction.value_counts())
print("accuracy: ", preds_df.correct.mean())


Unnamed: 0,music_genre_index,prediction,correct
0,8.0,8.0,True
1,9.0,9.0,True
2,4.0,4.0,True
3,4.0,4.0,True
4,6.0,6.0,True
...,...,...,...
2914,1.0,1.0,True
2915,3.0,3.0,True
2916,1.0,1.0,True
2917,7.0,7.0,True


prediction
5.0    329
6.0    310
0.0    303
8.0    302
4.0    294
1.0    284
7.0    280
9.0    278
2.0    273
3.0    266
Name: count, dtype: int64

accuracy:  0.9910928400137033
