In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, InputLayer, LSTM
from tensorflow.keras.optimizers import  Adam, SGD
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import export_graphviz
from sklearn.utils import resample
#Modelos
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import pickle

from sklearn.tree import DecisionTreeClassifier

In [45]:
# Load the training data from 'train.csv' (path is relative to the working
# directory) and show the resulting frame as the cell output.
df = pd.read_csv('train.csv')
df 

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.017100,,0.0849,0.8990,134.071,234596.0,4,5
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.230,1,0.0406,0.001100,0.004010,0.1010,0.5690,116.454,251733.0,4,10
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486000,0.000196,0.3940,0.7870,147.681,109667.0,4,6
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.021200,,0.1220,0.5690,107.033,173968.0,4,5
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.2160,0.000169,0.016100,0.1720,0.0918,199.060,229960.0,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991,Green-House,Find Home,35.0,0.166,0.109,7.0,-17.100,0,0.0413,0.993000,0.824000,0.0984,0.1770,171.587,193450.0,3,6
17992,Micatone,All Gone,27.0,0.638,0.223,11.0,-10.174,0,0.0329,0.858000,0.000016,0.0705,0.3350,73.016,257067.0,4,2
17993,Smash Hit Combo,Peine perdue,34.0,0.558,0.981,4.0,-4.683,0,0.0712,0.000030,0.000136,0.6660,0.2620,105.000,216222.0,4,8
17994,Beherit,Salomon's Gate,29.0,0.215,0.805,6.0,-12.757,0,0.1340,0.001290,0.916000,0.2560,0.3550,131.363,219693.0,4,8


In [46]:
# Drop the 'Track Name', 'Artist Name' and 'mode' columns; every other
# column is kept in the rebound `df`.
df = df.drop(columns=['Track Name', 'Artist Name', 'mode'])

In [47]:
# Count how many rows fall under each 'Class' value (genre frequency).
# Only the counts are computed here; no mean is calculated in this cell.
frecuencia_genero = df['Class'].value_counts()

# Print the counts
print(frecuencia_genero)

Class
10    4949
6     2587
9     2524
8     1854
5     1447
1     1373
2     1272
0      625
7      576
3      402
4      387
Name: count, dtype: int64


In [48]:
# Per-class summary statistics, all derived from one grouping by 'Class'.
por_clase = df.groupby('Class')

# Mean of 'Popularity' and of 'instrumentalness' within each class
mean_popularity = por_clase['Popularity'].mean()
mean_instrumentalness = por_clase['instrumentalness'].mean()


def _valor_mas_frecuente(serie):
    """Return the first label of ``serie.value_counts()``.

    ``value_counts`` orders labels by descending count, so the first label
    is the most frequent value of the series.
    """
    return serie.value_counts().index[0]


# Mode of 'key' within each class
moda_key = por_clase['key'].agg(_valor_mas_frecuente)

In [49]:
# Print both per-class mean Series, one after the other
print(mean_popularity, mean_instrumentalness, sep='\n')

Class
0     38.105263
1     45.883185
2     32.911695
3     26.242268
4     57.294278
5     48.638596
6     41.474843
7     41.506399
8     42.226674
9     50.355953
10    46.967914
Name: Popularity, dtype: float64
Class
0     0.099464
1     0.199239
2     0.097938
3     0.131430
4     0.012675
5     0.068779
6     0.262643
7     0.740386
8     0.235927
9     0.049807
10    0.112833
Name: instrumentalness, dtype: float64


In [50]:
# Print the per-class mode of 'key' held in `moda_key`.
print(moda_key)

Class
0     2.0
1     7.0
2     7.0
3     7.0
4     2.0
5     1.0
6     9.0
7     2.0
8     1.0
9     1.0
10    2.0
Name: key, dtype: float64


In [51]:
# Fill missing 'Popularity' and 'instrumentalness' values with the mean of
# the same column among rows sharing the row's 'Class'.
# NOTE(review): the fill values are derived from the target column 'Class',
# so this step cannot be repeated on data without labels; confirm that is intended.
for columna in ('Popularity', 'instrumentalness'):
    media_de_clase = df.groupby('Class')[columna].transform('mean')
    df[columna] = df[columna].fillna(media_de_clase)

# Fill missing 'key' values with the most frequent 'key' of the row's class
# (first label of value_counts(), which is ordered by descending count).
moda_de_clase = df.groupby('Class')['key'].transform(lambda x: x.value_counts().index[0])
df['key'] = df['key'].fillna(moda_de_clase)

In [52]:
# Count the remaining missing values in each column and display the result
df_nulls = df.isna().sum()
df_nulls

Popularity            0
danceability          0
energy                0
key                   0
loudness              0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
Class                 0
dtype: int64

In [53]:
# Average number of rows per class: count the rows of each 'Class' value,
# then take the mean of those counts.
conteo_por_clase = df['Class'].value_counts()
media = conteo_por_clase.mean()
media

1636.0

In [54]:
# Balance the dataset so that every class ends up with exactly int(media) rows.
n_muestras = int(media)

# Split the dataset by class (classes in order of first appearance in 'Class')
clases = df['Class'].unique()
data_por_clase = {clase: df[df['Class'] == clase] for clase in clases}

# Resample each class to n_muestras rows. The pieces are collected in a list
# and concatenated once: growing a DataFrame with pd.concat inside the loop
# re-copies all accumulated rows on every iteration, and seeding it with an
# empty frame triggers pandas' FutureWarning about empty entries in concat.
muestras = []
for clase, datos_clase in data_por_clase.items():
    # Classes with at least n_muestras rows are under-sampled without
    # replacement; only smaller classes need replacement (over-sampling).
    # Using >= (instead of > media) avoids drawing duplicates from a class
    # that already has exactly the target size.
    con_reemplazo = len(datos_clase) < n_muestras
    datos_muestreados = resample(
        datos_clase,
        replace=con_reemplazo,
        n_samples=n_muestras,
        random_state=0,  # same seed for every class, for reproducibility
    )
    muestras.append(datos_muestreados)

# Single concatenation; the rows keep their original index labels here
data_balanced = pd.concat(muestras)

# Continue with the balanced data under a fresh 0..n-1 index
df = data_balanced.reset_index(drop=True)

# Check the new class balance
nuevo_balance_sklearn = df['Class'].value_counts()
nuevo_balance_sklearn, media

  data_balanced = pd.concat([data_balanced, datos_muestreados])


(Class
 5     1636
 10    1636
 6     1636
 2     1636
 4     1636
 8     1636
 9     1636
 3     1636
 7     1636
 1     1636
 0     1636
 Name: count, dtype: int64,
 1636.0)

In [55]:
# Standardize the numeric feature columns with StandardScaler.
scaler = StandardScaler()

# NOTE(review): `le` is not used in this cell; presumably a later cell
# references it, so it is still created here. Confirm.
le = LabelEncoder()

# Every column except 'key', 'time_signature' and 'Class' is scaled
valores_numericos = df.columns.drop(['key', 'time_signature', 'Class'])

# Fit the scaler on those columns and write the scaled values back in place
numericas = df[valores_numericos]
df[valores_numericos] = scaler.fit_transform(numericas)

df.head()

Unnamed: 0,Popularity,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,0.639962,0.151541,0.318869,5.0,0.488781,1.652413,-0.080222,-0.379873,0.3796,0.668159,1.557587,0.597042,4,5
1,1.888554,1.963485,0.206903,1.0,0.901362,2.462383,-0.169595,-0.379873,0.588559,0.744617,-1.06648,1.230884,4,5
2,1.377766,1.667534,-0.627045,5.0,0.118579,3.612776,-0.5606,-0.379873,2.501838,1.517247,0.739115,0.437035,4,5
3,0.015665,1.456141,-0.893446,1.0,0.234102,0.255507,-0.883458,-0.584536,-0.671071,-0.289581,-1.518758,0.480812,4,5
4,-1.743715,-0.536997,1.226174,4.0,0.926626,2.849761,-1.00925,-0.610016,-0.390936,0.724496,1.246525,0.478904,4,5


In [56]:
# One-hot encode the categorical columns 'key' and 'time_signature'.
# NOTE(review): consider moving this import to the notebook's top import cell.
from sklearn.preprocessing import OneHotEncoder

# Dense output so the result can be wrapped directly in a DataFrame
encoder = OneHotEncoder(sparse_output=False)

encoded_columns = encoder.fit_transform(df[['key', 'time_signature']])

# Reuse df's index for the encoded frame: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would silently misalign rows (and
# introduce NaNs) whenever df's index is not a clean RangeIndex.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(['key', 'time_signature']),
    index=df.index,
)

# Replace the two original columns with their one-hot expansion
df = df.drop(['key', 'time_signature'], axis=1)
df = pd.concat([df, encoded_df], axis=1)
encoded_df

Unnamed: 0,key_1.0,key_2.0,key_3.0,key_4.0,key_5.0,key_6.0,key_7.0,key_8.0,key_9.0,key_10.0,key_11.0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
17993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
17994,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [57]:
# Save the processed dataset as 'generos.csv', without writing the index.
# NOTE(review): the original comment said the file goes into a "data" folder,
# but the path has no directory component, so it is written relative to the
# working directory. Confirm the intended location.
df.to_csv('generos.csv', index=False)