**Importar librerías**

In [1]:
%pip install pandas numpy scikit-learn tables tqdm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


**Crear CSV con database**

In [2]:
import os
import glob
import pandas as pd
import numpy as np
import tables
import hdf5_getters as GETTERS
from tqdm import tqdm

# Ruta al subset del Million Song Dataset
DATASET_PATH = "MillionSongSubset/"

# Buscar todos los archivos .h5
files = glob.glob(os.path.join(DATASET_PATH, '**/*.h5'), recursive=True)
print(f"Archivos encontrados: {len(files)}")

data = []

for f in tqdm(files, desc="Procesando canciones"):
    try:
        h5 = tables.open_file(f, 'r')

        # Metadata básica
        track_id = GETTERS.get_track_id(h5).decode("utf-8")
        title = GETTERS.get_title(h5).decode("utf-8")
        artist = GETTERS.get_artist_name(h5).decode("utf-8")
        release = GETTERS.get_release(h5).decode("utf-8")
        year = GETTERS.get_year(h5)

        # Features acústicas
        tempo = GETTERS.get_tempo(h5)
        key = GETTERS.get_key(h5)
        mode = GETTERS.get_mode(h5)
        ts = GETTERS.get_time_signature(h5)
        loudness = GETTERS.get_loudness(h5)
        duration = GETTERS.get_duration(h5)
        danceability = GETTERS.get_danceability(h5)
        energy = GETTERS.get_energy(h5)

        # Popularidad
        artist_hotttnesss = GETTERS.get_artist_hotttnesss(h5)
        song_hotttnesss = GETTERS.get_song_hotttnesss(h5)

        # Promedio segments_timbre y segments_pitches (12 dimensiones cada uno)
        segments_timbre = GETTERS.get_segments_timbre(h5)
        segments_pitches = GETTERS.get_segments_pitches(h5)

        timbre_avg = np.mean(segments_timbre, axis=0) if segments_timbre.size else np.zeros(12)
        pitches_avg = np.mean(segments_pitches, axis=0) if segments_pitches.size else np.zeros(12)

        # Guardar todo en una fila
        row = [
            track_id, title, artist, release, year,
            tempo, key, mode, ts, loudness,
            duration, danceability, energy,
            artist_hotttnesss, song_hotttnesss
        ] + timbre_avg.tolist() + pitches_avg.tolist()

        data.append(row)
        h5.close()

    except Exception as e:
        print(f"Error procesando {f}: {e}")

# Columnas
timbre_cols = [f"timbre_{i}" for i in range(12)]
pitches_cols = [f"pitch_{i}" for i in range(12)]
columns = [
    "track_id", "title", "artist_name", "release", "year",
    "tempo", "key", "mode", "time_signature", "loudness",
    "duration", "danceability", "energy",
    "artist_hotttnesss", "song_hotttnesss"
] + timbre_cols + pitches_cols

# Crear DataFrame
df = pd.DataFrame(data, columns=columns)

# Guardar CSV
output_file = "msd_subset_features_full.csv"
df.to_csv(output_file, index=False)

print(f"\n✅ CSV guardado como {output_file} con {len(df)} canciones.")


Archivos encontrados: 10000


Procesando canciones: 100%|██████████| 10000/10000 [04:02<00:00, 41.28it/s]



✅ CSV guardado como msd_subset_features_full.csv con 10000 canciones.


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors


**Carga del dataset**

In [2]:
#df = pd.read_csv("msd_subset_features_full.csv")
df = pd.read_csv("train.csv")
print(df.shape)
df.head()

(114000, 21)


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


**Limpieza de Datos**

In [3]:
print(df.isna().sum())
df.fillna(0, inplace=True)


Unnamed: 0          0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


**Variables numéricas**

Aquí podria afegir-li més variables numèriques

In [4]:
#features = [
    #"tempo", "key", "mode", "time_signature", "loudness",
    #"duration", "danceability", "energy",
    #"artist_hotttnesss", "song_hotttnesss"
#] + [f"timbre_{i}" for i in range(12)] + [f"pitch_{i}" for i in range(12)]
features = [
    "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "duration_ms", "time_signature", "popularity"
]
X = df[features]
X.head()


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,popularity
0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,230666,4,73
1,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,149610,4,55
2,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,210826,4,57
3,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,201933,3,71
4,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,198853,4,82


**Normalización de las variables**

In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

**Modelo de recomendación**

In [11]:
sim_matrix = cosine_similarity(X_scaled)
pca = PCA(n_components=13)
X_pca = pca.fit_transform(X_scaled)
print("Varianza explicada acumulada:", pca.explained_variance_ratio_.cumsum()[-1])

def recomendar_por_titulo(title, n=5):
    # Verificar si el título existe
    if title not in df["track_name"].values:
        return f"❌ La canción '{title}' no se encuentra en el dataset"
    
    # Obtener índice de la primera coincidencia
    idx = df.index[df["track_name"] == title][0]
    
    # Calcular similitudes
    sim_scores = list(enumerate(sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Seleccionar top-n (excluyendo la propia canción)
    top = sim_scores[1:n+1]
    
    return df.iloc[[i for i, _ in top]][["track_id", "track_name", "artists"]]

def recomendar_por_titulo_PCA(title, n=5):
    if title not in df["track_name"].values:
        return f"❌ La canción '{title}' no está en el dataset"
    
    idx = df.index[df["track_name"] == title][0]
    sim_scores = list(enumerate(sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top = sim_scores[1:n+1]  # saltar la propia canción
    
    return df.iloc[[i for i, _ in top]][["track_id", "track_name", "artists"]]

MemoryError: Unable to allocate 96.8 GiB for an array with shape (114000, 114000) and data type float64

In [6]:
pca = PCA(n_components=13, random_state=42)
X_reduced = pca.fit_transform(X_scaled)
knn = NearestNeighbors(n_neighbors=11, metric="cosine", algorithm="brute")
knn.fit(X_reduced)

def recomendar_canciones(song_title, n_recommendations=10):
    if song_title not in df['track_name'].values:
        print(f"'{song_title}' no está en el dataset.")
        return [] 
    idx = df.index[df['track_name'] == song_title][0]
    distances, indices = knn.kneighbors([X_reduced[idx]], n_neighbors=n_recommendations+1)    
    recommended_indices = indices.flatten()[1:]
    recommended_songs = df.iloc[recommended_indices][["track_name", "artists", "track_genre"]]
    return recommended_songs

**Test**

In [None]:
cancion_base = "So Lonely"
print("Canción base:", cancion_base)
print("\nRecomendaciones:")
print(recomendar_por_titulo_PCA(cancion_base, n=5))

Canción base: So Lonely

Recomendaciones:


NameError: name 'sim_matrix' is not defined

In [11]:
recs = recomendar_canciones("Danza Kuduro", 10)
print(recs)

              track_name                                            artists  \
55634      Tu Hi Toh Hai                                 Pritam;Benny Dayal   
80570            Dholida  Udit Narayan;Neha Kakkar;Palak Muchhal;Raja Hasan   
21561         We Outside                                            Olamide   
18016      I'm On A Boat                           The Lonely Island;T-Pain   
113254            Let Go                              Hillsong Young & Free   
85180     Vahos del Ayer                                              Flema   
86275     Vahos del Ayer                                              Flema   
71541     Vahos del Ayer                                              Flema   
113736  My Reason - Live                                      Planetshakers   
23512              Alane                                   Robin Schulz;Wes   

        track_genre  
55634        indian  
80570      pop-film  
21561     dancehall  
18016        comedy  
113254  world-music 