**Instalar librerías**

In [1]:
# Install the third-party packages used by this notebook into the kernel's
# environment ("tables" is the PyPI name of PyTables).
# NOTE(review): hdf5_getters, imported in the next code cell, is not part of
# this install line — presumably it is a local module next to the notebook; confirm.
%pip install pandas numpy scikit-learn tables tqdm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


**Crear el CSV a partir del dataset**

In [2]:
import os
import glob
import pandas as pd
import numpy as np
import tables
import hdf5_getters as GETTERS
from tqdm import tqdm

# Root folder of the Million Song Dataset subset
DATASET_PATH = "MillionSongSubset/"

# Walk the dataset folder recursively and materialize every .h5 path
files = list(
    glob.iglob(os.path.join(DATASET_PATH, '**/*.h5'), recursive=True)
)
print(f"Archivos encontrados: {len(files)}")

# One feature row (a flat list) is appended per successfully parsed song.
data = []

for f in tqdm(files, desc="Procesando canciones"):
    try:
        # Open through a context manager so the HDF5 handle is released even
        # when one of the getters below raises; a trailing h5.close() would be
        # skipped on any error and leak the open file.
        with tables.open_file(f, 'r') as h5:
            # Basic metadata (string fields are decoded from bytes)
            track_id = GETTERS.get_track_id(h5).decode("utf-8")
            title = GETTERS.get_title(h5).decode("utf-8")
            artist = GETTERS.get_artist_name(h5).decode("utf-8")
            release = GETTERS.get_release(h5).decode("utf-8")
            year = GETTERS.get_year(h5)

            # Acoustic features
            tempo = GETTERS.get_tempo(h5)
            key = GETTERS.get_key(h5)
            mode = GETTERS.get_mode(h5)
            ts = GETTERS.get_time_signature(h5)
            loudness = GETTERS.get_loudness(h5)
            duration = GETTERS.get_duration(h5)
            danceability = GETTERS.get_danceability(h5)
            energy = GETTERS.get_energy(h5)

            # Popularity
            artist_hotttnesss = GETTERS.get_artist_hotttnesss(h5)
            song_hotttnesss = GETTERS.get_song_hotttnesss(h5)

            # Per-song mean of segments_timbre and segments_pitches
            # (12 dimensions each); fall back to zeros when a song has no segments.
            segments_timbre = GETTERS.get_segments_timbre(h5)
            segments_pitches = GETTERS.get_segments_pitches(h5)

            timbre_avg = np.mean(segments_timbre, axis=0) if segments_timbre.size else np.zeros(12)
            pitches_avg = np.mean(segments_pitches, axis=0) if segments_pitches.size else np.zeros(12)

            # Flatten everything into a single row
            row = [
                track_id, title, artist, release, year,
                tempo, key, mode, ts, loudness,
                duration, danceability, energy,
                artist_hotttnesss, song_hotttnesss
            ] + timbre_avg.tolist() + pitches_avg.tolist()

            data.append(row)

    except Exception as e:
        # Best-effort extraction: report the failing file and keep going.
        print(f"Error procesando {f}: {e}")

# Column layout: 15 scalar fields followed by the 12 timbre and 12 pitch columns
timbre_cols = [f"timbre_{i}" for i in range(12)]
pitches_cols = [f"pitch_{i}" for i in range(12)]
columns = [
    "track_id",
    "title",
    "artist_name",
    "release",
    "year",
    "tempo",
    "key",
    "mode",
    "time_signature",
    "loudness",
    "duration",
    "danceability",
    "energy",
    "artist_hotttnesss",
    "song_hotttnesss",
    *timbre_cols,
    *pitches_cols,
]

# Assemble the collected rows into a DataFrame
df = pd.DataFrame(data=data, columns=columns)

# Persist the table as CSV, without the index column
output_file = "msd_subset_features_full.csv"
df.to_csv(path_or_buf=output_file, index=False)

print(f"\n✅ CSV guardado como {output_file} con {len(df)} canciones.")


Archivos encontrados: 10000


Procesando canciones: 100%|██████████| 10000/10000 [04:02<00:00, 41.28it/s]



✅ CSV guardado como msd_subset_features_full.csv con 10000 canciones.


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


**Carga del dataset**

In [2]:
# Read the feature table from the CSV file, print its (rows, columns) shape
# and display the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="msd_subset_features_full.csv")
print(df.shape)
df.head(n=5)

(10000, 39)


Unnamed: 0,track_id,title,artist_name,release,year,tempo,key,mode,time_signature,loudness,...,pitch_2,pitch_3,pitch_4,pitch_5,pitch_6,pitch_7,pitch_8,pitch_9,pitch_10,pitch_11
0,TRAAAAW128F429D538,I Didn't Mean To,Casual,Fear Itself,0,92.198,1,0,4,-11.197,...,0.357945,0.284739,0.367941,0.306885,0.35035,0.298666,0.344612,0.343937,0.30668,0.324174
1,TRAAABD128F429CF47,Soul Deep,The Box Tops,Dimensions,1969,121.274,6,0,4,-9.843,...,0.309996,0.173218,0.443,0.206716,0.297736,0.156291,0.277931,0.564291,0.218373,0.291395
2,TRAAADZ128F9348C2E,Amor De Cabaret,Sonora Santanera,Las Numero 1 De La Sonora Santanera,0,100.07,8,1,1,-9.689,...,0.256429,0.427212,0.262032,0.240454,0.169489,0.297359,0.273988,0.430112,0.453607,0.197391
3,TRAAAEF128F4273421,Something Girls,Adam Ant,Friend Or Foe,1982,119.293,0,1,4,-9.013,...,0.396124,0.29246,0.399649,0.419144,0.323315,0.422413,0.282447,0.3973,0.253177,0.413878
4,TRAAAFD128F92F423A,Face the Ashes,Gob,Muertos Vivos,2007,129.738,2,1,4,-4.501,...,0.368544,0.260059,0.396923,0.423939,0.382548,0.481116,0.454322,0.6269,0.3383,0.278859


**Limpieza de Datos**

In [3]:
# Report the number of missing values per column before imputing.
print(df.isna().sum())

# Impute by column type: non-numeric (text) columns get an empty string and
# numeric columns get 0. A blanket fillna(0) would write the integer 0 into
# text columns such as "title", leaving them with mixed str/int values.
# NOTE(review): zero-filling song_hotttnesss treats "unknown" as "least
# popular"; confirm this is the intended imputation for the similarity model.
df = df.fillna(
    {col: "" for col in df.select_dtypes(exclude="number").columns}
).fillna(0)


track_id                0
title                   1
artist_name             0
release                 0
year                    0
tempo                   0
key                     0
mode                    0
time_signature          0
loudness                0
duration                0
danceability            0
energy                  0
artist_hotttnesss       0
song_hotttnesss      4352
timbre_0                0
timbre_1                0
timbre_2                0
timbre_3                0
timbre_4                0
timbre_5                0
timbre_6                0
timbre_7                0
timbre_8                0
timbre_9                0
timbre_10               0
timbre_11               0
pitch_0                 0
pitch_1                 0
pitch_2                 0
pitch_3                 0
pitch_4                 0
pitch_5                 0
pitch_6                 0
pitch_7                 0
pitch_8                 0
pitch_9                 0
pitch_10                0
pitch_11    

**Variables numéricas**

Aquí se podrían añadir más variables numéricas.

In [4]:
# Numeric columns fed to the recommender: scalar audio/popularity fields
# followed by the 12 timbre and 12 pitch columns.
features = [
    "tempo",
    "key",
    "mode",
    "time_signature",
    "loudness",
    "duration",
    "danceability",
    "energy",
    "artist_hotttnesss",
    "song_hotttnesss",
    *(f"timbre_{i}" for i in range(12)),
    *(f"pitch_{i}" for i in range(12)),
]
X = df.loc[:, features]
X.head()


Unnamed: 0,tempo,key,mode,time_signature,loudness,duration,danceability,energy,artist_hotttnesss,song_hotttnesss,...,pitch_2,pitch_3,pitch_4,pitch_5,pitch_6,pitch_7,pitch_8,pitch_9,pitch_10,pitch_11
0,92.198,1,0,4,-11.197,218.93179,0.0,0.0,0.401998,0.60212,...,0.357945,0.284739,0.367941,0.306885,0.35035,0.298666,0.344612,0.343937,0.30668,0.324174
1,121.274,6,0,4,-9.843,148.03546,0.0,0.0,0.4175,0.0,...,0.309996,0.173218,0.443,0.206716,0.297736,0.156291,0.277931,0.564291,0.218373,0.291395
2,100.07,8,1,1,-9.689,177.47546,0.0,0.0,0.343428,0.0,...,0.256429,0.427212,0.262032,0.240454,0.169489,0.297359,0.273988,0.430112,0.453607,0.197391
3,119.293,0,1,4,-9.013,233.40363,0.0,0.0,0.454231,0.0,...,0.396124,0.29246,0.399649,0.419144,0.323315,0.422413,0.282447,0.3973,0.253177,0.413878
4,129.738,2,1,4,-4.501,209.60608,0.0,0.0,0.401724,0.604501,...,0.368544,0.260059,0.396923,0.423939,0.382548,0.481116,0.454322,0.6269,0.3383,0.278859


**Normalización de las variables**

In [5]:
# Fit the standardizer on the feature matrix, then apply it to the same data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

**Modelo de recomendación**

In [8]:
# Project the standardized features onto 30 principal components
pca = PCA(n_components=30)
X_pca = pca.fit_transform(X_scaled)

# Pairwise cosine similarity between all songs, on the standardized features
sim_matrix = cosine_similarity(X_scaled)

# Last element of the running sum = total variance kept by the 30 components
print(
    "Varianza explicada acumulada:",
    pca.explained_variance_ratio_.cumsum()[-1],
)

def recomendar_por_titulo(title, n=5):
    """Recommend the songs most similar to the first song titled ``title``.

    Similarity scores are read from the module-level ``sim_matrix`` (cosine
    similarity over the standardized features), whose rows are in the same
    order as the rows of the module-level ``df``.

    Args:
        title: Exact title to look up in ``df["title"]``. When several songs
            share the title, the first one is used as the query.
        n: Maximum number of recommendations. Negative values are treated as 0.

    Returns:
        A DataFrame with the ``track_id``, ``title`` and ``artist_name`` of the
        most similar songs, best match first, or an error-message string when
        the title is not present in the dataset.
    """
    # Work with row positions rather than index labels so the lookup stays
    # valid for both sim_matrix and df.iloc whatever df's index looks like.
    title_mask = (df["title"] == title).to_numpy()
    if not title_mask.any():
        return f"❌ La canción '{title}' no se encuentra en el dataset"

    # Position of the first matching row
    pos = int(title_mask.argmax())

    # Stable descending ranking: ties keep their original row order
    ranking = (-sim_matrix[pos]).argsort(kind="stable")

    # Exclude the query song explicitly. Skipping "the first result" is not
    # enough: a song with an identical feature vector at a lower row ties at
    # the top and would push the query itself into the recommendations.
    top = ranking[ranking != pos][:max(n, 0)]

    return df.iloc[top][["track_id", "title", "artist_name"]]

def recomendar_por_titulo_PCA(title, n=5):
    """Recommend similar songs using cosine similarity in PCA space.

    The query song is compared against every row of the module-level
    ``X_pca`` (the PCA projection of the standardized features), whose rows
    are in the same order as the rows of the module-level ``df``.

    Args:
        title: Exact title to look up in ``df["title"]``. When several songs
            share the title, the first one is used as the query.
        n: Maximum number of recommendations. Negative values are treated as 0.

    Returns:
        A DataFrame with the ``track_id``, ``title`` and ``artist_name`` of the
        most similar songs, best match first, or an error-message string when
        the title is not present in the dataset.
    """
    # Positional lookup keeps the row number valid for X_pca and df.iloc.
    title_mask = (df["title"] == title).to_numpy()
    if not title_mask.any():
        return f"❌ La canción '{title}' no está en el dataset"

    pos = int(title_mask.argmax())

    # Similarity of the query row against all rows, computed on the PCA
    # components (reading sim_matrix here would ignore the projection and
    # make this function identical to the non-PCA variant).
    sims = cosine_similarity(X_pca[pos:pos + 1], X_pca)[0]

    # Stable descending ranking: ties keep their original row order
    ranking = (-sims).argsort(kind="stable")

    # Drop the query song by position instead of assuming it ranks first
    top = ranking[ranking != pos][:max(n, 0)]

    return df.iloc[top][["track_id", "title", "artist_name"]]

Varianza explicada acumulada: 0.9963233908170839


**Test**

In [None]:
# Query song used for the smoke test of the PCA recommender
cancion_base = "So Lonely"
print("Canción base:", cancion_base)
# Header and result go on separate lines, exactly as two consecutive prints would
print(
    "\nRecomendaciones:",
    recomendar_por_titulo_PCA(cancion_base, n=5),
    sep="\n",
)



Canción base: So Lonely

Recomendaciones:
                track_id                            title  \
269   TRAAUZX128F92F3D53  The Last Word (Amended Version)   
552   TRABUVN128F930633C                       Es dificil   
2728  TRAJDGB128F42882AC                  Call Of Da Wild   
3400  TRALOCJ128F427D106                        Judgement   
1689  TRAFTDN128F427C29D                          Hold On   

                          artist_name  
269    Rah Digga (Featuring Outsidaz)  
552                      De La Ghetto  
2728                          OutKast  
3400  The Bug Featuring Ricky Ranking  
1689                         Hot Chip  
