In [23]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load the dataset and derive a numeric code for each track's first-listed artist.
def _parse_artist_list(raw):
    """Turn the stringified list stored in the CSV into a real list ([] when missing)."""
    if pd.notna(raw):
        return ast.literal_eval(raw)
    return []


def _first_artist(names):
    """Return the first entry of an artist list, or None when the list is empty."""
    if names:
        return names[0]
    return None


df = pd.read_csv("data.csv")
df["artists"] = df["artists"].map(_parse_artist_list)
df["artist_primary"] = df["artists"].map(_first_artist)
# Category codes: one integer per distinct artist; missing values become -1.
df["artist_primary"] = df["artist_primary"].astype("category").cat.codes

In [24]:
# Numeric audio / context columns fed into the latent space.
audio_features = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "valence",
    "tempo",
]
context_features = [
    "duration_ms",
    "year",
    "popularity",
    "explicit",
    "key",
    "mode",
]
feature_cols = audio_features + context_features

X = df.loc[:, feature_cols].copy()

# Standardise every feature before the projection.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# PCA projection used as the latent space.
pca = PCA(n_components=10, random_state=42)
X_latent = pca.fit_transform(X_scaled)

# Store each principal component on the dataframe as z1, z2, ...
for component_idx, component_values in enumerate(X_latent.T, start=1):
    df[f"z{component_idx}"] = component_values

In [25]:
from sklearn.neighbors import NearestNeighbors

# Derive the latent column names from the PCA output itself (same source the
# z-columns were written from) instead of repeating the component count, so
# changing n_components cannot leave this cell pointing at missing or stale
# z-columns.
latent_cols = [f"z{i+1}" for i in range(X_latent.shape[1])]

# Cosine nearest-neighbour index over the latent vectors.
nn_model = NearestNeighbors(
    n_neighbors=11,      # 1 is the query track itself + 10 similar ones
    metric="cosine",
    algorithm="auto"
)
nn_model.fit(df[latent_cols].to_numpy())

def search_track(query, topn=5, regex=False):
    """Find tracks whose name contains ``query`` (case-insensitive).

    Parameters
    ----------
    query : str
        Text to look for inside the ``name`` column of ``df``.
    topn : int, default 5
        Maximum number of matching rows to return.
    regex : bool, default False
        When False the query is matched literally, so titles containing
        characters such as ``(``, ``$`` or ``+`` can be searched as typed.
        Pass True to interpret ``query`` as a regular expression.

    Returns
    -------
    pandas.DataFrame
        The first ``topn`` matches with the columns
        ``id``, ``name``, ``artists``, ``year`` and ``popularity``.
    """
    # na=False: rows with a missing name never match.
    mask = df["name"].str.contains(query, case=False, na=False, regex=regex)
    return df.loc[mask, ["id", "name", "artists", "year", "popularity"]].head(topn)

def recommend_by_track_id(track_id, n_recs=10):
    """Return the ``n_recs`` tracks closest to ``track_id`` in the latent space.

    Parameters
    ----------
    track_id : str
        Value of the ``id`` column identifying the seed track.
    n_recs : int, default 10
        Number of recommendations to return (fewer if the dataset is smaller).

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` (columns ``id``, ``name``, ``artists``, ``year``,
        ``popularity``) in the neighbour order returned by ``nn_model``, plus
        a ``similarity`` column equal to ``1 - cosine distance``.

    Raises
    ------
    ValueError
        If ``track_id`` is not present in ``df["id"]``.
    """
    # Work with row *positions*: kneighbors() reports positions in the fitted
    # array, which only coincide with index labels for a default RangeIndex.
    positions = np.flatnonzero((df["id"] == track_id).to_numpy())
    if positions.size == 0:
        raise ValueError("Track ID no encontrado")
    pos = int(positions[0])

    track_vec = df.iloc[[pos]][latent_cols].to_numpy(dtype=float)

    # One extra neighbour to make room for the seed track itself, but never
    # more than the number of rows (the model is fitted on every row of df).
    k = min(n_recs + 1, len(df))
    distances, indices = nn_model.kneighbors(track_vec, n_neighbors=k)

    indices = indices[0]
    distances = distances[0]

    # Drop the seed track by position rather than assuming it is the first hit:
    # tracks with identical latent vectors tie at distance 0 in arbitrary order.
    keep = indices != pos
    indices = indices[keep][:n_recs]
    distances = distances[keep][:n_recs]

    recs = df.iloc[indices][["id", "name", "artists", "year", "popularity"]].copy()
    recs["similarity"] = 1 - distances  # 1 - cosine distance
    return recs

In [26]:
# Demo of the name-search helper: list the tracks matching "Watermelon Sugar".
res = search_track("Watermelon Sugar", topn=10)
print(res.head(10))

# Pick the seed track for the recommendations: "Dakiti".
# Prefer an exact name match; otherwise fall back to a case-insensitive
# substring match. The seed is never taken from `res`, which holds the results
# of the unrelated "Watermelon Sugar" search above.
seed_name = "Dakiti"
exact_ids = df.loc[df["name"] == seed_name, "id"]
if len(exact_ids) > 0:
    some_id = exact_ids.values[0]
else:
    matches = df[df["name"].str.contains(seed_name, case=False, na=False, regex=False)]
    if len(matches) == 0:
        raise ValueError("No se encontró 'Dakiti' en el dataset ni en los resultados de búsqueda.")
    some_id = matches.iloc[0]['id']

print("Usando id:", some_id)
display(df.loc[df['id'] == some_id, ['name', 'artists', 'year', 'popularity']])

# Get the recommendations for the selected track id
recommendations = recommend_by_track_id(some_id, n_recs=15)
recommendations

                           id              name         artists  year  \
19407  6UelLqGlWMcVH1E5c4H7lY  Watermelon Sugar  [Harry Styles]  2019   
19591  1e9oZCCiX42nJl0AcqriVo  Watermelon Sugar  [Harry Styles]  2019   

       popularity  
19407          94  
19591          82  
Usando id: 47EiUVwUp4C9fGccaPuUCS


Unnamed: 0,name,artists,year,popularity
19611,Dakiti,"[Bad Bunny, Jhay Cortez]",2020,100


Unnamed: 0,id,name,artists,year,popularity,similarity
19418,30bqVoKjX479ab90a8Pafp,Star Shopping,[Lil Peep],2019,86,0.983335
19047,5tz69p7tJuGPeMGwNTxYuV,1-800-273-8255,"[Logic, Alessia Cara, Khalid]",2017,80,0.983225
108362,5RLDbZPiqULNl4cr17eoIV,Low,[Lund],2018,63,0.979691
75225,5IUtvfNvOyVYZUa6AJFrnP,Spicy (feat. Post Malone),"[Ty Dolla $ign, Post Malone]",2020,74,0.977374
57029,1s59X35jDULAyOGmBuTAnd,Yo Ya No Vuelvo Contigo - En Vivo,"[Lenin Ramírez, Grupo Firme]",2019,77,0.975949
139939,0T4YziOpifV4Eo9XsMPp2X,Snow White,[Dennis Lloyd],2015,64,0.974704
19223,4jvjzW7Hm0yK4LvvE0Paz9,Falling Down - Bonus Track,"[Lil Peep, XXXTENTACION]",2018,84,0.974136
56991,3mRlFZHUyvJbPTlkzg4LyJ,Roses,[SAINt JHN],2018,71,0.974027
124759,02UJ1sCanP94fS2MdsWafh,PERSIAN RUGS,[PARTYNEXTDOOR],2020,70,0.971573
38461,2tFwfmceQa1Y6nRPhYbEtC,CÓMO SE SIENTE - Remix,"[Jhay Cortez, Bad Bunny]",2020,85,0.970633


In [27]:
# Re-order the recommendations so that the tracks whose popularity is closest
# to the seed track's popularity come first.
seed_popularity = df.loc[df['id'] == some_id, 'popularity'].values[0]
popularity_gap = (recommendations['popularity'] - seed_popularity).abs()
closest_first = popularity_gap.sort_values().index
recommendations = recommendations.reindex(closest_first).reset_index(drop=True)
recommendations


Unnamed: 0,id,name,artists,year,popularity,similarity
0,30bqVoKjX479ab90a8Pafp,Star Shopping,[Lil Peep],2019,86,0.983335
1,2tFwfmceQa1Y6nRPhYbEtC,CÓMO SE SIENTE - Remix,"[Jhay Cortez, Bad Bunny]",2020,85,0.970633
2,4jvjzW7Hm0yK4LvvE0Paz9,Falling Down - Bonus Track,"[Lil Peep, XXXTENTACION]",2018,84,0.974136
3,5tz69p7tJuGPeMGwNTxYuV,1-800-273-8255,"[Logic, Alessia Cara, Khalid]",2017,80,0.983225
4,1s59X35jDULAyOGmBuTAnd,Yo Ya No Vuelvo Contigo - En Vivo,"[Lenin Ramírez, Grupo Firme]",2019,77,0.975949
5,7DcvwMAiqKJQD1rrdfxSDx,The London (feat. J. Cole & Travis Scott),"[Young Thug, J. Cole, Travis Scott]",2019,76,0.966464
6,5IUtvfNvOyVYZUa6AJFrnP,Spicy (feat. Post Malone),"[Ty Dolla $ign, Post Malone]",2020,74,0.977374
7,3mRlFZHUyvJbPTlkzg4LyJ,Roses,[SAINt JHN],2018,71,0.974027
8,02UJ1sCanP94fS2MdsWafh,PERSIAN RUGS,[PARTYNEXTDOOR],2020,70,0.971573
9,0T4YziOpifV4Eo9XsMPp2X,Snow White,[Dennis Lloyd],2015,64,0.974704
