In [None]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Reference year for the recency features. It is taken from the wall clock,
# so the derived values depend on when the notebook is executed.
current_year = pd.Timestamp.now().year

# Load the raw data and derive the engineered columns in a single chain.
df = pd.read_csv("data.csv").assign(
    # Release year floored to its decade (e.g. 1987 -> 1980).
    decade=lambda d: (d['year'] // 10) * 10,
    years_since_release=lambda d: current_year - d['year'],
    # Ratio of release year to (age + 1); the +1 avoids dividing by zero
    # for tracks released in the current year.
    **{"year/recency": lambda d: d["year"] / (current_year - d["year"] + 1)},
    # Cyclical encoding of `key` with period 12 (sine / cosine pair).
    key_sin=lambda d: np.sin(2 * np.pi * d['key'] / 12),
    key_cos=lambda d: np.cos(2 * np.pi * d['key'] / 12),
)

In [266]:
def _parse_artists(raw):
    """Turn one raw `artists` cell into a Python list.

    Values that are already lists are returned unchanged, which makes this
    cell safe to re-run after the column has been parsed once. Missing
    values become an empty list. Anything else is parsed with
    ``ast.literal_eval`` (which raises on malformed input).
    """
    if isinstance(raw, list):
        return raw
    if pd.isna(raw):
        return []
    return ast.literal_eval(raw)


df['artists'] = df['artists'].apply(_parse_artists)

# The first listed artist is treated as the primary one (None when the list is empty).
primary_artist = df['artists'].apply(lambda lst: lst[0] if lst else None)

# Encode the primary artist as integer category codes; missing artists get -1.
df["artist_primary"] = primary_artist.astype('category').cat.codes

In [267]:
# Numeric audio / context columns used to build the latent space
feature_cols = [
    'valence', 'year', 'acousticness', 'danceability',
    'duration_ms', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'popularity', 'speechiness', 'tempo',
    'decade', 'years_since_release', 'year/recency', 'artist_primary',
]

X = df.loc[:, feature_cols].copy()

# Standardise the features before PCA
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Project onto 10 principal components: this is the latent space
pca = PCA(n_components=10, random_state=42)
X_latent = pca.fit_transform(X_scaled)

# Store each component on the dataframe as z1, z2, ...
for component_number, component_values in enumerate(X_latent.T, start=1):
    df[f"z{component_number}"] = component_values

In [269]:
from sklearn.neighbors import NearestNeighbors

# Names of the latent (PCA component) columns stored on df. The count is taken
# from the projected matrix rather than hard-coded, so it cannot drift from
# the number of components the PCA actually produced.
latent_cols = [f"z{i+1}" for i in range(X_latent.shape[1])]

# Cosine-distance nearest-neighbour index over the latent space.
nn_model = NearestNeighbors(
    n_neighbors=11,      # default query size: the track itself + 10 similar ones
    metric="cosine",
    algorithm="auto"
)
nn_model.fit(df[latent_cols].values)

# Helper to look up tracks by (approximate) name
def search_track(query, topn=5, regex=False, frame=None):
    """Return the first `topn` tracks whose name contains `query`.

    The match is case-insensitive, and rows with a missing name never match.

    Parameters
    ----------
    query : str
        Text to look for inside the track name.
    topn : int, default 5
        Maximum number of rows to return.
    regex : bool, default False
        When False, `query` is matched literally, so titles containing
        characters such as ``(``, ``.`` or ``$`` can be searched safely.
        Pass True to treat `query` as a regular expression.
    frame : pandas.DataFrame, optional
        Table to search; defaults to the notebook-level `df`.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``name``, ``artists``, ``year``, ``popularity``.
    """
    tracks = df if frame is None else frame
    mask = tracks["name"].str.contains(query, case=False, na=False, regex=regex)
    return tracks.loc[mask, ["id", "name", "artists", "year", "popularity"]].head(topn)

def recommend_by_track_id(track_id, n_recs=10):
    """Recommend the tracks closest to `track_id` in the latent space.

    Parameters
    ----------
    track_id : str
        Value of the ``id`` column identifying the query track. If several
        rows share the id, the first one is used.
    n_recs : int, default 10
        Number of recommendations to return (fewer if the fitted model holds
        fewer other tracks).

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``name``, ``artists``, ``year``, ``popularity`` plus
        ``similarity`` (1 - cosine distance), nearest first.

    Raises
    ------
    ValueError
        If `track_id` is not present in ``df["id"]``.
    """
    # Work with row *positions*: kneighbors returns positions into the matrix
    # the model was fitted on, which are not necessarily index labels.
    positions = (df["id"] == track_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise ValueError("Track ID no encontrado")
    pos = int(positions[0])

    track_vec = df.iloc[[pos]][latent_cols].to_numpy()

    # Ask for one extra neighbour to leave room for the query track itself,
    # but never more than the number of fitted samples.
    k = min(n_recs + 1, nn_model.n_samples_fit_)
    distances, indices = nn_model.kneighbors(track_vec, n_neighbors=k)
    indices = indices[0]
    distances = distances[0]

    # Remove the query track by position rather than assuming it is the first
    # hit: exact duplicates tie at distance 0 and may be returned ahead of it.
    keep = indices != pos
    indices = indices[keep][:n_recs]
    distances = distances[keep][:n_recs]

    recs = df.iloc[indices][["id", "name", "artists", "year", "popularity"]].copy()
    recs["similarity"] = 1 - distances  # 1 - cosine distance
    return recs

In [270]:
# Search for "Dakiti" and show the candidate matches
res = search_track("Dakiti", topn=10)
print(res.head(10))

# Pick the track id: prefer an exact title match, otherwise fall back to the
# first search hit. If neither exists there is nothing to recommend from.
exact_ids = df.loc[df["name"] == "Dakiti", "id"]
if not exact_ids.empty:
    some_id = exact_ids.iloc[0]
elif len(res) > 0:
    some_id = res.iloc[0]['id']
else:
    raise ValueError("No se encontró 'Dakiti' en el dataset ni en los resultados de búsqueda.")

print("Usando id:", some_id)
display(df.loc[df['id'] == some_id, ['name', 'artists', 'year', 'popularity']])

# Get recommendations for the selected track id
recommendations = recommend_by_track_id(some_id, n_recs=15)
recommendations

                           id    name                   artists  year  \
19611  47EiUVwUp4C9fGccaPuUCS  Dakiti  [Bad Bunny, Jhay Cortez]  2020   

       popularity  
19611         100  
Usando id: 47EiUVwUp4C9fGccaPuUCS


Unnamed: 0,name,artists,year,popularity
19611,Dakiti,"[Bad Bunny, Jhay Cortez]",2020,100


Unnamed: 0,id,name,artists,year,popularity,similarity
19727,41LhQUkElADQ5YUbpYv2D0,"...And To Those I Love, Thanks For Sticking Ar...",[$uicideBoy$],2020,82,0.994296
38034,23c9gmiiv7RCu7twft0Mym,Who Hurt You?,[Daniel Caesar],2018,72,0.992405
107932,6GBMbvX7sqyOxT5wWK4hgN,Dangerously,[Charlie Puth],2016,55,0.990559
170410,6z8sQFj47s9ZG0Ls1k9Gct,Dime tú,[Danny Ocean],2019,73,0.988791
19738,3TNSVsiFngfe68UJpMq1oS,Starting Over,[Chris Stapleton],2020,78,0.988634
91785,4c2xt1trwYZpMqPWY35Xi9,Jaded,[Drake],2018,65,0.987503
19678,35RJhm1pEovTBwnNR0zWad,GREECE (feat. Drake),"[DJ Khaled, Drake]",2020,87,0.987281
19495,2nC3QhMI9reBIOWutbU3Tj,Moral of the Story,[Ashe],2019,81,0.986989
74851,5jYe1mOKg5zUx0enf1DOdz,Cold,"[Boy In Space, unheard]",2019,70,0.986966
38170,1VJwtWR6z7SpZRwipI12be,Candy,[Doja Cat],2018,74,0.986697


In [271]:
# Try the recommender on a low-popularity track: the first row with popularity < 20
some_id = df.loc[df['popularity'].lt(20), 'id'].iloc[0]

# Get recommendations for that track id
recommendations = recommend_by_track_id(some_id, n_recs=15)

# Show the name of the query track, then the recommendations table
print(df.loc[df['id'] == some_id, "name"].iloc[0])
recommendations


Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve


Unnamed: 0,id,name,artists,year,popularity,similarity
39,1SCWBjhk5WmXPxhDduD3HM,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...","[Sergei Rachmaninoff, James Levine, Berliner P...",1921,1,0.999974
39041,2SPOwMPEXB4Jm1MKzDH8Wc,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...","[Sergei Rachmaninoff, Vladimir Horowitz, New Y...",1928,0,0.993514
109097,7mqytyJkLxRlywbwSHFvic,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...","[Sergei Rachmaninoff, Vladimir Horowitz, New Y...",1928,0,0.993514
40265,1YnmAenKanDp1Cg9lARum7,"Beethoven: Symphony No. 9 in D Minor, Op. 125 ...","[Ludwig van Beethoven, Karl Böhm, Staatskapell...",1935,0,0.972486
42474,1Tm6GpZZa24ToTP0AUPb1u,"Violin Concerto, Op. 24: Allegro non troppo ma...","[Miklós Rózsa, Jascha Heifetz, Walter Hendl]",1946,0,0.972382
1633,34y6Gie19NTeapGMoqCHhl,"Années de pèlerinage, Première année (Suisse),...","[Franz Liszt, Vladimir Horowitz]",1930,14,0.970938
60776,2ARk50zQKS2RUtqN2wnrUk,"Violin Concerto in D Major, Op. 61: III. Rondo...","[Ludwig van Beethoven, Joseph Szigeti, Bruno W...",1948,0,0.970647
38905,7cXjrcVIhug5vZxQ9IAhG8,"Symphony No. 5 in C Minor, Op. 67: 2. Andante ...","[Ludwig van Beethoven, Berliner Philharmoniker...",1927,0,0.968622
5611,12pe2mL5SQGMI1gBGcEWKE,Night on Bald Mountain,"[Modest Mussorgsky, Leonard Bernstein, New Yor...",1950,32,0.967393
109085,7heYzjLe261HXzNvACeyE6,"Fantaisie in F Minor, Op. 49","[Frédéric Chopin, Vladimir Horowitz]",1928,1,0.967264


In [272]:
# Re-rank the recommendations so that tracks whose popularity is closest to
# the query track's popularity come first.
target_popularity = df.loc[df['id'] == some_id, 'popularity'].iloc[0]
recommendations = (
    recommendations
    .assign(_popularity_gap=lambda r: (r['popularity'] - target_popularity).abs())
    # mergesort is stable: tracks with the same gap keep their incoming
    # (similarity) order, which the default sort does not guarantee.
    .sort_values('_popularity_gap', kind='mergesort')
    .drop(columns='_popularity_gap')
    .reset_index(drop=True)
)
recommendations

Unnamed: 0,id,name,artists,year,popularity,similarity
0,1SCWBjhk5WmXPxhDduD3HM,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...","[Sergei Rachmaninoff, James Levine, Berliner P...",1921,1,0.999974
1,7heYzjLe261HXzNvACeyE6,"Fantaisie in F Minor, Op. 49","[Frédéric Chopin, Vladimir Horowitz]",1928,1,0.967264
2,2SPOwMPEXB4Jm1MKzDH8Wc,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...","[Sergei Rachmaninoff, Vladimir Horowitz, New Y...",1928,0,0.993514
3,7mqytyJkLxRlywbwSHFvic,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...","[Sergei Rachmaninoff, Vladimir Horowitz, New Y...",1928,0,0.993514
4,1YnmAenKanDp1Cg9lARum7,"Beethoven: Symphony No. 9 in D Minor, Op. 125 ...","[Ludwig van Beethoven, Karl Böhm, Staatskapell...",1935,0,0.972486
5,1Tm6GpZZa24ToTP0AUPb1u,"Violin Concerto, Op. 24: Allegro non troppo ma...","[Miklós Rózsa, Jascha Heifetz, Walter Hendl]",1946,0,0.972382
6,2ARk50zQKS2RUtqN2wnrUk,"Violin Concerto in D Major, Op. 61: III. Rondo...","[Ludwig van Beethoven, Joseph Szigeti, Bruno W...",1948,0,0.970647
7,7cXjrcVIhug5vZxQ9IAhG8,"Symphony No. 5 in C Minor, Op. 67: 2. Andante ...","[Ludwig van Beethoven, Berliner Philharmoniker...",1927,0,0.968622
8,4Vua5pUPtnF1FptSPwOjMY,"Symphony No. 3 in E-Flat Major, Op. 55 ""Eroica...","[Ludwig van Beethoven, Arturo Toscanini]",1939,0,0.966653
9,3Ra8OWEDaAZwnQgQ1LLAb6,"Siegfried Idyll, WWV 103","[Richard Wagner, Arturo Toscanini]",1936,0,0.96658
