In [1]:
import pandas as pd
import numpy as np
import ast
import re
from sklearn.preprocessing import StandardScaler, normalize, MultiLabelBinarizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# ---------------------------
# 0) Asegurar df en el entorno
# ---------------------------
if 'df' not in globals():
    df = pd.read_csv('data.csv')
# Si artists vienen como strings tipo "['A','B']" intenta convertirlos
if 'artists' in df.columns and df['artists'].dtype == object:
    sample = df['artists'].dropna().head(1)
    if not sample.empty and isinstance(sample.iloc[0], str):
        try:
            df['artists'] = df['artists'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
        except Exception:
            # si falla, déjalo como está
            pass

# ---------------------------
# 1) Feature engineering básica
# ---------------------------
# decade, recency
if 'year' in df.columns:
    df['decade'] = (df['year'] // 10) * 10
    current_year = pd.Timestamp.now().year
    df['years_since_release'] = current_year - df['year']
    df['year/recency'] = df['year'] / (current_year - df['year'] + 1)
else:
    df['decade'] = 0
    df['years_since_release'] = 0
    df['year/recency'] = 0

# artists_str (tokens: spaces -> underscore)
def artists_to_str(lst):
    if not isinstance(lst, (list, tuple)):
        return ''
    clean = [re.sub(r'\s+', '_', a.strip().lower()) for a in lst if isinstance(a, str) and a.strip()]
    return ' '.join(clean)

if 'artists' in df.columns:
    df['artists'] = df['artists'].apply(lambda x: x if isinstance(x, (list, tuple)) else [])
    df['artist_primary'] = df['artists'].apply(lambda lst: lst[0] if lst else None)
    df['artists_str'] = df['artists'].apply(artists_to_str)
else:
    df['artists'] = [[] for _ in range(len(df))]
    df['artist_primary'] = None
    df['artists_str'] = ''

# ---------------------------
# 2) Release date cyclical encodings (if existe)
# ---------------------------
if 'release_date' in df.columns:
    df['release_date_parsed'] = pd.to_datetime(df['release_date'], errors='coerce')
    df['release_month'] = df['release_date_parsed'].dt.month.fillna(0).astype(int)
    df['release_dayofyear'] = df['release_date_parsed'].dt.dayofyear.fillna(0).astype(int)
    df['release_month_sin'] = np.sin(2*np.pi*(df['release_month']/12.0))
    df['release_month_cos'] = np.cos(2*np.pi*(df['release_month']/12.0))
    df['release_doy_sin'] = np.sin(2*np.pi*(df['release_dayofyear']/365.0))
    df['release_doy_cos'] = np.cos(2*np.pi*(df['release_dayofyear']/365.0))
else:
    df['release_month_sin'] = 0
    df['release_month_cos'] = 0
    df['release_doy_sin'] = 0
    df['release_doy_cos'] = 0

# ---------------------------
# 3) Key encodings (circle-of-fifths + key+mode 24-bin)
# ---------------------------
cof_order = {0:0, 7:1, 2:2, 9:3, 4:4, 11:5, 6:6, 1:7, 8:8, 3:9, 10:10, 5:11}
df['key'] = pd.to_numeric(df['key'], errors='coerce') if 'key' in df.columns else np.nan
df['key_cof_pos'] = df['key'].map(cof_order)
mask_cof = df['key_cof_pos'].notna()
df.loc[mask_cof, 'key_cof_sin'] = np.sin(2*np.pi*df.loc[mask_cof, 'key_cof_pos']/12)
df.loc[mask_cof, 'key_cof_cos'] = np.cos(2*np.pi*df.loc[mask_cof, 'key_cof_pos']/12)
df['key_cof_sin'] = df['key_cof_sin'].fillna(0)
df['key_cof_cos'] = df['key_cof_cos'].fillna(0)

# key+mode 24-bin
if 'mode' in df.columns:
    df['mode'] = df['mode'].fillna(0).astype(int)
    mask_k = df['key'].notna()
    df.loc[mask_k, 'key_mode_idx'] = df.loc[mask_k, 'key'].astype(int) + 12*df.loc[mask_k, 'mode']
    df.loc[mask_k, 'key24_sin'] = np.sin(2*np.pi*df.loc[mask_k, 'key_mode_idx']/24)
    df.loc[mask_k, 'key24_cos'] = np.cos(2*np.pi*df.loc[mask_k, 'key_mode_idx']/24)
    df['key24_sin'] = df['key24_sin'].fillna(0)
    df['key24_cos'] = df['key24_cos'].fillna(0)
else:
    df['key24_sin'] = 0
    df['key24_cos'] = 0

# ---------------------------
# 4) Numeric features -> PCA
# ---------------------------
numeric_feats = [
    'valence','acousticness','danceability','duration_ms','energy',
    'instrumentalness','liveness','loudness','speechiness','tempo','popularity',
    'decade','years_since_release','year/recency',
    'key_cof_sin','key_cof_cos','key24_sin','key24_cos',
    'release_month_sin','release_month_cos','release_doy_sin','release_doy_cos'
]
# add artist stats if exist
for col in ['artist_pop_mean','artist_count']:
    if col in df.columns and col not in numeric_feats:
        numeric_feats.append(col)
# ensure features exist in df (if not, create zeros)
for f in list(numeric_feats):
    if f not in df.columns:
        df[f] = 0

X_num = df[numeric_feats].fillna(0).values
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)
n_num_components = min(16, X_num_scaled.shape[1])
pca_num = PCA(n_components=n_num_components, random_state=42)
X_num_emb = pca_num.fit_transform(X_num_scaled)
print('X_num_emb shape:', X_num_emb.shape)

# ---------------------------
# 5) Text embedding (name + artists_str) -> TF-IDF + SVD
# ---------------------------
df['text_for_tfidf'] = df.get('name', '').fillna('').astype(str) + ' ' + df['artists_str'].fillna('')
tfidf_text = TfidfVectorizer(max_features=3000, ngram_range=(1,2), token_pattern=r'(?u)\b\w+\b')
X_text_tfidf = tfidf_text.fit_transform(df['text_for_tfidf'].fillna(''))
n_text_comp = min(64, X_text_tfidf.shape[1]-1 if X_text_tfidf.shape[1] > 1 else 1)
svd_text = TruncatedSVD(n_components=n_text_comp, random_state=42)
X_text_emb = svd_text.fit_transform(X_text_tfidf)
print('X_text_emb shape:', X_text_emb.shape)

# ---------------------------
# 6) Artist embedding robust (TF-IDF on artists_str with fallback)
# ---------------------------
artists_str_filled = df['artists_str'].fillna('')
n_non_empty = int(artists_str_filled.astype(str).str.strip().astype(bool).sum())
X_art_emb = None
if n_non_empty > 0:
    try:
        tfidf_art = TfidfVectorizer(max_features=500, token_pattern=r'(?u)\b\w+\b')
        X_art_tfidf = tfidf_art.fit_transform(artists_str_filled)
        if X_art_tfidf.shape[1] > 0:
            n_art = min(16, X_art_tfidf.shape[1]-1 if X_art_tfidf.shape[1] > 1 else 1)
            svd_art = TruncatedSVD(n_components=n_art, random_state=42)
            X_art_emb = svd_art.fit_transform(X_art_tfidf)
            print('X_art_emb (TF-IDF) shape:', X_art_emb.shape)
    except Exception as e:
        X_art_emb = None
if X_art_emb is None:
    # fallback to MultiLabelBinarizer
    try:
        mlb = MultiLabelBinarizer()
        artists_lists = df['artists'].apply(lambda x: x if isinstance(x, (list, tuple)) else [])
        X_mlb = mlb.fit_transform(artists_lists)
        if X_mlb.shape[1] == 0:
            X_art_emb = np.zeros((len(df), 1))
            print('X_art_emb: no artist columns -> zeros', X_art_emb.shape)
        else:
            n_art = min(16, X_mlb.shape[1]-1 if X_mlb.shape[1] > 1 else 1)
            svd_art = TruncatedSVD(n_components=n_art, random_state=42)
            X_art_emb = svd_art.fit_transform(X_mlb)
            print('X_art_emb (MLB+SVD) shape:', X_art_emb.shape)
    except Exception as e:
        X_art_emb = np.zeros((len(df), 1))
        print('X_art_emb fallback zeros shape:', X_art_emb.shape)

# ---------------------------
# 7) Normalize blocks, weight, concat
# ---------------------------
from sklearn.preprocessing import normalize
X_num_n  = normalize(X_num_emb, axis=1)
X_text_n = normalize(X_text_emb, axis=1)
X_art_n  = normalize(X_art_emb, axis=1)

# set weights (tune these)
num_w, text_w, art_w = 1.0, 1.0, 1.5
Xn = X_num_n * num_w
Xt = X_text_n * text_w
Xa = X_art_n * art_w

X_comb = np.hstack([Xn, Xt, Xa])
print('X_comb shape:', X_comb.shape)

# ---------------------------
# 8) PCA final + normalize
# ---------------------------
final_dim = min(64, X_comb.shape[1])
pca_final = PCA(n_components=final_dim, random_state=42)
X_final = pca_final.fit_transform(X_comb)
X_final_norm = normalize(X_final, norm='l2')
print('X_final_norm shape:', X_final_norm.shape)

# ---------------------------
# 9) Train NN and helper recommender
# ---------------------------
nn_model = NearestNeighbors(n_neighbors=11, metric='cosine')
nn_model.fit(X_final_norm)

def recommend_by_track_id_all(track_id, n_recs=10, rerank_by_artist=False, pop_weight=0.0):
    idx_list = df.index[df['id'] == track_id].tolist()
    if not idx_list:
        raise ValueError('Track ID no encontrado')
    idx = idx_list[0]
    vec = X_final_norm[idx].reshape(1,-1)
    distances, indices = nn_model.kneighbors(vec, n_neighbors=n_recs+1)
    indices = indices[0][1:]
    distances = distances[0][1:]
    recs = df.iloc[indices][['id','name','artists','artist_primary','year','popularity']].copy()
    recs['similarity'] = 1 - distances
    # optional rerank by same artist / popularity
    if rerank_by_artist:
        seed_artist = df.loc[idx, 'artist_primary']
        recs['score'] = recs['similarity'] + 0.1 * (recs['artist_primary'] == seed_artist).astype(float)
        if pop_weight:
            pop_min, pop_max = df['popularity'].min(), df['popularity'].max()
            recs['pop_norm'] = (recs['popularity'] - pop_min) / (pop_max - pop_min + 1e-9)
            recs['score'] += pop_weight * recs['pop_norm']
        recs = recs.sort_values('score', ascending=False).reset_index(drop=True)
    else:
        recs = recs.reset_index(drop=True)
    return recs

# ---------------------------
# 10) Small inspect helper
# ---------------------------
def inspect_neighbors(track_id, k=10, show_features=None):
    idx_list = df.index[df['id'] == track_id].tolist()
    if not idx_list:
        raise ValueError('Track ID no encontrado')
    idx = idx_list[0]
    vec = X_final_norm[idx].reshape(1,-1)
    dists, inds = nn_model.kneighbors(vec, n_neighbors=k+1)
    inds = inds[0][1:]; dists = dists[0][1:]
    print('Seed:', df.loc[idx, ['id','name','artists','year','popularity']].to_dict())
    neigh = df.iloc[inds][['id','name','artists','year','popularity']].copy()
    neigh['similarity'] = 1 - dists
    display(neigh.reset_index(drop=True))
    if show_features is None:
        show_features = ['valence','danceability','energy','popularity','year']
    print('\nSeed features:')
    print(df.loc[idx, show_features].to_dict())
    print('\nNeighbors mean:')
    print(df.iloc[inds][show_features].mean().to_dict())

print('Pipeline listo. Ejemplo: recommend_by_track_id_all(df[\"id\"].iloc[0])')

  df.loc[mask_k, 'key_mode_idx'] = df.loc[mask_k, 'key'].astype(int) + 12*df.loc[mask_k, 'mode']


X_num_emb shape: (170653, 16)
X_text_emb shape: (170653, 64)
X_art_emb (TF-IDF) shape: (170653, 16)
X_comb shape: (170653, 96)
X_final_norm shape: (170653, 64)
Pipeline listo. Ejemplo: recommend_by_track_id_all(df["id"].iloc[0])


In [6]:
# prueba con una cancion ingresada (no hace falta ingresar el nombre exacto) (algunas para usar: dakiti, watermelon sugar)
cancion = input("ingresar cancion: ")
some_id = df.loc[df['name'].str.contains(cancion, case=False, na=False), 'id'].iloc[0]
print(df.loc[df['id']==some_id, ['name','artists','key','mode']])
recoms = recommend_by_track_id_all(some_id, n_recs=10, rerank_by_artist=False, pop_weight=0.2)
recoms

         name                   artists  key  mode
19611  Dakiti  [Bad Bunny, Jhay Cortez]    4     0


Unnamed: 0,id,name,artists,artist_primary,year,popularity,similarity
0,2OWVCFTolecLiGZPquvWvT,Estamos Bien,[Bad Bunny],Bad Bunny,2018,74,0.960747
1,6C1RD7YQVvt3YQj0CmuTeu,Diles,"[Bad Bunny, Ozuna, Farruko, Arcangel, Ñengo Fl...",Bad Bunny,2016,74,0.954601
2,4UEuIEv9Wc3wtiWUplGJ7q,Ser Bichote,[Bad Bunny],Bad Bunny,2018,66,0.954481
3,5hcisvFMidkMJlElTO9Qmw,Sensualidad,"[Bad Bunny, Prince Royce, J Balvin, Mambo King...",Bad Bunny,2017,73,0.950014
4,7FfpP3YZ6fOWMdxkIAtud9,Krippy Kush,"[Farruko, Bad Bunny, Rvssian]",Farruko,2017,65,0.947539
5,278kSqsZIiYp8p3QjYAqa8,NI BIEN NI MAL,[Bad Bunny],Bad Bunny,2018,76,0.945579
6,2cpteAYHcd4cjSxAeCkA52,Yo Perreo Sola - Remix,"[Bad Bunny, Nesi, Ivy Queen]",Bad Bunny,2020,76,0.945031
7,2AY1UAimvTqjJC8vDJsOyy,Mayores,"[Becky G, Bad Bunny]",Becky G,2019,74,0.943505
8,5DxXgozhkPLgrbKFY91w0c,Vete,[Bad Bunny],Bad Bunny,2019,80,0.942408
9,2wRkBumdItthjYP9XknImg,Está Rico,"[Marc Anthony, Will Smith, Bad Bunny]",Marc Anthony,2018,68,0.936302
