In [15]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

# =======================
# 1. Load dataset
# =======================
file_path = "spotify_songs.csv"
df = pd.read_csv(file_path)

# Audio attributes used as the coordinates of the similarity space.
feature_cols = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

features = df[feature_cols]

# =======================
# 2. Normalize features
# =======================
# Standardize every column so that no single attribute dominates the
# cosine similarity merely because of its scale.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)


# =======================
# 4. Recommendation function (with scores)
# =======================
def recommend_songs_by_id(song_id, n=10, plot=True):
    """
    Recommend songs similar to the one identified by a numeric song_id.

    Similarity is the cosine similarity between the target song's row of
    the module-level ``features_scaled`` matrix and every other row.

    Excludes:
      - Rows with the same song_id as the target
      - Songs with (near-)identical feature vectors (similarity >= 0.999999)
      - Repeated (track_name, track_artist) pairs; only the best-scoring
        row of each pair is kept

    Args:
        song_id: Identifier of the target song; anything accepted by
            ``int()``.
        n: Maximum number of recommendations to return.
        plot: Accepted for backward compatibility; this function does not
            currently use it.

    Returns:
        A DataFrame with columns ``song_id``, ``track_name``,
        ``track_artist``, ``track_album_name``,
        ``track_album_release_date`` and ``similarity_score``, ordered by
        decreasing similarity, or an error message string when the ID is
        malformed or absent from the dataset.
    """
    try:
        song_id = int(song_id)  # ensure numeric
    except (TypeError, ValueError):
        # TypeError covers inputs such as None or a list, which int()
        # rejects with a different exception than malformed strings.
        return f"Invalid Song ID format: {song_id}"

    song_ids = df['song_id'].to_numpy()
    matches = np.flatnonzero(song_ids == song_id)
    if matches.size == 0:
        return f"Song ID {song_id} not found in dataset."

    # Work with row *positions* throughout: features_scaled is a plain
    # array aligned with df's row order, so positions stay correct even
    # if df's index labels are not 0..len-1.
    target_pos = matches[0]
    target_id = song_ids[target_pos]

    # Feature vector of the target song, shaped as a single-row matrix.
    song_vec = features_scaled[target_pos].reshape(1, -1)

    # Similarity of the target against every song, ranked best-first.
    sim_scores = cosine_similarity(song_vec, features_scaled)[0]
    ranked_pos = np.argsort(sim_scores)[::-1]

    # Filter conditions:
    # 1. drop rows sharing the target's song_id
    # 2. drop similarity ~= 1.0 (identical features), with float tolerance
    keep = (song_ids[ranked_pos] != target_id) & (sim_scores[ranked_pos] < 0.999999)
    filtered_pos = ranked_pos[keep]

    # Recommendation table, with the similarity score attached.
    display_cols = ['song_id', 'track_name', 'track_artist', 'track_album_name', 'track_album_release_date']
    recommendations = df[display_cols].iloc[filtered_pos].copy()
    recommendations["similarity_score"] = sim_scores[filtered_pos]

    # De-duplicate (avoids repeated remix/live versions of the same track).
    recommendations = recommendations.drop_duplicates(subset=['track_name', 'track_artist'])
    return recommendations.head(n)

# =======================
# 5. Example usage
# =======================
# Demo: print the ten closest matches for one known song_id.
print("=== Recommend by song_id ===")
example_recommendations = recommend_songs_by_id(1862594, n=10, plot=True)
print(example_recommendations)


=== Recommend by song_id ===
       song_id                     track_name     track_artist  \
1298   3655941                   Dance Monkey      Tones and I   
21622   245480  bum boy (feat. Orla Engstrøm)       una schram   
19365  2995290                      Todos Ven        Alex Rose   
11014  4180923                   Solo Contigo            Lodel   
10671   167364                         Caviar    Lenny Tavárez   
26665  4132289                          Music      Leela James   
10890  5011922              Carburando Ideias      Léo Rocatto   
1554   1883212                Johnny Run Away      Tones and I   
16487    18909                           Niña   Reyna Tropical   
17322  3494831                 Bailar Contigo  Monsieur Periné   

                            track_album_name track_album_release_date  \
1298                            Dance Monkey                10/5/2019   
21622          bum boy (feat. Orla Engstrøm)                1/11/2019   
19365                    