In [None]:
# ==========================================
# Hybrid Recommendation System
# Combines Collaborative Filtering + Content-Based Filtering
# ==========================================
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# --- Step 1: Load Datasets ---
# Input CSV paths, resolved relative to the current working directory.
cf_file, cbf_file = "piki_filtered.csv", "spotify_songs.csv"

# Interaction log; the following steps read its user_id, song_id and liked columns.
df_cf = pd.read_csv(filepath_or_buffer=cf_file)
# Per-song audio features and track metadata, keyed by song_id.
df_cbf = pd.read_csv(filepath_or_buffer=cbf_file)

#print("Datasets Loaded Successfully")
#print("CF Dataset Preview:\n", df_cf.head())
#print("CBF Dataset Preview:\n", df_cbf.head())

# --- Step 2: Build Collaborative Filtering Model ---
# Encode raw ids as dense 0-based positions. user_idx / song_idx hold the raw
# id belonging to each position, so matrix row r is user_idx[r] and matrix
# column c is song_idx[c].
user_codes, user_idx = pd.factorize(df_cf["user_id"])
song_codes, song_idx = pd.factorize(df_cf["song_id"])

# Sparse users x songs matrix filled with the "liked" value of every interaction.
user_item_matrix = csr_matrix(
    (df_cf["liked"].to_numpy(), (user_codes, song_codes)),
    shape=(len(user_idx), len(song_idx)),
)

# Song-to-song cosine similarity computed over the user dimension; the result
# is kept sparse instead of being densified.
item_similarity_cf = cosine_similarity(
    user_item_matrix.transpose(),
    dense_output=False,
)

# --- Step 3: Build Content-Based Filtering Model ---
# Builds item_similarity_cbf, a dense songs x songs cosine-similarity matrix
# over standardized audio features. Row/column i describes song_idx[i], i.e.
# the matrix shares its positions with user_item_matrix / item_similarity_cf.
feature_cols = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo'
]

# Compare ids as strings so an int/str dtype difference between the two CSVs
# cannot hide genuine matches.
df_cbf['song_id'] = df_cbf['song_id'].astype(str)
song_idx_str = song_idx.astype(str)

matching_songs = df_cbf[df_cbf['song_id'].isin(song_idx_str)]

# Keep only songs that also occur in the interaction data. When nothing
# matches (usually an id-format problem), df_cbf is left untouched; the
# similarity matrix below then degrades to all zeros. .copy() detaches the
# filtered frame from the original so later column writes are safe.
if len(matching_songs) > 0:
    df_cbf = matching_songs.copy()

# The recommender indexes item_similarity_cbf with positions taken from
# song_idx, so row i of `features` has to describe song_idx[i]. Reindexing on
# the factorized ids enforces that order and size; duplicate feature rows for
# one song are collapsed to the first, and songs without a feature row become
# all-NaN rows.
features = (
    df_cbf.drop_duplicates(subset='song_id')
    .set_index('song_id')[feature_cols]
    .reindex(song_idx_str)
)
has_features = features.notna().all(axis=1).to_numpy()

# Songs lacking complete features keep an all-zero vector: cosine similarity
# against a zero vector is 0, so they contribute a neutral content score
# instead of NaN.
features_scaled = np.zeros((len(song_idx), len(feature_cols)))
if has_features.any():
    scaler = StandardScaler()
    features_scaled[has_features] = scaler.fit_transform(features[has_features])
    item_similarity_cbf = cosine_similarity(features_scaled)
else:
    # No usable feature data at all: neutral matrix of the right shape.
    item_similarity_cbf = np.zeros((len(song_idx), len(song_idx)))

# --- Step 4: Hybrid Recommendation Function ---
def hybrid_recommend_for_user(user_id, n=10, alpha=0.5):
    """Recommend up to *n* not-yet-liked songs by blending CF and CBF scores.

    Reads the module-level ``user_idx``, ``song_idx``, ``user_item_matrix``,
    ``item_similarity_cf``, ``item_similarity_cbf`` and ``df_cbf``; none of
    them is modified.

    Args:
        user_id: Raw user id as it appears in the interaction data.
        n: Maximum number of recommendations. Fewer rows are returned when
            the user has fewer than ``n`` songs left to discover.
        alpha: Weight of the collaborative score (0.0 = pure CBF,
            1.0 = pure CF).

    Returns:
        A DataFrame ordered by descending hybrid score with the columns
        song_id, track_name, track_artist, track_album_name and
        track_album_release_date (metadata is NaN for songs missing from
        ``df_cbf``). For an unknown user, or a user without liked songs, a
        human-readable message string is returned instead.

    Raises:
        ValueError: If the content-based matrix is not sized like the
            collaborative one, in which case positions cannot be shared.
    """
    if user_id not in user_idx:
        return f"User {user_id} not found in dataset."

    # Both similarity matrices are indexed with the same song positions, so
    # they must be the same size; fail loudly instead of mixing up songs.
    if item_similarity_cbf.shape[0] != len(song_idx):
        raise ValueError(
            "item_similarity_cbf has "
            f"{item_similarity_cbf.shape[0]} rows but there are "
            f"{len(song_idx)} songs; the CBF matrix must be aligned with song_idx."
        )

    # CF part: similarity to everything the user liked, summed per song.
    user_internal_id = np.where(user_idx == user_id)[0][0]
    user_vector = user_item_matrix[user_internal_id].toarray().ravel()
    liked_songs = np.flatnonzero(user_vector > 0)

    if liked_songs.size == 0:
        return f"User {user_id} has not liked any songs yet."

    # np.asarray(...).ravel() copes with np.matrix as well as ndarray results.
    sim_scores_cf = np.asarray(
        item_similarity_cf[liked_songs].sum(axis=0), dtype=float
    ).ravel()

    # CBF part: average content similarity to the liked songs.
    sim_scores_cbf = np.asarray(
        item_similarity_cbf[liked_songs].mean(axis=0), dtype=float
    ).ravel()

    # NOTE: the CF term is a sum (it grows with the number of liked songs)
    # while the CBF term is a mean, so alpha is a relative weight rather than
    # an exact proportion.
    hybrid_scores = alpha * sim_scores_cf + (1 - alpha) * sim_scores_cbf

    # Already-liked songs must never be recommended: push them to the very
    # end and never take more rows than there are unseen songs.
    hybrid_scores[liked_songs] = -np.inf
    n_candidates = hybrid_scores.size - liked_songs.size
    top_k = max(0, min(int(n), n_candidates))
    top_indices = np.argsort(-hybrid_scores, kind="stable")[:top_k]

    recs = pd.DataFrame({"song_id": song_idx[top_indices]})
    recs["song_id"] = recs["song_id"].astype(str)

    # Build the metadata lookup on a derived frame: ids are normalised to
    # strings without writing to the shared df_cbf, and one row per song is
    # kept so the left merge cannot multiply recommendations.
    detail_cols = [
        "track_name", "track_artist", "track_album_name", "track_album_release_date"
    ]
    song_details = (
        df_cbf[["song_id", *detail_cols]]
        .assign(song_id=lambda frame: frame["song_id"].astype(str))
        .drop_duplicates(subset="song_id")
    )
    recs = recs.merge(song_details, on="song_id", how="left")

    return recs[["song_id", *detail_cols]].reset_index(drop=True)

# --- Step 5: Example Usage ---
# Demo run: ten recommendations for one user with CF weighted at 0.6.
print("\nTop 10 hybrid recommendations for user_id = 3721095:")
demo_recommendations = hybrid_recommend_for_user(3721095, n=10, alpha=0.6)
print(demo_recommendations)



Top 10 hybrid recommendations for user_id = 3721095:
   song_id                                   track_name         track_artist  \
0  4547091                                    Step Back        Black Paisley   
1  5087344                                         Cool               Alesso   
2  4838779       Runnin' with the Devil - 2015 Remaster            Van Halen   
3  2986034   All You Need To Know (feat. Calle Lehmann)              Gryffin   
4  5287275                                  Summer Song      The Cranberries   
5   409026                        Lightning In A Bottle       The Summer Set   
6  1136794  Only Love - Luca Schreiner Island House Mix               Shaggy   
7   111214           Don't You Worry Child - Radio Edit  Swedish House Mafia   
8  5337851                                  I'm So Hood            DJ Khaled   
9  5063762                           Andas En Mi Cabeza        Chino & Nacho   

                              track_album_name track_album_releas

: 