In [2]:
########################################
# Part 1: Data Loading and Preprocessing
########################################
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

In [3]:
# Load data from new_features.csv
df = pd.read_csv("../data/new_features.csv")
print("Data loaded. Example rows:")
print(df.head())

Data loaded. Example rows:
                       id  danceability    energy       key  loudness  \
0  7lmeHLHBe4nmXzuXc0HDjk     -0.239643  1.545439  0.504262  0.855004   
1  1wsRitfRRtWyEapl0q22o8      0.448997  1.470214  1.632828  0.794662   
2  1hR0fIFK2qRG3f3RF70pb7     -1.067078  1.516782  0.504262  0.850871   
3  2lbASgTSoDO7MTuLAXlTW0     -0.399791  1.506035  1.632828  0.783751   
4  1MQTmpYOZ6fcMQc56Hdo7T     -0.474528  1.369913 -0.906446  0.635127   

   speechiness  acousticness  instrumentalness  liveness   valence  ...  \
0    -0.084450     -1.018675         -0.663029  0.731281  0.184434  ...   
1     0.998881     -1.054604         -0.662862 -0.286777  0.132294  ...   
2     3.770628     -1.026024         -0.663054 -0.453920 -0.310892  ...   
3     1.459273     -0.646043         -0.663049 -0.458985  0.448856  ...   
4    -0.108879     -1.085308         -0.369701 -0.672220  0.318507  ...   

   loc_pca_0  loc_pca_1  loc_pca_2  loc_pca_3  loc_pca_4  loc_pca_5  \
0  24.203403

In [4]:
# Extract the playlist location features (PCA-reduced), e.g., loc_pca_0 ... loc_pca_9
loc_pca_cols = [col for col in df.columns if col.startswith("loc_pca_")]
interaction_matrix = df[loc_pca_cols].values
print("\nInteraction matrix shape:", interaction_matrix.shape)
print(interaction_matrix[:5,:])


Interaction matrix shape: (134712, 10)
[[ 2.42034032e+01 -1.58920329e-01  2.47125516e-01  7.89664658e-01
  -6.00232886e-02 -6.40467692e-01  1.33038490e+00 -8.57547533e-01
  -4.23787206e-01  3.97033482e-02]
 [ 2.84938068e+01  2.28536035e-01  6.46449192e-01  1.70631088e+00
  -4.73284325e-03 -1.07777097e+00  7.02043500e-01 -1.45128731e+00
   9.63210248e-01  3.37378803e-01]
 [ 4.03621801e+00 -2.82504531e-01  2.66416716e-01  2.24057832e-01
  -2.29110575e-02 -1.09713456e+00 -7.52202908e-02 -1.47339233e-01
   3.20746836e-01 -1.47517915e-01]
 [ 1.41121268e-01 -1.80056725e-01  3.99448520e-02 -1.17471261e-01
   1.91789130e-01 -1.58619716e-02  1.20229957e-02 -2.56249095e-02
   2.26302518e-02 -7.68359042e-02]
 [ 1.18970673e+01 -6.46397813e-01  5.00255177e-01 -2.52341680e-01
   1.11723382e-01 -5.87307520e-01  7.65196968e-01 -7.37421723e-01
  -1.86457089e-01 -6.29838242e-02]]


In [5]:
########################################
# Part 2: Collaborative Filtering (CF) via SVD
########################################
# Apply Truncated SVD to the interaction matrix to obtain CF latent factors
n_components = min(50, interaction_matrix.shape[1])
svd = TruncatedSVD(n_components=n_components, random_state=42)
cf_latent = svd.fit_transform(interaction_matrix)
print("CF latent factors shape:", cf_latent.shape)

# Map each track ID to its CF latent vector
all_track_ids = df["id"].values
track_latent = {track_id: cf_latent[i] for i, track_id in enumerate(all_track_ids)}
print(cf_latent[5:])

CF latent factors shape: (134712, 10)
[[ 1.34396653 -0.23610125  0.56466556 ... -0.25026262  0.02991909
   0.00542487]
 [-0.30140504 -0.21641625  0.11910229 ...  0.12143172  0.0776299
   0.05102219]
 [ 0.10702813 -0.3446732   0.09637378 ...  0.09145588  0.01603204
  -0.04303788]
 ...
 [-0.98728123  0.00980382 -0.00592042 ... -0.01617723  0.00858322
  -0.01902092]
 [-0.98710829  0.01361991 -0.00282743 ...  0.03376629 -0.00749121
   0.01406205]
 [-0.74422675 -0.00149335  0.08163496 ...  0.05391664  0.07274814
  -0.10867274]]


In [6]:
# Recommendation function using dot product similarity (Pure CF)
def recommend_tracks_dot_with_scores(target_track, track_latent, top_n=10):
    if target_track not in track_latent:
        print(f"Track {target_track} not found.")
        return []
    target_vector = track_latent[target_track]
    all_track_ids_list = list(track_latent.keys())
    all_vectors = np.array(list(track_latent.values()))
    similarities = np.dot(all_vectors, target_vector)
    sorted_indices = np.argsort(similarities)[::-1]
    recommendations = [(all_track_ids_list[i], similarities[i])
                       for i in sorted_indices if all_track_ids_list[i] != target_track]
    return recommendations[:top_n]

In [7]:
########################################
# Part 3: Content-Based Feature Extraction
########################################
# Define audio feature columns; ensure popularity is present
audio_cols = ["danceability", "energy", "valence", "tempo", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]
#if "popularity" not in df.columns:
#    if "occurrence_count" in df.columns:
#        df["popularity"] = df["occurrence_count"]
#    else:
#        df["popularity"] = 1 # np.random.randint(100, 1000, size=len(df)) # Why? -David
#content_cols = audio_cols + ["popularity"]

# Standardize the content features
scaler = StandardScaler()
# content_scaled = scaler.fit_transform(df[content_cols].values)
content_scaled = scaler.fit_transform(df[audio_cols].values)
print("Content features shape:", content_scaled.shape)

content_scaled_dict = {df.iloc[i]["id"]: content_scaled[i] for i in range(len(df))}

Content features shape: (134712, 11)


In [8]:
########################################
# Part 4: Hybrid Vector Construction
########################################
# Hybrid vector = [CF latent vector, Content features]
hybrid_vectors = np.concatenate([cf_latent, content_scaled], axis=1) # cf_latent.shape = (134712, 10)
print("Hybrid vector shape:", hybrid_vectors.shape)
# Build a discionary based on hybrid_vectors by id
hybrid_dict = {df.iloc[i]["id"]: hybrid_vectors[i] for i in range(len(df))}

Hybrid vector shape: (134712, 21)


In [9]:
########################################
# Part 5: Recommendation Function for Hybrid Model
########################################
def recommend_hybrid(target_track, hybrid_dict, top_n=10):
    if target_track not in hybrid_dict:
        print(f"Track {target_track} not found.")
        return []
    target_vector = hybrid_dict[target_track]
    all_ids = list(hybrid_dict.keys())
    all_vectors = np.array(list(hybrid_dict.values()))
    similarities = np.dot(all_vectors, target_vector) # Interesting. This works? - David
    sorted_indices = np.argsort(similarities)[::-1]
    recommendations = [(all_ids[i], similarities[i])
                       for i in sorted_indices if all_ids[i] != target_track]
    return recommendations[:top_n]

In [10]:
########################################
# Part 6: Recommendation Function for Nearest Neighbors
########################################
# Can't use regression model because ids are categorical identifiers
knn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
all_track_ids_list = list(track_latent.keys())
all_vectors = np.array(list(track_latent.values()))
# knn.fit(all_vectors, all_track_ids_list)
knn.fit(all_vectors)

def recommend_nearest(target_track, track_latent = track_latent, top_n=10):
    if target_track not in track_latent:
        print(f"Track {target_track} not found.")
        return []
    target_vector = track_latent[target_track].reshape(1, -1)
    distances, indices = knn.kneighbors(target_vector, n_neighbors=top_n)
    recommendations = [(all_track_ids_list[idx], distances[0][j]) for j, idx in enumerate(indices[0])]
    return recommendations[:top_n]

In [11]:
########################################
# Part 7: Recommendation Function for GMM and Nearest Neighbors
########################################
gmm = GaussianMixture(n_components=10, covariance_type='full', random_state=42)
all_track_ids_list = list(content_scaled_dict.keys())
all_vectors = np.array(list(content_scaled_dict.values()))
gmm.fit(all_vectors)
GMM_labels = gmm.predict(all_vectors)
GMM_probs = gmm.predict_proba(all_vectors)

In [24]:
def recommend_GMM_nearest(target_track_id, content_scaled_dict = content_scaled_dict, top_n=10):
    if target_track_id not in all_track_ids_list:
        print(f"Track {target_track_id} not found.")
        return []
    target_vector = content_scaled_dict[target_track_id].reshape(1, -1)
    predicted_label = gmm.predict(target_vector)

    cluster_indices = np.where(GMM_labels == predicted_label)[0]
    cluster_data = all_vectors[cluster_indices]

    print("predicted cluster label:", predicted_label)
    print("cluster_indices:", cluster_indices)
    print("cluster_indices label:", GMM_labels[cluster_indices])

    # using nearest neighbor search
    nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
    nbrs.fit(cluster_data)
    distances, indices = nbrs.kneighbors(target_vector, n_neighbors=top_n)

    recommendations = [(all_track_ids_list[idx], distances[0][j]) for j, idx in enumerate(indices[0])]
    return recommendations[:top_n]

In [13]:
########################################
# Part 8: Evaluation via Masking and Hit Rate Metrics
########################################
# Define ground truth: for each track, the nearest neighbor in the original loc_pca space
def ground_truth_top(target_index):
    target_vec = interaction_matrix[target_index].reshape(1, -1)
    sims = np.dot(interaction_matrix, target_vec.T).flatten() # Using dot product to find the closest vector
    sims[target_index] = -np.inf  # exclude the track itself
    best_idx = np.argmax(sims) # argmax = Returns the indices of the maximum values along an axis
    return all_track_ids[best_idx]

In [14]:
# Evaluate Pure CF model hit rate on a sample of tracks
sample_size = 1000
num_tracks = len(all_track_ids)
sample_indices = np.random.choice(num_tracks, sample_size, replace=False)

In [15]:
hits_cf = 0
for i in sample_indices:
    gt = ground_truth_top(i)
    target_id = all_track_ids[i]
    recs_cf = [rec for rec, score in recommend_tracks_dot_with_scores(target_id, track_latent, top_n=10)]
    # If ground truth top is in the top 10 recommended tracks, recognize it as a hit
    if gt in recs_cf:
        hits_cf += 1
hit_rate_cf = hits_cf / sample_size
print(f"Pure CF Hit Rate (Top-10): {hit_rate_cf:.4f}")

Pure CF Hit Rate (Top-10): 0.1480


In [16]:
# Evaluate Hybrid model (without weighting, i.e., pure concatenation)
hybrid_hit_rate = 0
for i in sample_indices:
    gt = ground_truth_top(i)
    target_id = all_track_ids[i]
    recs_hybrid = [rec for rec, score in recommend_hybrid(target_id, hybrid_dict, top_n=10)]
    # If ground truth top is in the top 10 recommended tracks, recognize it as a hit
    if gt in recs_hybrid:
        hybrid_hit_rate += 1
hybrid_hit_rate = hybrid_hit_rate / sample_size
print(f"Hybrid (α = 1.0, pure CF) Hit Rate (Top-10): {hybrid_hit_rate:.4f}")

Hybrid (α = 1.0, pure CF) Hit Rate (Top-10): 0.0660


In [17]:
# Evaluate Nearest Neighboring
hits_KNN = 0
for i in sample_indices:
    gt = ground_truth_top(i)
    target_id = all_track_ids[i]
    recs_KNN = [rec for rec, score in recommend_nearest(target_id, track_latent, top_n=10)]
    # If ground truth top is in the top 10 recommended tracks, recognize it as a hit
    if gt in recs_KNN:
        hits_KNN += 1
hit_rate_NNR = hits_KNN / sample_size
print(f"Nearest Neighboring Hit Rate (Top-10): {hit_rate_NNR:.4f}")

Nearest Neighboring Hit Rate (Top-10): 0.1800


In [25]:
# Evaluate GMM Nearest Neighboring
hits_GMM_NN = 0
for i in sample_indices:
    gt = ground_truth_top(i)
    target_id = all_track_ids[i]
    recs_GMM_NN = [rec for rec, score in recommend_GMM_nearest(target_id, content_scaled_dict, top_n=10)]
    # If ground truth top is in the top 10 recommended tracks, recognize it as a hit

    print("ground truth:", gt)
    print("recommendations:", recs_GMM_NN)
    
    if gt in recs_GMM_NN:
        hits_GMM_NN += 1
hit_rate_GMM_NN = hits_GMM_NN / sample_size
print(f"Gausian Mixture Nearest Neighboring Hit Rate (Top-10): {hit_rate_GMM_NN:.4f}")

predicted cluster label: [7]
cluster_indices: [    21     23     31 ... 134695 134696 134709]
cluster_indices label: [7 7 7 ... 7 7 7]
ground truth: 7BKLCZ1jbUBVqRi2FVlTVw
recommendations: ['5vzUijqlFQhopgfnzrelO2', '07X6ZeIpuHLNPbWs51ZCIs', '621F5yjzzYRNpjTwumQN5H', '7ux93P8s6JL1ValxJrY3iZ', '33WYMryvB4X20YortgFCcC', '0tmwj9sSn9IupNiaouQLPG', '3E5tseQMhbFBoVhCl1ZTav', '6wyZPyRiLJ0pey3HWxNcbx', '4El3abvnY5KUTMi9RMlVFe', '3ltJhnJsMkSqKpsed3ijYQ']
predicted cluster label: [9]
cluster_indices: [     8     16     18 ... 134646 134652 134699]
cluster_indices label: [9 9 9 ... 9 9 9]
ground truth: 7hGRwsILg5SRzQRv8GQMhM
recommendations: ['1LWeysY6JTGjGOAnPOb9TS', '1x63U5U00cfzifoQAilHxC', '5HBekg4MU9VeeT0g9PueK9', '2P24PuzOoNVvst1v8jl05S', '2ronNzFbdPNVygv4cIr3vO', '6R20qWyhC2pgvWfYw1gbDo', '4fVDjTjiehtOdTokOvIRt5', '3II0wx5m5bqVlARovLxJDW', '0DEdJCVjbY8FlvxPRv4z7l', '4WLcZboHbHdgARAIzOojRx']
predicted cluster label: [7]
cluster_indices: [    21     23     31 ... 134695 134696 134709]
cluste

In [19]:
########################################
# Part 9: Grid Search for Weighted Hybrid Model
########################################
# We combine CF and content features using a weight: 
# Weighted Hybrid = [α × CF_latent, (1-α) × content_scaled]
alphas = [0.6, 0.7, 0.8, 0.9, 1.0]
hit_rates = []
for alpha in alphas:
    weighted_hybrid_vectors = np.concatenate([alpha * cf_latent, (1 - alpha) * content_scaled], axis=1)
    weighted_hybrid_dict = {df.iloc[i]["id"]: weighted_hybrid_vectors[i] for i in range(len(df))}
    hits = 0
    for i in sample_indices:
        gt = ground_truth_top(i)
        target_id = all_track_ids[i]
        recs = [rec for rec, score in recommend_hybrid(target_id, weighted_hybrid_dict, top_n=10)]
        if gt in recs:
            hits += 1
    hit_rate = hits / sample_size
    hit_rates.append(hit_rate)
    print(f"Alpha: {alpha:.1f} -> Hit Rate: {hit_rate:.4f}")

Alpha: 0.6 -> Hit Rate: 0.0680
Alpha: 0.7 -> Hit Rate: 0.0680


KeyboardInterrupt: 

In [None]:
# Plot Hit Rate vs. Alpha
plt.figure(figsize=(8, 6))
plt.plot(alphas, hit_rates, marker='o', linestyle='-')
plt.xlabel("Alpha (CF weight)")
plt.ylabel("Hit Rate (Top-10)")
plt.title("Hit Rate vs. Alpha for Weighted Hybrid Model")
plt.grid(True)
plt.show()