In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import joblib
# Only for the app.py part

In [31]:

# --- 1. Load Data ---
url = "https://raw.githubusercontent.com/charith3903/context-aware-music-recommender/refs/heads/main/contextual_spotify_dataset.csv"
df = pd.read_csv(url)

print("Original Data Head:")
print(df.head())
print("\nShape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing values:\n", df.isnull().sum())



Original Data Head:
         username                track_id    track_name    track_artist  \
0  cassandrakline  697MjF1454XKvZmTuqkWmD    I Miss You          Jeriqo   
1         ebutler  3x2bXiU0o4WbsPkawXlfDA   Who Are You         The Who   
2     gravesaaron  0jEaPBjFAWjQTMVouRwaHi         Happy  The Beef Seeds   
3        nathan50  5EKUb1FKsyYVaSXb41YBIj           ONE      Rev Theory   
4          mjones  300DUx4tdtCdGEUXR032jA  Palace/Curse    The Internet   

   track_popularity          track_album_id          track_album_name  \
0                45  2vJ6FDg6ZMS56U8Wbiw2Oz                I Miss You   
1                17  6LRJF97hgXHj8uMLHyCDbh  Who Are You (Remastered)   
2                30  4IQn9XpweytNX2cUe2NBUH          Keepin' it Beefy   
3                35  0gGic19XvEiHKKWBV7M4YM                       ONE   
4                62  69g3CtOVg98TPOwqmI2K7Q                 Ego Death   

  track_album_release_date            playlist_name             playlist_id  \
0          

In [32]:
# --- 2. Feature Engineering and Preprocessing ---
label_cols = ['activity', 'location', 'time_of_day', 'username']
encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le


In [33]:
# Store a mapping of encoded track_name to original track_name for recommendations
# We need to encode 'track_name' to use it as a 'class' if we were doing classification,
# but for similarity, we just need its original value for output.
# However, we will use it for inverse transformation in the recommendation function.
track_name_encoder = LabelEncoder()
df['track_name_encoded'] = track_name_encoder.fit_transform(df['track_name'])
encoders['track_name'] = track_name_encoder # Store it for later decoding

In [34]:
numeric_features = ['tempo', 'energy', 'valence', 'danceability', 'speechiness', 'acousticness', 'instrumentalness', 'liveness']
context_features = ['activity', 'location', 'time_of_day']
user_feature = ['username']

In [35]:
# Combine all features for the recommendation engine
all_features_for_recommendation = context_features + numeric_features + user_feature

# Scale numeric features
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])
joblib.dump(scaler, 'scaler.pkl') # Save the scaler

['scaler.pkl']

In [36]:
# --- 3. Save Processed Data and Encoders ---
# We no longer train a RandomForestClassifier for direct 'track_name' prediction.
# Instead, we'll use the processed DataFrame and encoders for similarity-based recommendations.
joblib.dump(df[all_features_for_recommendation], 'processed_features.pkl')
joblib.dump(df['track_name'], 'original_track_names.pkl')
joblib.dump(encoders, 'encoders.pkl')

print("\nProcessed data and encoders saved successfully.")
print("\nUnique activities (after encoding):", df['activity'].unique())
print("Unique locations (after encoding):", df['location'].unique())
print("Unique times (after encoding):", df['time_of_day'].unique())



Processed data and encoders saved successfully.

Unique activities (after encoding): [0 2 3 4 1]
Unique locations (after encoding): [2 3 0 1 4]
Unique times (after encoding): [2 1 0 3]


In [37]:
# --- 4. Recommendation Function ---
def recommend_playlist(activity, location, time_of_day, tempo, energy, valence, danceability, speechiness, acousticness, instrumentalness, liveness, username=None, top_n=5):
    # Load necessary components within the function for a standalone script/app
    try:
        loaded_encoders = joblib.load('encoders.pkl')
        loaded_processed_features = joblib.load('processed_features.pkl')
        loaded_original_track_names = joblib.load('original_track_names.pkl')
        loaded_scaler = joblib.load('scaler.pkl')
    except FileNotFoundError:
        return ["Error: Model files not found. Please run the full script to generate them."]

    try:
        # Encode categorical inputs
        encoded_activity = loaded_encoders['activity'].transform([activity])[0]
        encoded_location = loaded_encoders['location'].transform([location])[0]
        encoded_time_of_day = loaded_encoders['time_of_day'].transform([time_of_day])[0]

        # Handle unknown username: assign a default value (e.g., 0)
        if username and username in loaded_encoders['username'].classes_:
            encoded_username = loaded_encoders['username'].transform([username])[0]
        else:
            encoded_username = 0 # Default for unknown users or no username provided

        # Create a DataFrame for the single user input
        user_input_df = pd.DataFrame([{
            'activity': encoded_activity,
            'location': encoded_location,
            'time_of_day': encoded_time_of_day,
            'tempo': tempo,
            'energy': energy,
            'valence': valence,
            'danceability': danceability,
            'speechiness': speechiness,
            'acousticness': acousticness,
            'instrumentalness': instrumentalness,
            'liveness': liveness,
            'username': encoded_username
        }])

        # Scale the numeric features of the user input
        numeric_cols_user_input = ['tempo', 'energy', 'valence', 'danceability', 'speechiness', 'acousticness', 'instrumentalness', 'liveness']
        user_input_df[numeric_cols_user_input] = loaded_scaler.transform(user_input_df[numeric_cols_user_input])

        # Calculate cosine similarity between user input and all songs in the dataset
        # Ensure column order matches
        dataset_features = loaded_processed_features[all_features_for_recommendation]
        similarities = cosine_similarity(user_input_df, dataset_features)

        # Get top N song indices based on similarity
        # Use .flatten() because cosine_similarity returns a 2D array
        top_n_indices = np.argsort(similarities.flatten())[::-1][:top_n]

        # Get the original track names corresponding to the top indices
        recommended_songs = loaded_original_track_names.iloc[top_n_indices].tolist()

        return recommended_songs

    except ValueError as e:
        return [f"❌ Error: {str(e)}. Make sure your input values are valid for the selected categories."]
    except Exception as e:
        return [f"An unexpected error occurred: {str(e)}"]

In [38]:
# --- 5. Test the Recommendation Function ---
print("\n--- Testing Recommendation Function ---")

print("\n🎧 Playlist for new user (exercising in gym, morning):")
playlist_new_user = recommend_playlist(
    activity="exercising",
    location="gym",
    time_of_day="morning",
    tempo=130,
    energy=0.9,
    valence=0.7,
    danceability=0.8,
    speechiness=0.05,
    acousticness=0.1,
    instrumentalness=0.001,
    liveness=0.15,
    top_n=5
)
for i, song in enumerate(playlist_new_user, 1):
    print(f"{i}. {song}")


--- Testing Recommendation Function ---

🎧 Playlist for new user (exercising in gym, morning):
1. Symphony (feat. Zara Larsson)
2. Dancin (feat. Luvli) - Krono Remix
3. Love
4. Can't Back Down
5. Used To Love (with Dean Lewis)


In [39]:

print("\n🎧 Playlist for existing user (studying in office, evening, username 'mjones'):")
playlist_old_user = recommend_playlist(
    activity="studying",
    location="office",
    time_of_day="evening",
    tempo=100,
    energy=0.4,
    valence=0.3,
    danceability=0.5,
    speechiness=0.1,
    acousticness=0.5,
    instrumentalness=0.01,
    liveness=0.1,
    username="mjones", # Use an existing username from your dataset for a more realistic test
    top_n=5
)
for i, song in enumerate(playlist_old_user, 1):
    print(f"{i}. {song}")


🎧 Playlist for existing user (studying in office, evening, username 'mjones'):
1. Girl
2. Changes - 2015 Remaster
3. It's You
4. As It Sets
5. Stranger


In [40]:
# --- NEW: Evaluation for Content-Based Recommender ---
print("\n--- Evaluating Content-Based Recommender (Hit Rate@K) ---")

# We'll simulate user interactions by taking existing data points as "true" preferences.
# For simplicity, let's use a small subset of the original dataframe as our "test set" for evaluation.
# In a real-world scenario, you'd want a separate held-out test set of user-item interactions.

# Split data to simulate evaluation on unseen (but known) instances
# Note: For content-based, the "training" is essentially creating the feature vectors,
# so the train/test split here is more for evaluating the *recommendation process*
# on data points not used to "build" the feature space.
_, df_eval = train_test_split(df, test_size=0.2, random_state=42) # Use df here, not X, y directly

top_k = 5
hits = 0

# It's important to use the original, unscaled values from df_eval to pass to recommend_playlist
# and let the function handle encoding/scaling internally as it would for a new user input.
for index, row in df_eval.iterrows():
    actual_track_name = row['track_name']
    recommended_songs = recommend_playlist(
        activity=encoders['activity'].inverse_transform([row['activity']])[0],
        location=encoders['location'].inverse_transform([row['location']])[0],
        time_of_day=encoders['time_of_day'].inverse_transform([row['time_of_day']])[0],
        tempo=scaler.inverse_transform(row[numeric_features].values.reshape(1, -1))[0][0], # Inverse scale for input
        energy=scaler.inverse_transform(row[numeric_features].values.reshape(1, -1))[0][1],
        valence=scaler.inverse_transform(row[numeric_features].values.reshape(1, -1))[0][2],
        danceability=scaler.inverse_transform(row[numeric_features].values.reshape(1, -1))[0][3],
        speechiness=scaler.inverse_transform(row[numeric_features].values.reshape(1, -1))[0][4],
        acousticness=scaler.inverse_transform(row[numeric_features].values.reshape(1, -1))[0][5],
        instrumentalness=scaler.inverse_transform(row[numeric_features].values.reshape(1, -1))[0][6],
        liveness=scaler.inverse_transform(row[numeric_features].values.reshape(1, -1))[0][7],
        username=encoders['username'].inverse_transform([row['username']])[0],
        top_n=top_k
    )

    if actual_track_name in recommended_songs:
        hits += 1

hit_rate = hits / len(df_eval)
print(f"\nHit Rate@{top_k}: {hit_rate:.4f} (meaning {hits} out of {len(df_eval)} actual songs were found in the top {top_k} recommendations)")



--- Evaluating Content-Based Recommender (Hit Rate@K) ---

Hit Rate@5: 1.0000 (meaning 200 out of 200 actual songs were found in the top 5 recommendations)
