In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [3]:
file_path = '/mnt/data/spotifytoptracks.csv'
updated_data = pd.read_csv('spotifytoptracks.csv')

In [7]:

# Load the updated dataset (after normal preprocessing)
file_path = '/mnt/data/spotifytoptracks.csv'
updated_data = pd.read_csv('spotifytoptracks.csv')

# Feature selection for the model
model_features = [
    'Energy', 'Danceability', 'Key', 'Loudness', 'Acousticness',
    'Speechiness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Song Duration'
]
model_data = updated_data[model_features]

# Handle missing values
model_data.fillna(model_data.mean(), inplace=True)

# Scale the features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(model_data)
scaled_model_data = pd.DataFrame(scaled_data, columns=model_features)

# Display the preprocessed data (optional)
print("Preprocessing complete. Scaled data is ready.")


Preprocessing complete. Scaled data is ready.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data.fillna(model_data.mean(), inplace=True)


In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Step 1: Load Preprocessed Data
scaled_model_data = pd.read_csv('scaled_model_data.csv')
original_data = pd.read_csv('spotifytoptracks.csv')

# Ensure 'Track Name' column exists for indexing
track_names = original_data['Track Name']

# Step 2: Compute Cosine Similarity
cosine_similarity_matrix = cosine_similarity(scaled_model_data.values)
cosine_sim_df = pd.DataFrame(
    cosine_similarity_matrix,
    index=track_names,
    columns=track_names
)

# Step 3: Compute Euclidean Similarity (Negative Euclidean Distances)
euclidean_similarity_matrix = -euclidean_distances(scaled_model_data.values)
euclidean_sim_df = pd.DataFrame(
    euclidean_similarity_matrix,
    index=track_names,
    columns=track_names
)

# Optional: Save Similarity Matrices for Future Use
cosine_sim_df.to_csv('cosine_similarity_matrix.csv')
euclidean_sim_df.to_csv('euclidean_similarity_matrix.csv')

# Step 4: Combined Recommendation Logic
def combined_recommendations(song_name, data, cosine_df, euclidean_df, recommendation_type='By Similarity', top_n=10):
    """
    Provide recommendations based on similarity, artist, or genre.

    Parameters:
    - song_name (str): Name of the input song.
    - data (DataFrame): Original dataset with artist and genre information.
    - cosine_df (DataFrame): Cosine similarity matrix.
    - euclidean_df (DataFrame): Euclidean similarity matrix.
    - recommendation_type (str): Type of recommendation ('By Similarity', 'By Artist', 'By Genre').
    - top_n (int): Number of recommendations to return.

    Returns:
    - DataFrame or str: Recommendations with track names and similarity scores, or an error message.
    """
    # Similarity-based recommendations
    if recommendation_type == 'By Similarity':
        if song_name not in cosine_df.index or song_name not in euclidean_df.index:
            return f"Song '{song_name}' not found in the dataset."

        # Combine recommendations from cosine and Euclidean similarity matrices
        cosine_recs = cosine_df[song_name].sort_values(ascending=False).iloc[1:top_n//2 + 1]
        euclidean_recs = euclidean_df[song_name].sort_values(ascending=False).iloc[1:top_n//2 + 1]

        combined = pd.concat([cosine_recs, euclidean_recs]).groupby(level=0).mean().sort_values(ascending=False)
        combined = combined.head(top_n)

        return combined.reset_index().rename(columns={'index': 'Track Name', 0: 'Similarity Score'})

    # Artist-based recommendations
    elif recommendation_type == 'By Artist':
        if song_name not in data['Track Name'].values:
            return f"Song '{song_name}' not found in the dataset."

        # Get the artist of the input song
        artist = data.loc[data['Track Name'] == song_name, 'Artist'].values[0]

        # Filter songs by the same artist
        artist_songs = data[data['Artist'] == artist]

        # Rank by similarity using the cosine similarity matrix
        recommendations = []
        for track in artist_songs['Track Name']:
            if track in cosine_df.index:
                similarity_score = cosine_df[song_name][track]
                recommendations.append((track, similarity_score))

        # Sort by similarity score
        recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
        return pd.DataFrame(recommendations, columns=['Track Name', 'Similarity Score'])

    # Genre-based recommendations
    elif recommendation_type == 'By Genre':
        if song_name not in data['Track Name'].values:
            return f"Song '{song_name}' not found in the dataset."

        # Get the genre of the input song
        genre = data.loc[data['Track Name'] == song_name, 'Genre'].values[0]

        # Filter songs by the same genre
        genre_songs = data[data['Genre'] == genre]

        # Rank by similarity using the cosine similarity matrix
        recommendations = []
        for track in genre_songs['Track Name']:
            if track in cosine_df.index:
                similarity_score = cosine_df[song_name][track]
                recommendations.append((track, similarity_score))

        # Sort by similarity score
        recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
        return pd.DataFrame(recommendations, columns=['Track Name', 'Similarity Score'])

    # Invalid recommendation type
    else:
        return "Invalid recommendation type. Choose from 'By Similarity', 'By Artist', or 'By Genre'."

# Example Test for Cosine Similarity Matrix
print("Cosine Similarity Matrix (Sample):")
print(cosine_sim_df.head())

# Example Usage
recommendations = combined_recommendations(
    song_name="Blinding Lights",
    data=original_data,
    cosine_df=cosine_sim_df,
    euclidean_df=euclidean_sim_df,
    recommendation_type='By Similarity',
    top_n=10
)

print("Recommendations:")
print(recommendations)


Cosine Similarity Matrix (Sample):
Track Name             Blinding Lights  Dance Monkey   The Box  \
Track Name                                                       
Blinding Lights               1.000000     -0.642558 -0.422577   
Dance Monkey                 -0.642558      1.000000  0.013429   
The Box                      -0.422577      0.013429  1.000000   
Roses - Imanbek Remix        -0.189412     -0.274946  0.485386   
Don't Start Now              -0.067298     -0.225445  0.129910   

Track Name             Roses - Imanbek Remix  Don't Start Now  \
Track Name                                                      
Blinding Lights                    -0.189412        -0.067298   
Dance Monkey                       -0.274946        -0.225445   
The Box                             0.485386         0.129910   
Roses - Imanbek Remix               1.000000         0.725161   
Don't Start Now                     0.725161         1.000000   

Track Name             ROCKSTAR (feat. Roddy R

In [8]:
# Save cosine similarity matrix to a CSV file
cosine_sim_df.to_csv('cosine_similarity_matrix.csv')
print("Cosine similarity matrix saved as 'cosine_similarity_matrix.csv'.")
# Save Euclidean similarity matrix to a CSV file
euclidean_sim_df.to_csv('euclidean_similarity_matrix.csv')
print("Euclidean similarity matrix saved as 'euclidean_similarity_matrix.csv'.")


Cosine similarity matrix saved as 'cosine_similarity_matrix.csv'.
Euclidean similarity matrix saved as 'euclidean_similarity_matrix.csv'.
