In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import json
import time
from openai import OpenAI
import os
from dotenv import load_dotenv

In [20]:
# Load environment variables from .env file
load_dotenv()
# Get the API key from the environment variable
api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# Load the dataset
df = pd.read_csv("spotifydata.csv")
df.head()


Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,14/06/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),13/12/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),05/07/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,19/07/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),05/03/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [4]:
# Select relevant features for content-based filtering
features = ["danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", 
            "instrumentalness", "liveness", "valence", "tempo", "duration_ms"]

# Drop non-numeric and irrelevant columns
df_filtered = df[["track_id", "track_name", "track_artist"] + features]

# Remove duplicates based on track_name
# df_filtered = df_filtered.drop_duplicates(subset="track_name")

# Handle missing values
df_filtered.dropna(inplace=True)

# Normalize feature values
scaler = StandardScaler()
df_filtered[features] = scaler.fit_transform(df_filtered[features])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[features] = scaler.fit_transform(df_filtered[features])


In [5]:
# Compute cosine similarity
similarity_matrix = cosine_similarity(df_filtered[features])

In [6]:
# Function to recommend tracks
def recommend_tracks(track_name, n=5):
    if track_name not in df_filtered["track_name"].values:
        return "Track not found in the dataset."
    
    idx = df_filtered[df_filtered["track_name"] == track_name].index[0]
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get recommended tracks and also capture their similarity scores
    recommended = []
    for i in similarity_scores[1:n+1]:
        rec_track = df_filtered.iloc[i[0]]["track_name"]
        rec_track_artist = df_filtered.iloc[i[0]]["track_artist"]
        score = i[1]
        recommended.append((rec_track, score))
    return recommended

In [16]:
input_track = "SUBEME LA RADIO"

In [17]:
# Example usage: Get recommended tracks for a specific track.
recommended_tracks = recommend_tracks(input_track)
print("Recommended Tracks and Similarity Scores:")
print(recommended_tracks)

Recommended Tracks and Similarity Scores:
[('El Perdedor', 0.9999999999999999), ('Pecado y Religión', 0.9577140959801139), ('Me Reclama', 0.9534077150064924), ('Piensan', 0.9443708331433276), ('Piensan', 0.9443708331433276)]


In [18]:
# Build a string that includes recommended tracks and their cosine similarity scores.
report_text = f"For the track {input_track}', the following tracks were recommended based on cosine similarity:\n\n"
for track, score in recommended_tracks:
    report_text += f"- {track} (Similarity Score: {score:.3f})\n"
report_text += "\nPlease explain why these songs were recommended based on the cosine similarity of features such as danceability, energy, tempo, and others. Take about each of the songs that were recommended in a report format and relate it to their similarity score. Approach it like your talking to users that use music streaming platforms like Spotify."

In [21]:
# Step 1: Initialize OpenAI client
client = OpenAI(api_key=api_key)


In [20]:
# Create an assistant
assistant = client.beta.assistants.create(
    name="SpotifyAiAssistant",
    instructions="Assist in creating and improving a recommendation system using cosine similarity to recommend tracks based on feature similarity, and provide explanations of the recommendations in an easily understandable way for users. Answer questions briefly, in a sentence or less.",
    model="gpt-4o",
)

In [21]:
# Step 2: Create a thread for the conversation
thread = client.beta.threads.create()

# Step 3: Create a user message with the report text as input.
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=report_text,
)

In [22]:
# Step 4: Execute the run for the assistant
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant.id,
)

In [23]:
# Step 5: Wait for the run to complete
time.sleep(5)

In [24]:
# Step 6: Retrieve the messages added after the user message
messages = client.beta.threads.messages.list(
    thread_id=thread.id,
    order="asc",  # Or "desc" if you want the newest first
    after=message.id
)

# Check if messages are returned
print("Messages returned:", messages.data)


Messages returned: [Message(id='msg_7DlmrJeySqdoKBGSEnSlBXfL', assistant_id='asst_sp0SRbXgzoi2uKNPPHhi6JMK', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='Certainly! Here\'s a breakdown of the recommended tracks for "SUBEME LA RADIO," explained in a user-friendly way:\n\n**1. El Perdedor (Similarity Score: 1.000)**\n- **Explanation:** "El Perdedor" is an excellent match because it shares almost identical features with "SUBEME LA RADIO," making it a perfect choice if you enjoyed the danceable rhythms and energetic vibes.\n\n**2. Pecado y Religión (Similarity Score: 0.958)**\n- **Explanation:** With a high similarity score, "Pecado y Religión" closely matches the song\'s danceability and tempo, providing a similarly engaging beat and energy level.\n\n**3. Me Reclama (Similarity Score: 0.953)**\n- **Explanation:** Known for a similar dynamic feel and catchy tempo, "Me Reclama" promises an energetic and lively listening experience akin to "SU

In [25]:
# If there are messages, proceed with extracting the reply
if messages.data:
    message = messages.data[0]  # Access the first message in the list
    ai_reply = message.content[0].text.value
    print("AI Explanation Report:")
    print(ai_reply)
else:
    print("No messages found yet.")

AI Explanation Report:
Certainly! Here's a breakdown of the recommended tracks for "SUBEME LA RADIO," explained in a user-friendly way:

**1. El Perdedor (Similarity Score: 1.000)**
- **Explanation:** "El Perdedor" is an excellent match because it shares almost identical features with "SUBEME LA RADIO," making it a perfect choice if you enjoyed the danceable rhythms and energetic vibes.

**2. Pecado y Religión (Similarity Score: 0.958)**
- **Explanation:** With a high similarity score, "Pecado y Religión" closely matches the song's danceability and tempo, providing a similarly engaging beat and energy level.

**3. Me Reclama (Similarity Score: 0.953)**
- **Explanation:** Known for a similar dynamic feel and catchy tempo, "Me Reclama" promises an energetic and lively listening experience akin to "SUBEME LA RADIO."

**4. Piensan (Similarity Score: 0.944)**
- **Explanation:** Despite appearing twice, "Piensan" consistently matches the key characteristics of energy and rhythm, offering lis