# Section 3 : Vector Search (Song Recommendation)
- In this section you'll see how to:
    - Take a playlist link from a user and gather the URIs of all the tracks in the playlist;
    - Aggregate the playlist's tracks into a single playlist vector;
    - Use the playlist vector to run a similarity search against the vector database and return the 20 most similar tracks;
    - Recommend these 20 songs, which are the most likely to fit into the playlist, to the user.

In [1]:
import numpy as np
import pandas as pd
import random

from scipy.spatial import distance 
from sklearn.preprocessing import StandardScaler
import time
import urllib.parse as parse
from sklearn.preprocessing import StandardScaler
import pickle

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os

import weaviate
import weaviate.classes.config as wc
import weaviate.classes.query as wq
from weaviate.util import generate_uuid5

In [2]:
# Register the service credentials as environment variables.
# Replace every "enter_credentials" placeholder with your own value before running.
os.environ.update({
    'WCS_URL': "enter_credentials",
    'WCS_API_KEY': "enter_credentials",
    'OPENAI_APIKEY': "enter_credentials",
    'SPOTIPY_CLIENT_ID': 'enter_credentials',  # "SPOTIPY" is not a typo
    'SPOTIPY_CLIENT_SECRET': 'enter_credentials',
    'SPOTIPY_REDIRECT_URI': 'enter_credentials',
})

# Spotify client (client-credentials flow); also used by the recommendation cell further down
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [3]:
# Import the saved scaler for use
import pickle
scalerfile = 'resources/scaler.sav'
# Open the file in a context manager so the handle is closed as soon as the
# scaler has been loaded (the bare open() call left it open).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a scaler file that comes from a trusted source.
with open(scalerfile, 'rb') as scaler_fh:
    scaler = pickle.load(scaler_fh)

  scaler = pickle.load(open(scalerfile, 'rb'))


In [4]:
# Connect to the Weaviate cluster, passing the OpenAI key along as an extra header.
headers = {"X-OpenAI-Api-Key": os.environ['OPENAI_APIKEY']}  # Replace with your OpenAI API key
wcs_url = os.environ['WCS_URL']  # Replace with your WCS URL
wcs_auth = weaviate.auth.AuthApiKey(os.environ['WCS_API_KEY'])  # Replace with your WCS key

client = weaviate.connect_to_wcs(
    cluster_url=wcs_url,
    auth_credentials=wcs_auth,
    headers=headers,
)

try: 
    # Function which collapses a playlist into one row: continuous features are averaged, while the categorical ones (key, mode, time_signature) take their most frequent value and are one-hot encoded
    def playlist_mean(df):
        """Collapse a playlist's track features into a single summary row.

        Continuous audio features are averaged.  The categorical ``key``,
        ``mode`` and ``time_signature`` are reduced to their most frequent
        value and then one-hot encoded.

        Args:
            df: DataFrame with one row per track, holding the continuous
                feature columns listed in ``continuous_cols`` below plus
                ``key``, ``mode`` and ``time_signature``.

        Returns:
            A one-row DataFrame (index 0) whose columns are, in order: the
            feature means, ``key`` (as a string), ``mode``, ``time_signature``,
            ``key_none``/``key_0``..``key_11``, ``mode_minor``/``mode_major``
            and ``time_signature_0``..``time_signature_7``.

        Raises:
            IndexError: if ``key``, ``mode`` or ``time_signature`` has no
                non-null value to take the mode of (e.g. an empty playlist).
        """
        continuous_cols = ['danceability', 'energy', 'loudness', 'speechiness',
                           'acousticness', 'instrumentalness', 'liveness',
                           'valence', 'tempo', 'duration_ms']
        row = {col: df[col].mean() for col in continuous_cols}

        # Most frequent value of each categorical feature, as plain ints so the
        # comparisons and column names built below are unambiguous.
        key = int(df['key'].mode().iloc[0])
        mode = int(df['mode'].mode().iloc[0])
        time_signature = int(df['time_signature'].mode().iloc[0])
        row['key'] = str(key)  # this column has always been returned as a string
        row['mode'] = mode
        row['time_signature'] = time_signature

        # Start with every indicator column at 0 ...
        indicator_cols = (['key_none'] + [f'key_{k}' for k in range(12)]
                          + ['mode_minor', 'mode_major']
                          + [f'time_signature_{t}' for t in range(8)])
        row.update(dict.fromkeys(indicator_cols, 0))

        # ... then switch on the ones that describe this playlist.
        # The test is done on the integer key: comparing the stringified key
        # with -1 can never match, which left key_none at 0 and created a
        # stray "key_-1" column instead.
        if key == -1:
            row['key_none'] = 1
        else:
            row[f'key_{key}'] = 1

        row[f'time_signature_{time_signature}'] = 1

        if mode == 0:
            row['mode_minor'] = 1
        else:
            row['mode_major'] = 1

        return pd.DataFrame([row])
    
    # Function which collapses a playlist into one row holding the mean and standard deviation of each continuous feature; the categorical ones (key, mode, time_signature) take their most frequent value and are one-hot encoded
    def playlist_mean_std(df):
        """Collapse a playlist into one row of feature means and standard deviations.

        Each continuous audio feature contributes two columns: its mean and,
        right after it, ``<feature>_std`` (pandas' default sample standard
        deviation, which is NaN for a single-track playlist).  The categorical
        ``key``, ``mode`` and ``time_signature`` are reduced to their most
        frequent value and then one-hot encoded.

        Args:
            df: DataFrame with one row per track, holding the continuous
                feature columns listed in ``continuous_cols`` below plus
                ``key``, ``mode`` and ``time_signature``.

        Returns:
            A one-row DataFrame (index 0) whose columns are, in order: the
            mean/std pairs, ``key`` (as a string), ``mode``,
            ``time_signature``, ``key_none``/``key_0``..``key_11``,
            ``mode_minor``/``mode_major`` and
            ``time_signature_0``..``time_signature_7``.

        Raises:
            IndexError: if ``key``, ``mode`` or ``time_signature`` has no
                non-null value to take the mode of (e.g. an empty playlist).
        """
        continuous_cols = ['danceability', 'energy', 'loudness', 'speechiness',
                           'acousticness', 'instrumentalness', 'liveness',
                           'valence', 'tempo', 'duration_ms']
        row = {}
        for col in continuous_cols:
            row[col] = df[col].mean()
            row[f'{col}_std'] = df[col].std()

        # Most frequent value of each categorical feature, as plain ints so the
        # comparisons and column names built below are unambiguous.
        key = int(df['key'].mode().iloc[0])
        mode = int(df['mode'].mode().iloc[0])
        time_signature = int(df['time_signature'].mode().iloc[0])
        row['key'] = str(key)  # this column has always been returned as a string
        row['mode'] = mode
        row['time_signature'] = time_signature

        # Start with every indicator column at 0 ...
        indicator_cols = (['key_none'] + [f'key_{k}' for k in range(12)]
                          + ['mode_minor', 'mode_major']
                          + [f'time_signature_{t}' for t in range(8)])
        row.update(dict.fromkeys(indicator_cols, 0))

        # ... then switch on the ones that describe this playlist.
        # The test is done on the integer key: comparing the stringified key
        # with -1 can never match, which left key_none at 0 and created a
        # stray "key_-1" column instead.
        if key == -1:
            row['key_none'] = 1
        else:
            row[f'key_{key}'] = 1

        row[f'time_signature_{time_signature}'] = 1

        if mode == 0:
            row['mode_minor'] = 1
        else:
            row['mode_major'] = 1

        return pd.DataFrame([row])
    
    # Function used to create dummy variables for input to ML model from base track feature data
    def dummy_variables(data):
        """Add one-hot indicator entries for key, mode and time signature to each track.

        Every dict in ``data`` is modified in place; nothing is returned.  Each
        dict must already contain ``key``, ``mode`` and ``time_signature``.

        Args:
            data: iterable of per-track feature dicts.
        """
        key_columns = ['key_none'] + ['key_' + str(n) for n in range(12)]
        minor_column, major_column = 'mode_minor', 'mode_major'
        signature_columns = ['time_signature_' + str(n) for n in range(8)]

        # key_none stands for key == -1, key_0 for key == 0, ... key_11 for key == 11
        for key_value, column in enumerate(key_columns, start=-1):
            for track in data:
                track[column] = 1 if track['key'] == key_value else 0

        # mode == 1 is flagged as major; any other value is flagged as minor
        for track in data:
            is_major = track['mode'] == 1
            track[minor_column] = 0 if is_major else 1
            track[major_column] = 1 if is_major else 0

        # time_signature_N is set when time_signature == N, for N in 0..7
        for signature_value, column in enumerate(signature_columns):
            for track in data:
                track[column] = 1 if track['time_signature'] == signature_value else 0
    
    # Function used to gather playlist information from Spotify
    def gather_playlist_data(playlist_uri):
        """Return the track URIs of a Spotify playlist.

        The playlist is read page by page (100 items per request) until a
        short page is returned.  At most 10 pages are requested, so only the
        first 1000 playlist items are considered.

        Args:
            playlist_uri: playlist identifier handed to ``spotify.playlist_items``.

        Returns:
            List of track URI strings in playlist order.
        """
        spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())
        page_size = 100
        max_items = 1000
        track_uris = []
        for offset in range(0, max_items, page_size):
            page_items = spotify.playlist_items(playlist_uri, offset=offset, limit=page_size)['items']
            for item in page_items:
                track = item.get('track')
                # An item can come back without a track object (NOTE(review):
                # presumably removed or unavailable tracks -- confirm against
                # the Spotify API docs).  Skip it rather than fail with a
                # TypeError on None['uri'].
                if track and track.get('uri'):
                    track_uris.append(track['uri'])
            # A short page means the end of the playlist has been reached
            if len(page_items) < page_size:
                break

        return track_uris

    # Function used to gather track feature data
    def gather_track_features(uri_track_list):
        """Fetch Spotify audio features for a list of track URIs.

        The URIs are sent to ``spotify.audio_features`` in batches of up to 100,
        pausing 0.5 s after every batch.

        Args:
            uri_track_list: list of track URIs.

        Returns:
            The concatenation of the lists returned for each batch, in request order.
        """
        spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())
        batch_size = 100
        total = len(uri_track_list)
        results_full = []
        start = 0
        while start < total:
            # Slicing clamps at the end of the list, so the final, shorter
            # batch needs no special handling.
            batch = uri_track_list[start:start + batch_size]
            results_full = results_full + spotify.audio_features(batch)
            time.sleep(0.5)
            start += batch_size

        return results_full
        
    # Function for query vector 
    def query(playlist_uri):
        """Build the scaled query vector that represents a Spotify playlist.

        Steps: collect the playlist's track URIs, fetch their audio features,
        add the one-hot indicator columns, reduce the playlist to a single mean
        row with ``playlist_mean`` and scale that row with the module-level
        ``scaler``.

        Side effects:
            Sets two module-level globals:
            ``user_playist_track_uri`` (the playlist's track URIs; the spelling
            is kept because code outside this function reads that name) and
            ``user_playlist_vector_df`` (the per-track feature DataFrame).

        Args:
            playlist_uri: playlist link or URI; it is passed unchanged to
                ``gather_playlist_data``.

        Returns:
            The first row of ``scaler.transform(...)`` converted to a Python list.

        Raises:
            ValueError: if no audio features could be gathered for the playlist.
        """
        global user_playist_track_uri
        global user_playlist_vector_df

        # Gather all track URIs from the user's playlist
        user_playist_track_uri = gather_playlist_data(playlist_uri)

        # Request the audio features of every track in the playlist
        data = gather_track_features(user_playist_track_uri)

        # NOTE(review): the features list may contain None placeholders for
        # tracks Spotify has no features for (confirm against the spotipy docs).
        # Drop them, since the steps below index into every entry.
        data = [features for features in data if features is not None]
        if not data:
            raise ValueError("No audio features could be retrieved for this playlist")

        # Add the one-hot indicator entries to each track (in place)
        dummy_variables(data)

        feature_columns = [
            'danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'key', 'mode', 'time_signature',
            'mode_minor', 'mode_major', 'key_none', 'key_0', 'key_1',
            'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8',
            'key_9', 'key_10', 'key_11', 'time_signature_0', 'time_signature_1',
            'time_signature_2', 'time_signature_3', 'time_signature_4', 'time_signature_5',
            'time_signature_6', 'time_signature_7',
        ]
        # Per-track feature table, exposed as a global for use outside this function
        user_playlist_vector_df = pd.DataFrame(data, columns=feature_columns)

        # Reduce the playlist to one summary row
        mean_user_playlist_vector_df = playlist_mean(user_playlist_vector_df)
        # Drop the raw categorical columns, which are not part of the scaler's input
        mean_user_playlist_vector_df = mean_user_playlist_vector_df.drop(['key', 'mode', 'time_signature'], axis=1)
        scaled_mean_user_playlist_data = scaler.transform(mean_user_playlist_vector_df)

        playlist_vector = scaled_mean_user_playlist_data.tolist()[0]
        return playlist_vector
        
    # Playlist to build recommendations for
    playlist_uri = "https://open.spotify.com/playlist/3uq1jrrLNf3nbqO9cV4NCK?si=cdb54672a7a74d89" # Here I'm using a work-out song album. Feel free to use your own album uri
    # Turn the playlist into a query vector
    query_vector = query(playlist_uri)
    # Search the "Tracks" collection for the 20 objects nearest to that vector,
    # asking for the distance of each hit to be returned as metadata
    tracks = client.collections.get("Tracks")
    distance_metadata = wq.MetadataQuery(distance=True)
    response = tracks.query.near_vector(
        near_vector=query_vector,  # A list of floating point numbers
        limit=20,
        return_metadata=distance_metadata,
    )

    # Print every hit (track URI and distance to the query) and collect the URIs
    recommend_track_uri_list = []
    for hit in response.objects:
        track_uri = hit.properties["track_uri"]
        print(track_uri)
        print(f"Distance to query: {hit.metadata.distance:.3f}\n")
        recommend_track_uri_list.append(track_uri)
finally:
    client.close()



spotify:track:7LrzljpGbiAAZv49dgpdgX
Distance to query: 0.004

spotify:track:0sxvP9bpa1HwKUHHxycXDD
Distance to query: 0.006

spotify:track:19zB4G07FM9qUv4Bu7u00K
Distance to query: 0.007

spotify:track:6PRepYglwH3CUJHjMmi4iU
Distance to query: 0.007

spotify:track:4FNIdikmTbmOJUcHpVz5q0
Distance to query: 0.008

spotify:track:2yYAssiozt8n4XLHF76BhY
Distance to query: 0.008

spotify:track:0IEaVu1xwNXlHxOISKQM7f
Distance to query: 0.008

spotify:track:4FS4vam8F127BVUIbwGz7H
Distance to query: 0.008

spotify:track:1xpATRwB1iCXJcQS6ZIv0S
Distance to query: 0.009

spotify:track:1uX5xnLjG5s1J7trXE3hhb
Distance to query: 0.009

spotify:track:1UEnkCAFczvnRbr9MLvPGA
Distance to query: 0.009

spotify:track:16g2NeAvQclqpZFXYsosMg
Distance to query: 0.009

spotify:track:2cbqKiWhE4tmaWpehE5Yys
Distance to query: 0.009

spotify:track:0PnIaEIySANyjaLQc2v0UJ
Distance to query: 0.010

spotify:track:6Oqa0Hw8LdPxZAuLtc3jIA
Distance to query: 0.010

spotify:track:5xSovooilmqwv2NwGQlJWZ
Distance to query:

  return lib.map_infer(values, mapper, convert=convert)
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [5]:
# Drop recommended songs that already exist in the user's playlist.
# The mask is built from the column (a Series) so that matching ROWS are removed;
# masking with the whole DataFrame only blanks matches to NaN, and those NaN
# values would then be sent to Spotify as track ids.
recommend_track_uri_df = pd.DataFrame({0: recommend_track_uri_list})
recommend_track_uri_df = recommend_track_uri_df[~recommend_track_uri_df[0].isin(user_playist_track_uri)]
recommend_track_list = recommend_track_uri_df[0].tolist()

# Gather information from Spotify on the songs left to recommend
# (no request is made when every recommendation was filtered out)
tracks_info = spotify.tracks(recommend_track_list) if recommend_track_list else {'tracks': []}

# Compile recommended track data
results = []
for track in tracks_info['tracks']:
    if track is None:
        # NOTE(review): presumably returned for ids Spotify cannot resolve -- confirm
        continue
    album_images = track['album']['images']
    results.append({
        'Track URI': track['uri'],
        # Use None when the album has no cover image instead of raising IndexError
        'Album Cover': album_images[0]['url'] if album_images else None,
        'Track Name': track['name'],
        'Artist Name': track['artists'][0]['name'],
        'Preview URL': track['preview_url'],
    })
# Check the recommendation results
results

[{'Track URI': 'spotify:track:7LrzljpGbiAAZv49dgpdgX',
  'Album Cover': 'https://i.scdn.co/image/ab67616d0000b273aced39a123f6fbf5a74fc826',
  'Track Name': 'Move',
  'Artist Name': 'G',
  'Preview URL': 'https://p.scdn.co/mp3-preview/f9f02ab4bd88ce0d95aebe5903cd37933ac9064d?cid=9f172ceff97148c787ef9e867e28a19f'},
 {'Track URI': 'spotify:track:0sxvP9bpa1HwKUHHxycXDD',
  'Album Cover': 'https://i.scdn.co/image/ab67616d0000b273f55d4c4fd64cbe9932a00bc2',
  'Track Name': "All The Things You Said You'd Never Tell",
  'Artist Name': 'Cold World',
  'Preview URL': None},
 {'Track URI': 'spotify:track:19zB4G07FM9qUv4Bu7u00K',
  'Album Cover': 'https://i.scdn.co/image/ab67616d0000b273588f7fd6dd4c05e1391031e5',
  'Track Name': 'We Are One - Radio Edit',
  'Artist Name': 'DJ Assad',
  'Preview URL': 'https://p.scdn.co/mp3-preview/5f9199df23fa44d5dbddb35d6ee6a1a7c3dadaa4?cid=9f172ceff97148c787ef9e867e28a19f'},
 {'Track URI': 'spotify:track:6PRepYglwH3CUJHjMmi4iU',
  'Album Cover': 'https://i.scdn.c