### Installing necessary packages

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=5e17a772e43f7398b6132ae8e60bb04b81eee41ee7bf112dacc93b591e85afe9
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
!pip install load_dotenv

Collecting load_dotenv
  Downloading load_dotenv-0.1.0-py3-none-any.whl (7.2 kB)
Collecting python-dotenv (from load_dotenv)
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, load_dotenv
Successfully installed load_dotenv-0.1.0 python-dotenv-1.0.1


In [3]:
!pip install spotipy

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.4-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.0/252.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.4 spotipy-2.23.0


### Importing libraries

In [4]:
import pandas as pd
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StringType
import re
import pyspark.sql.functions as F
from pyspark.sql.functions import avg, max, min
from pyspark.sql.functions import mean, col, when
from pyspark.sql.functions import udf
import requests
from dotenv import load_dotenv
import os
import base64
from requests import post, get
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, MapType
import numpy as np
from pyspark.sql.functions import col, sqrt, sum as sql_sum
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics.pairwise import cosine_similarity
from pyspark.sql.functions import desc
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, collect_list, concat_ws
from pyspark.ml.stat import Summarizer

### Initializing a spark session.

In [5]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

spark = init_spark()

# Loading Dataset

In [6]:
from google.colab import drive
drive.mount('/content/drive/')
# spotfy_songs_df = spark.read.csv('/content/drive/MyDrive/SOEN471_files/spotify_1million_tracks_data.csv', header = True, inferSchema=True)
spotfy_songs_df = spark.read.option("delimiter", "\t").csv('/content/drive/MyDrive/SOEN471_files/spotify_1million_tracks_data_tab_sep.txt', header = True, inferSchema=True)

Mounted at /content/drive/


In [7]:
spotfy_songs_df.show(10,False)

+---+---------------------+------------------------------+----------------------+----------+----+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+
|_c0|artist_name          |track_name                    |track_id              |popularity|year|genre   |danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |duration_ms|time_signature|
+---+---------------------+------------------------------+----------------------+----------+----+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+
|0  |Jason Mraz           |I Won't Give Up               |53QF56cjZA9RTuuMZDrSA6|68        |2012|acoustic|0.483       |0.303 |4  |-10.058 |1   |0.0429     |0.694       |0.0             |0.115   |0.139  |133.406|240166     |3             |
|1  |Jason Mraz           |93 Million Miles 

# Data Cleaning

In [8]:
# Rename columns names in first and second dataframes to the same names for consistency
spotfy_songs_df = spotfy_songs_df.withColumnRenamed('_c0', 'index')

# Drop unneeded columns from the first dataframe
columns_to_remove = ['Unnamed: 0','duration_ms','year', 'time_signature','popularity']
for column in columns_to_remove:
  spotfy_songs_df = spotfy_songs_df.drop(column)


In [9]:
# Testing if dataframe for first list of songs has right output
spotfy_songs_df.show(5)

+-----+-------------+----------------+--------------------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|index|  artist_name|      track_name|            track_id|   genre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+-----+-------------+----------------+--------------------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|    0|   Jason Mraz| I Won't Give Up|53QF56cjZA9RTuuMZ...|acoustic|       0.483| 0.303|  4| -10.058|   1|     0.0429|       0.694|             0.0|   0.115|  0.139|133.406|
|    1|   Jason Mraz|93 Million Miles|1s8tP3jP4GZcyHDsj...|acoustic|       0.572| 0.454|  3| -10.286|   1|     0.0258|       0.477|         1.37E-5|  0.0974|  0.515|140.182|
|    2|Joshua Hyslop|Do Not Let Me Go|7BRCa8MPiyuvr2VU3...|acoustic|       0.409| 0.234|  3| -13.711|   1|     0.0323|       0.338

# Spotify API manipulation to get User Playlist tracks


In [10]:
# Load the .env file
# load_dotenv('user.env')
load_dotenv('/content/drive/MyDrive/SOEN471_files/user.env')
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')

def get_token():
    auth_string = client_id + ":" + client_secret
    auth_bytes = auth_string.encode('utf-8')
    auth_base64 = str(base64.b64encode(auth_bytes),'utf-8')
    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {"grant_type": "client_credentials"}
    result = post(url, headers=headers, data=data)
    json_result = json.loads(result.content)
    token = json_result["access_token"]
    return token

def get_auth_header(token):
    return {"Authorization": "Bearer " + token}

token = get_token()

## This one is improved version of def get_playlist(token, seeds)
def playlist_tracks(token, seeds):
    url = "https://api.spotify.com/v1/playlists/"
    headers = get_auth_header(token)
    seed_playlist = seeds['playlist']
    query_url = url + seed_playlist + "/tracks"
    print(f'url: {query_url}')

    all_tracks = []  # List to store all tracks

    limit = 100  # Maximum number of tracks per request
    offset = 0   # Initial offset

    while True:
        # Make request to fetch tracks with the current offset
        params = {'limit': limit, 'offset': offset}
        result = requests.get(query_url, headers=headers, params=params)
        json_result = result.json()

        # Check if there are tracks in the response
        if 'items' in json_result:
            tracks = json_result['items']
            all_tracks.extend(tracks)  # Add tracks to the list

            # Check if there are more tracks to fetch
            if len(tracks) < limit:
                break  # No more tracks to fetch

            # Increment offset for next request
            offset += limit
        else:
            break  # No tracks found in the response

    return all_tracks


# ---GET ALL SONGS FROM PLAYLIST---
# 3N4iBPX7sIvTisnRfxENYh
# 36X9xBefQM03QOI7FE2vb5 its about drive
# 6yLF7RyxeQgnyXWfxRYqWx all
# https://open.spotify.com/playlist/63JPeRGc9H7XBY1T1oRrkZ?si=22d0c4a024644300
# https://open.spotify.com/playlist/3KVkwNoITp21prEUbHVOFG?si=edef88e7bb2d489c
# https://open.spotify.com/playlist/6e0hGtrPplc5xixvDSEg32?si=a50a22fcc4b944e4
seeds = { 'playlist':  '6yLF7RyxeQgnyXWfxRYqWx'}
json_string = playlist_tracks(token, seeds)
print(f'Total tracks fetched: {len(json_string)}')

url: https://api.spotify.com/v1/playlists/6yLF7RyxeQgnyXWfxRYqWx/tracks
Total tracks fetched: 310


In [11]:
# Create a spark dataframe by extracting song id from playlist
user_playlist_df = spark.createDataFrame(re.findall(r'/track/(\w+)', json.dumps(json_string)), StringType()).toDF("track_id")
user_playlist_df.show(10,False)

+----------------------+
|track_id              |
+----------------------+
|02q0ZnV2L4XByzEvWZJqBC|
|15jxI2XwZfYo7FGKex8IEf|
|1RMJOxR6GRPsBHL8qeC2ux|
|1iEzGFEux7t1Wk41LOaCCr|
|6z1kLsntE7FuzKZHZWrXYN|
|4WJNAvpI9FwZJSNrkIyvGZ|
|33i3xxHB4YSYGYbtJrwwO8|
|7KA4W4McWYRpgf0fWsJZWB|
|5SxlUF7J8tyFIEF22EomeP|
|5WNYg3usc6H8N3MBEp4zVk|
+----------------------+
only showing top 10 rows



In [12]:
# Join tracks from user playlist with dataset to get features from dataset
merge_df = user_playlist_df.join(spotfy_songs_df, on = 'track_id', how='left')

# show current dataframe
merge_df.show(10, False)

+----------------------+------+------------------+-----------------------------------------------------+-------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+
|track_id              |index |artist_name       |track_name                                           |genre  |danceability|energy|key |loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |
+----------------------+------+------------------+-----------------------------------------------------+-------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+
|02q0ZnV2L4XByzEvWZJqBC|NULL  |NULL              |NULL                                                 |NULL   |NULL        |NULL  |NULL|NULL    |NULL|NULL       |NULL        |NULL            |NULL    |NULL   |NULL   |
|2tlJ22iQwiO1CWBQSma23n|NULL  |NULL              |NULL                                                 |NULL   |NULL        

In [13]:
# Extract audio features from songs that aren't in the dataset using the Spotify API

null_track_ids_df = merge_df.filter(merge_df.danceability.isNull())
track_ids_list = null_track_ids_df.select(collect_list('track_id')).collect()[0][0]
# Limit list to 100 songs max if it has more than 100 strings (api has a limit of 100 strings in the request)
if len(track_ids_list) > 100:
  track_ids_list = track_ids_list[:100]

sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id,client_secret))

missing_track_audio_features_json = sp.audio_features(track_ids_list)


extracted_audio_features_df = spark.read.json(spark.sparkContext.parallelize(missing_track_audio_features_json))

In [14]:
#Reorder columns
extracted_audio_features_df = extracted_audio_features_df.select('id', 'danceability', 'energy', 'key', 'loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo')
extracted_audio_features_df.show(10, False)

+----------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|id                    |danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |
+----------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|02q0ZnV2L4XByzEvWZJqBC|0.539       |0.846 |8  |-3.192  |1   |0.0354     |0.0063      |1.68E-6         |0.13    |0.46   |106.992|
|06EMBzxDm2hueehobAlMtm|0.388       |0.323 |5  |-7.158  |1   |0.0348     |0.851       |2.03E-6         |0.143   |0.232  |118.885|
|0ErvwtalACwcpvEDJVZX7O|0.639       |0.895 |0  |-3.192  |1   |0.0558     |6.21E-4     |0.0             |0.0381  |0.943  |148.004|
|0GLXQAdrh4tdvz0JLw8DX8|0.749       |0.553 |10 |-6.385  |1   |0.0374     |0.42        |1.98E-4         |0.31    |0.641  |96.044 |
|0Y8F9OzVKj0kpyXqbTr4ZE|0.591       |0.655 |1  |-4.599  |1   |0.0499     |0.0182      |0.0

# Dropping songs from the dataset that do *not* fit the user's genres

In [15]:
#Collect all genres from user playlist
distinct_genres_df = merge_df.select("genre").distinct()
distinct_genres_df = distinct_genres_df.dropna()

# transform the distinct genre values into a list
distinct_genres = [row.genre for row in distinct_genres_df.collect()]

# Print the distinct genre values
print(distinct_genres)

# Drop unrelated genres from dataset
spotfy_songs_df = spotfy_songs_df.filter(col('genre').isin(distinct_genres))

['pop', 'k-pop', 'ambient', 'chill', 'hip-hop', 'indie-pop']


In [16]:
# Drop all rows with nulls in spotfy_songs_df
spotfy_songs_df = spotfy_songs_df.dropna()

In [17]:
# Drop unnecessary columns
columns_to_drop = ['artist_name','track_name','popularity','year','index','time_signature','genre']
for column in columns_to_drop:
  merge_df = merge_df.drop(column)

merge_df.show(10)

+--------------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+
|            track_id|danceability|energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+
|02q0ZnV2L4XByzEvW...|        NULL|  NULL|NULL|    NULL|NULL|       NULL|        NULL|            NULL|    NULL|   NULL|   NULL|
|2tlJ22iQwiO1CWBQS...|        NULL|  NULL|NULL|    NULL|NULL|       NULL|        NULL|            NULL|    NULL|   NULL|   NULL|
|33i3xxHB4YSYGYbtJ...|       0.734| 0.671|   1|  -7.107|   1|     0.0315|       0.118|         0.00317|   0.352|  0.113| 94.025|
|3iqlzKw1tLt6tXZyK...|       0.649| 0.461|   8|  -8.401|   1|     0.0449|       0.442|             0.0|   0.108|  0.487|141.088|
|4LaZ8RpIP6DIgN73b...|       0.619|  0.71|   4|  -7.661|   0|     0.0686|       0.224|         3.

In [18]:
# Combine the user's playlist dataframe with the dataframe containing the missing audio features
merge_df = merge_df.union(extracted_audio_features_df)
merge_df = merge_df.dropna()

merge_df.show(10, False)

+----------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|track_id              |danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |
+----------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|01ZedLPM6De5HPt5crdRQf|0.661       |0.845 |2  |-5.607  |1   |0.0348     |0.0123      |0.0             |0.163   |0.835  |130.06 |
|07wOBgBXdrrhs3LOCy2RpM|0.656       |0.348 |5  |-12.556 |1   |0.0347     |0.422       |0.00527         |0.0852  |0.533  |178.046|
|0DYvTdqBqW6erA1a7pFzVo|0.807       |0.501 |10 |-6.397  |0   |0.0377     |0.645       |1.53E-5         |0.262   |0.401  |94.958 |
|0JL7DoEqAUcOntWmBuOSdh|0.588       |0.452 |7  |-7.778  |1   |0.039      |0.724       |0.0             |0.107   |0.267  |83.936 |
|0XFCzxunyrtisfiZALPVSR|0.732       |0.826 |8  |-4.715  |1   |0.0589     |0.564       |0.0

In [19]:
# Create a list of features (this will be used multiple times)
feature_columns = [ 'danceability', 'energy', 'key','loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence','tempo']

# Calculate the mean for each column
mean_values = merge_df.select([mean(col(column)).alias(column) for column in feature_columns]).collect()[0].asDict()

# Replace null values in each column with its respective mean value
updated_df = merge_df.fillna(mean_values, subset=feature_columns)

updated_df.show(10)

+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|            track_id|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+
|01ZedLPM6De5HPt5c...|       0.661| 0.845|  2|  -5.607|   1|     0.0348|      0.0123|             0.0|   0.163|  0.835| 130.06|
|07wOBgBXdrrhs3LOC...|       0.656| 0.348|  5| -12.556|   1|     0.0347|       0.422|         0.00527|  0.0852|  0.533|178.046|
|0DYvTdqBqW6erA1a7...|       0.807| 0.501| 10|  -6.397|   0|     0.0377|       0.645|         1.53E-5|   0.262|  0.401| 94.958|
|0JL7DoEqAUcOntWmB...|       0.588| 0.452|  7|  -7.778|   1|      0.039|       0.724|             0.0|   0.107|  0.267| 83.936|
|0XFCzxunyrtisfiZA...|       0.732| 0.826|  8|  -4.715|   1|     0.0589|       0.564|             0.0|  

# Create user-profile (from user input)

In [20]:
# User profile
def create_user_profile(playlist_df):
    # Calculate the mean for each feature column
    avg_features_dict = playlist_df.select([mean(col(column)).alias(column) for column in feature_columns]).collect()[0].asDict()

    # Return dictionary of all features and their average
    return Row(avg_features=avg_features_dict).asDict()

# Creating a user profile
user_1_profile = create_user_profile(updated_df)

user_1_profile

{'avg_features': {'danceability': 0.6142788844621513,
  'energy': 0.6150597609561753,
  'key': 5.422310756972111,
  'loudness': -6.464756972111554,
  'mode': 0.7330677290836654,
  'speechiness': 0.07777928286852588,
  'acousticness': 0.3255154541832669,
  'instrumentalness': 0.018291775657370522,
  'liveness': 0.18478804780876493,
  'valence': 0.44881752988047807,
  'tempo': 121.88837051792831}}

In [21]:
# Converting average features from a dictionary to a list
user_profile_values = list(user_1_profile['avg_features'].values())
# Printing the list of user profile values
print(user_profile_values)


# Casting all feature values to double
for feature_column in feature_columns:
    updated_df = updated_df.withColumn(feature_column, col(feature_column).cast('double'))
    spotfy_songs_df = spotfy_songs_df.withColumn(feature_column, col(feature_column).cast('double'))

[0.6142788844621513, 0.6150597609561753, 5.422310756972111, -6.464756972111554, 0.7330677290836654, 0.07777928286852588, 0.3255154541832669, 0.018291775657370522, 0.18478804780876493, 0.44881752988047807, 121.88837051792831]


# Vector assembly and Standard Scaler for user playlist and spotify playlist

In [22]:
# Creating a vector assembler for user and spotify playlist
# We will transform all track features into a vector
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features_vector', handleInvalid='skip')

user_vector_df = vector_assembler.transform(updated_df)
spotify_vector_df = vector_assembler.transform(spotfy_songs_df)

spotify_vector_df.show()

+-----+--------------------+--------------------+--------------------+-------+------------+-------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------------+
|index|         artist_name|          track_name|            track_id|  genre|danceability| energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|     features_vector|
+-----+--------------------+--------------------+--------------------+-------+------------+-------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------------+
| 2724|Cigarettes After Sex|Nothing's Gonna H...|3GhsBdS9ulPK3KCdw...|ambient|       0.509|  0.331| 4.0| -14.083| 1.0|     0.0267|       0.272|           0.136|   0.114| 0.0957| 96.848|[0.509,0.331,4.0,...|
| 2725|    Sleeping At Last|        Turning Page|2kfGoV9a5dbSKCNmU...|ambient|       0.307|  0.371| 3.0|  -8.498| 1.0|     0.0288|       0.923|          0.0253|    0.11|  0

In [23]:

# After getting a vector of all features, we will normalize the vector
# We will use StandardScaler for this process
scaler = StandardScaler(inputCol="features_vector", outputCol="scaled_features", withMean=True, withStd=True)

#inserting the user and spotify vectors
scaler_model = scaler.fit(user_vector_df)
scaler_model = scaler.fit(spotify_vector_df)

#Scaling the vectors
scaled_user_vector = scaler_model.transform(user_vector_df)
scaled_spotify_vector = scaler_model.transform(spotify_vector_df)

#Scale the user profile
summarizer = Summarizer.metrics("mean")
scaled_user_vector = scaler_model.transform(user_vector_df)
user_profile_values_scaled = scaled_user_vector.select(Summarizer.mean(scaled_user_vector.scaled_features)).collect()[0][0]

print(f'User profile scaled: {user_profile_values_scaled}')

User profile scaled: [0.28852720200962456,0.29953426227777286,0.05227644597888388,0.5738942340372074,0.28060517289219356,-0.1491317443641616,-0.19626615705610256,-0.7010600558980861,0.028906276495867418,0.13699767129202348,0.15979950885778796]


In [24]:
scaled_user_vector.show(10,False)

+----------------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+-----------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|track_id              |danceability|energy|key |loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |features_vector                                                        |scaled_features                                                                                                                                                                                                           |
+----------------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------------------

In [25]:
# Show only regular and scaled vector columns
drop_col = ['index','genre','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
for column in drop_col:
  scaled_user_vector = scaled_user_vector.drop(column)
  scaled_spotify_vector = scaled_spotify_vector.drop(column)

# Calculate cosine similarity between the user profile and the Spotify items

In [26]:
# Define a UDF(user defined function) to compute cosine similarity
# input: array that will be compared with user's vector
def cosine_similarity_udf(x):
    # Check for NaN values and replace them with 0
    x_array = np.nan_to_num(x.toArray())

    # Compute cosine similarity and return the result
    return float(cosine_similarity([x_array], [user_profile_values_scaled])[0][0])

cosine_similarity_udf = udf(cosine_similarity_udf, DoubleType())

In [27]:
# Perform cosine similarity between user's playlist and scaled spotify vector
Cosine_sim_results = scaled_spotify_vector.withColumn("Cosine similarity", cosine_similarity_udf(col("scaled_features")))

# Sort by highest cosine similarity (descending)
top_50_df = Cosine_sim_results.sort("Cosine similarity", ascending=False)
top_50_df = top_50_df.dropna()

#Reorder DF
top_50_df = top_50_df.select('Cosine similarity','artist_name', 'track_name', 'track_id', 'scaled_features')

In [28]:
# Show Top 50 songs recommended
top_50_df.show(50, False)

+------------------+---------------+---------------------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Cosine similarity |artist_name    |track_name                       |track_id              |scaled_features                                                                                                                                                                                                              |
+------------------+---------------+---------------------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.8952346083528387|Dvwn           |Insomnia (Feat. 