### Pre-installs

In [1]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install sklearn
# %pip install -U scikit-learn
# %pip install spotipy

### Inputs

In [2]:
input_song = input("Enter your desired song name here!")
input_playlist = input("Enter your desired playlist name here!s")
recc_num = int(input("Enter how many songs you want to be recommended!"))
client_id = input("Enter your Spotify API Client Key here!")
client_secret = input("Enter your Spotify API Secret Key here!")

### CSV & Spotify Information

In [3]:
import pandas as pd
import ast

# Importing the datasets
dataset = pd.read_csv("data/tracks_with_genres.csv")
df = pd.read_csv("data/tracks.csv")

# Creating the dataframe
df_generated = pd.DataFrame(dataset)
df_generated['genres'] = df_generated['genres'].apply(ast.literal_eval)

In [4]:
# Spotify API Authentication Information
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
# api key
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id,
                                               client_secret=client_secret,
                                               redirect_uri="http://localhost/",
                                               scope="playlist-modify-private",
                                               show_dialog=True,
                                               cache_path="token.txt"
                                               ))

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

# # Assuming df_generated is your DataFrame
# # Adjust this part based on your actual DataFrame structure
# # For example, you might need to select specific columns or rows
# # or convert categorical variables to numerical values for visualization

# df_generated.columns
# df_graph = df_generated[['popularity', 'duration_ms', 'explicit',
#        'danceability', 'energy', 'key', 'loudness',
#        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
#        'valence', 'tempo', 'time_signature']]

# sns.set(style="white")  # Set the style of the plot

# # Create a heatmap using seaborn with annotated values for the entire correlation matrix
# plt.figure(figsize=(12, 10))  # Adjust the figure size as needed
# heatmap = sns.heatmap(df_graph.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)

# # Customize the appearance of the heatmap
# heatmap.set_title('Heatmap of df_generated', fontsize=16)

# plt.show()

### Processing the Input Song

In [51]:
# Using Spotify API to search for a song's information based on input and adding the necessary information in a DataFrame.
def search_track(track_name):
    # Search for the track
    results = sp.search(q=track_name, type='track')

    # Check if the track exists
    if results['tracks']['total'] > 0:
        # Get the first track from the results
        track = results['tracks']['items'][0]
        explicit = int(track["explicit"] == True)
        artists = []
        id_artists = []
        for i in range (len(track["artists"])):
            artists.append(track["artists"][i]["name"])
            id_artists.append(track["artists"][i]["id"])

        print(f'Found track: {track["name"]} by {track["artists"][0]["name"]} from the album {track["album"]["name"]}.')
        track_dict = {"id": track["id"], "name": track["name"], "popularity": track["popularity"], 
                      "duration_ms": track["duration_ms"], "explicit": explicit, "artists": str(artists),
                      "id_artists": str(id_artists), "release_date": track["album"]["release_date"]}
        return track_dict
    else:
        print('Track not found')
        return None

# Creating a Single Row DataFrame for the input song.
track_result = search_track(input_song)
td = pd.DataFrame(track_result, index=[0])
td

Found track: Timber (feat. Ke$ha) by Pitbull from the album Global Warming: Meltdown (Deluxe Version).


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"['Pitbull', 'Kesha']","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012


In [52]:
# Obtaining Feature Data from song based on its song_id from previous function and adding them to a DataFrame.
def get_audio_features(track_result):
    song_id = track_result["id"]
    results = sp.audio_features(song_id)

    if results:
        return results[0]
    else:
        print(f'No audio features found for song ID: {song_id}')
        return None

audio_features = get_audio_features(track_result)
af = pd.DataFrame(audio_features, index=[0])
# Taking out the irrevelant features.
af_formatted = af.drop(["type", "id", "uri", "track_href", "analysis_url", "duration_ms"], axis=1)
# Merging both DataFrames to sync up with the dataset's layout.
td = pd.concat([td, af_formatted], axis=1)
td['year'] = td['release_date'].str.extract(r'(\d{4})').astype(int)

In [46]:
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"['Pitbull', 'Kesha']","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012,0.581,0.963,...,-4.087,1,0.0981,0.0295,0,0.139,0.788,129.992,4,2012


In [53]:
import ast

td['artists'] = td['artists'].apply(ast.literal_eval)

td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"[Pitbull, Kesha]","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012,0.581,0.963,...,-4.087,1,0.0981,0.0295,0,0.139,0.788,129.992,4,2012


In [61]:
td['artists_count'] = len(td['artists'][0])
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year,artists_count
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"[Pitbull, Kesha]","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012,0.581,0.963,...,1,0.0981,0.0295,0,0.139,0.788,129.992,4,2012,2


In [62]:
expanded_artists = td['artists'].apply(lambda x: pd.Series(x))

# Rename the columns
expanded_artists.columns = [f"artist_{i+1}" for i in range(len(expanded_artists.columns))]

# Concatenate the expanded columns with the original DataFrame
td = pd.concat([td, expanded_artists], axis=1)
td.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year,artists_count,artist_1,artist_2
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"[Pitbull, Kesha]","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012,0.581,0.963,...,0.0295,0,0.139,0.788,129.992,4,2012,2,Pitbull,Kesha


In [73]:
def get_artist_genres(artist_name):
    try:
        results = sp.search(q='artist:' + artist_name, type='artist')
        genres = results['artists']['items'][0]['genres']
        return genres if genres else []
    except IndexError:
        return []
    
for i in range(td['artists_count'][0]):
    td['genres' + str(i+1)] = td['artist_' + str(i+1)].apply(get_artist_genres)
    td['genres' + str(i+1)] = td['genres' + str(i+1)].fillna('[]')
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,year,artists_count,artist_1,artist_2,genres
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"[Pitbull, Kesha]","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012,0.581,0.963,...,0,0.139,0.788,129.992,4,2012,2,Pitbull,Kesha,"[dance pop, pop, dance pop, pop]"


In [75]:
for i in range(td['artists_count'][0]):
    td['genres'] = td['genres' + str(i+1)] + td['genres' + str(i+1)]
    td.drop(['genres' + str(i+1)], axis=1, inplace=True)
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,year,artists_count,artist_1,artist_2,genres
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"[Pitbull, Kesha]","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012,0.581,0.963,...,0,0.139,0.788,129.992,4,2012,2,Pitbull,Kesha,"[dance pop, pop, dance pop, pop]"


In [83]:
def get_unique(row):
    return list(set(row))

# Apply the function to each row of the DataFrame
td['genres'] = td['genres'].apply(get_unique)

In [84]:
td['song_name_artist'] = td['name'] + str(td['artists'])

### Adding Input to Dataset and further processing

In [85]:
# Adding the Input Song to the Dataset DataFrame. Added to the very front.
new_df = pd.concat([td, df_generated], ignore_index=True)
new_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,tempo,time_signature,year,artists_count,artist_1,artist_2,genres,song_name_artist,artist_3,artist_4
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"[Pitbull, Kesha]","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012,0.581,0.963,...,129.992,4,2012,2,Pitbull,Kesha,"[dance pop, pop]","Timber (feat. Ke$ha)0 [Pitbull, Kesha]\nNam...",,
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,113.564,3,2008,1,Gerry & The Pacemakers,,"[adult standards, british invasion, merseybeat...",You'll Never Walk Alone - Mono; 2002 Remaster[...,,
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,120.689,4,2020,1,The Toys,,[],A Lover's Concerto['The Toys'],,
3,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,120.689,4,2020,1,The Toys,,[thai pop],A Lover's Concerto['The Toys'],,
4,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,104.536,4,2008,1,Gerry & The Pacemakers,,"[adult standards, british invasion, merseybeat...",Ferry Cross the Mersey - Mono; 2002 Remaster['...,,


In [86]:
new_df['song_id_name'] = new_df['id'] + new_df['name']
print(new_df.shape)
new_df.drop_duplicates(subset=["song_id_name"], keep='first', inplace=True)
print(new_df.shape)

(427550, 29)
(358322, 29)


In [87]:
def count_items_in_list(lst):
    return len(lst)

# Apply the function to the DataFrame column
new_df['genres_count'] = new_df['genres'].apply(count_items_in_list)
new_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,year,artists_count,artist_1,artist_2,genres,song_name_artist,artist_3,artist_4,song_id_name,genres_count
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),85,204160,0,"[Pitbull, Kesha]","['0TnOYISbd1XYRBk9myaseg', '6LqNN22kT3074XbTVU...",2012,0.581,0.963,...,2012,2,Pitbull,Kesha,"[dance pop, pop]","Timber (feat. Ke$ha)0 [Pitbull, Kesha]\nNam...",,,3cHyrEgdyYRjgJKSOiOtcSTimber (feat. Ke$ha),2
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,2008,1,Gerry & The Pacemakers,,"[adult standards, british invasion, merseybeat...",You'll Never Walk Alone - Mono; 2002 Remaster[...,,,6catF1lDhNTjjGa2GxRQNNYou'll Never Walk Alone ...,9
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,2020,1,The Toys,,[],A Lover's Concerto['The Toys'],,,6Pkt6qVikqPBt9bEQy8iTzA Lover's Concerto,0
4,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,2008,1,Gerry & The Pacemakers,,"[adult standards, british invasion, merseybeat...",Ferry Cross the Mersey - Mono; 2002 Remaster['...,,,4aSw1QJIMwYSoDEgzgdCJLFerry Cross the Mersey -...,9
5,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,2008,1,Gerry & The Pacemakers,,"[adult standards, british invasion, merseybeat...",Don't Let the Sun Catch You Crying (Main) - Mo...,,,0ZMMtH875IR2TfkyC4PolDDon't Let the Sun Catch ...,9


In [88]:
print(new_df.shape)
new_df = new_df[(new_df['genres_count'] != 0)]
print(new_df.shape)

(358322, 30)
(323358, 30)


### Supersetting genres

In [89]:
# Determining the frequency of the genre. Giving the input song's genre priority to go first.
genre_extract = new_df['genres'].reset_index()
genre_wo_index = genre_extract['genres']
tally = genre_wo_index.count()
genre_dict = {}

for i in range (tally):
    input = genre_wo_index.loc[i]
    if len(input) > 0:
        for n in input:
            if n in genre_dict.keys():
                genre_dict[n] += 1
            else:
                genre_dict[n] = 1

print(genre_dict)

{'dance pop': 7892, 'pop': 7639, 'adult standards': 3657, 'british invasion': 902, 'merseybeat': 814, 'rockabilly': 948, 'rock-and-roll': 1178, 'brill building pop': 1470, 'classic uk pop': 1491, 'folk rock': 2938, 'bubblegum pop': 733, 'lounge': 938, 'easy listening': 616, 'big band': 272, 'canadian blues': 85, 'british blues': 479, 'mellow gold': 6556, 'folk': 2017, 'singer-songwriter': 1756, 'lilith': 1236, 'canadian singer-songwriter': 376, 'rock': 13706, 'stride': 224, 'harlem renaissance': 77, 'jazz piano': 308, 'jazz': 796, 'swing': 289, 'vocal jazz': 1036, 'cool jazz': 694, 'free jazz': 108, 'italian jazz': 25, 'contemporary jazz': 208, 'jazz fusion': 905, 'jazz quartet': 25, 'avant-garde jazz': 117, 'swedish jazz': 514, 'jazz saxophone': 283, 'contemporary post-bop': 187, 'classic soundtrack': 364, 'soundtrack': 1155, 'vintage italian soundtrack': 170, 'italian soundtrack': 150, 'jazz trumpet': 333, 'hard bop': 438, 'acid rock': 84, 'roots rock': 1528, 'classic rock': 6552, 'p

In [18]:
import matplotlib.pyplot as plt

# # Plot the frequency distribution
# plt.bar(genre_dict.keys(), genre_dict.values())
# plt.xlabel('Genres')
# plt.ylabel('Frequency')
# plt.title('Frequency of Various Genres')
# plt.show()

In [90]:
genres_df = pd.Series(genre_dict)
genres_df.head()

dance pop           7892
pop                 7639
adult standards     3657
british invasion     902
merseybeat           814
dtype: int64

In [91]:
# Summary Statistics of genres.
genres_df.T.describe()

count     4557.000000
mean       241.353303
std        729.040105
min          1.000000
25%          7.000000
50%         33.000000
75%        146.000000
max      13706.000000
dtype: float64

In [21]:
# genres_df.to_csv("data/genre_frequency.csv")

In [92]:
# Creating the genre supersets, with priority to the input song's genre.
# Since the input song is the first one to be run, its genre will be able to start the classification of other sub-genres under it.

# This can be further improved. Allowing for later genres to take over earlier genres as the superset if it more general.
# E.g.: Detroit Hip Hop is found before general hip hop. Currently they will be seperated into 2 seperate genres because the first word 'detroit' is
# not 'hip hop' and is thus overlooked.

def generate_superset_mapping(genre_dict):
    superset_mapping = {}

    for genre, frequency in genre_dict.items():
        subset_added = False
        for superset in list(superset_mapping.keys()):
            if genre in superset:
                superset_mapping[genre] = [genre]
                superset_mapping[genre].extend(superset_mapping[superset])
                del superset_mapping[superset]
                subset_added = True
                break
            elif superset in genre:
                if superset in superset_mapping:
                    superset_mapping[superset].append(genre)
                    subset_added = True
                    break
        if not subset_added:
            superset_mapping[genre] = [genre]

    return superset_mapping

# Generate superset mapping
superset_mapping = generate_superset_mapping(genre_dict)

# Output the superset mapping
for superset, subsets in superset_mapping.items():
    print(f"{superset}: {subsets}")


pop: ['pop', 'dance pop', 'brill building pop', 'classic uk pop', 'bubblegum pop', 'sunshine pop', 'baroque pop', 'classic country pop', 'swedish pop', 'europop', 'new wave pop', 'art pop', 'pop rock', 'synthpop', 'spanish pop', 'power pop', 'australian pop', 'early synthpop', 'latin pop', 'mexican pop', 'pop romantico', 'french pop', 'sophisti-pop', 'popping', 'bow pop', 'italian adult pop', 'classic italian pop', 'dream pop', 'candy pop', 'indonesian pop', 'classic indo pop', 'classic nz pop', 'classic turkish pop', 'j-pop', 'danish pop', 'classic danish pop', 'deep turkish pop', 'turkish pop', 'classic swedish pop', 'sertanejo pop', 'musica popular colombiana', 'classic french pop', 'jangle pop', 'nederpop', 'dutch pop', 'c-pop', 'vintage chinese pop', 'classic cantopop', 'afropop', 'mande pop', 'pop nacional antigas', 'french synthpop', 'classic peruvian pop', 'beninese pop', 'classic venezuelan pop', 'classic city pop', 'canadian pop', 'cantopop', 'indonesian city pop', 'russian p

In [93]:
# Now with the superset mapped out, apply it onto the all the songs to determine their supersetted genre(s).
# This can be improved. Instead of only returning the first superset genre, it should return multiple in an array.

import pandas as pd

genres_df = pd.DataFrame(genres_df, columns=['genres'])

def group_into_supersets(new_df, superset_mapping):
    def get_superset(genres):
        for superset, subsets in superset_mapping.items():
            if any(subset in genre for genre in genres for subset in subsets):
                return superset
        return 'other'  # If no superset is found

    new_df['supersetted_genres'] = new_df['genres'].apply(get_superset)
    return new_df

# Group into supersets and create new column
new_df = group_into_supersets(new_df, superset_mapping)

In [94]:
new_df['supersetted_genres'].value_counts().count()

982

### Encoding

In [95]:
# One-hot Encoding of all the superset genres of each song in the dataframe.

genres_dummies = pd.get_dummies(new_df['supersetted_genres'].apply(pd.Series).stack()).groupby(level=0).sum()

# Concatenate the binary columns with the original DataFrame
df_2 = pd.concat([new_df, genres_dummies], axis=1)

In [96]:
df_2.iloc[:,0:20].describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0,323358.0
mean,34.391009,234244.3,0.066586,0.587702,0.611222,5.280788,-8.974548,0.636369,0.105935,0.339579,0.069502,0.214811,0.553588,120.568295,3.903086
std,16.47107,105752.8,0.249303,0.160546,0.231299,3.549788,4.461369,0.481045,0.175595,0.306277,0.210803,0.189172,0.254241,29.442346,0.418985
min,0.0,4937.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,188027.0,0.0,0.485,0.448,2.0,-11.245,0.0,0.0334,0.0517,0.0,0.0959,0.349,97.445,4.0
50%,34.0,226493.0,0.0,0.601,0.633,5.0,-8.028,1.0,0.0446,0.256,7e-06,0.136,0.562,119.985,4.0
75%,45.0,269867.0,0.0,0.704,0.801,9.0,-5.831,1.0,0.0816,0.594,0.00135,0.281,0.767,138.66,4.0
max,100.0,4995083.0,1.0,0.991,1.0,11.0,2.854,1.0,0.969,0.996,1.0,1.0,1.0,239.906,5.0


In [97]:
# Removing unnecessary features.

df_2 = df_2.drop(['id_artists', 'release_date', 'popularity', 'mode', 'danceability', 'artists_count', 'song_name_artist', 'artist_1', 'artist_2', 'artist_3', 'artist_4', 'genres', 'supersetted_genres', 'explicit', 'time_signature'], axis=1)
df_2.head()

Unnamed: 0,id,name,duration_ms,artists,energy,key,loudness,speechiness,acousticness,instrumentalness,...,zen,zespol dzieciecy,zhongguo feng,zikir,zillertal,zim urban groove,zolo,zouglou,zouk,zydeco
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),204160,"[Pitbull, Kesha]",0.963,11,-4.087,0.0981,0.0295,0.0,...,0,0,0,0,0,0,0,0,0,0
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,160187,,0.265,0,-11.101,0.0322,0.394,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,141987,,0.365,6,-10.226,0.0289,0.255,5e-06,...,0,0,0,0,0,0,0,0,0,0
5,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,157093,,0.352,1,-14.165,0.03,0.406,0.0,...,0,0,0,0,0,0,0,0,0,0
6,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,187333,,0.201,7,-17.796,0.0623,0.887,0.0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# Filling in any null values. (In case)

df_2 = df_2.fillna(0)
df_2.head()

Unnamed: 0,id,name,duration_ms,artists,energy,key,loudness,speechiness,acousticness,instrumentalness,...,zen,zespol dzieciecy,zhongguo feng,zikir,zillertal,zim urban groove,zolo,zouglou,zouk,zydeco
0,3cHyrEgdyYRjgJKSOiOtcS,Timber (feat. Ke$ha),204160,"[Pitbull, Kesha]",0.963,11,-4.087,0.0981,0.0295,0.0,...,0,0,0,0,0,0,0,0,0,0
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,160187,0,0.265,0,-11.101,0.0322,0.394,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,141987,0,0.365,6,-10.226,0.0289,0.255,5e-06,...,0,0,0,0,0,0,0,0,0,0
5,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,157093,0,0.352,1,-14.165,0.03,0.406,0.0,...,0,0,0,0,0,0,0,0,0,0
6,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,187333,0,0.201,7,-17.796,0.0623,0.887,0.0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
df_2.shape

(323358, 998)

### Cosine Similarity & Output Extraction

In [100]:
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Feature selection, removing non-useful rows.
def processing(df_2):
    # Normalize numerical features
    numerical_features = ['loudness', 'tempo']
    scaler = MinMaxScaler()
    df_2[numerical_features] = scaler.fit_transform(df_2[numerical_features])
    
    # Standardize Year
    df_2['standardized_year'] = scaler.fit_transform(df_2[['year']])
    df_2 = df_2.drop(['year'], axis=1)

    # Create a feature matrix
    feature_matrix = df_2.drop(['id', 'name', 'artists', 'song_id_name'], axis=1)

    # Apply normalization to the entire feature matrix
    feature_matrix = scaler.fit_transform(feature_matrix)

    # This is the input song. We are isolating it from the dataframe first.
    input_song = feature_matrix[0]
    
    arr = []

    # Returns comparison value of first song in dataset to all others in an array.
    # DataFrame can be customized before Feature Selection to change which song the subject of comparison should be.
    for i in range(1, len(feature_matrix)):
        # Take the i-th song from the feature matrix
        compare_song = feature_matrix[i]
        # Perform cosine similarity.
        cosine_sim = cosine_similarity([input_song], [compare_song])
        cs_value = cosine_sim[0][0]  # Take the single similarity value between the input song and the i-th song.
        arr.append(cs_value)

    return arr

arr = processing(df_2)
arr

[0.7950356019801965,
 0.9380582223470441,
 0.8476048440854451,
 0.5571825234427455,
 0.5336601195977362,
 0.9686057056811974,
 0.9492014748662885,
 0.7936365276537098,
 0.9055781180525164,
 0.9503385728468214,
 0.8780904349835044,
 0.6527982667029153,
 0.8169608028074814,
 0.9294314390530127,
 0.5014116437270316,
 0.6308430226209288,
 0.4942991384387653,
 0.6135450777820284,
 0.6378344835790737,
 0.550025242446227,
 0.6139995413795686,
 0.5715809929520388,
 0.9856062352247658,
 0.9603647793885138,
 0.6043337287269568,
 0.9252457381446154,
 0.6425817868634889,
 0.4991120447495807,
 0.5537344737959267,
 0.5805402022203001,
 0.5320647850057998,
 0.6036281805705599,
 0.6482962260092096,
 0.8898565776223497,
 0.8537772786367634,
 0.6459014051886729,
 0.9673958596553891,
 0.6760171257977392,
 0.6930050287547098,
 0.8258220264644232,
 0.9129800411273744,
 0.8996720412114652,
 0.935030732917543,
 0.9528043240817761,
 0.9014961830161976,
 0.7716951534355381,
 0.654580131613889,
 0.7172662288780

In [101]:
max(arr[1:])

0.9995980306553384

In [104]:
def compile_suggestions(arr):
    suggestions = []
    
    # Compiling all song ids that are above the pre-determined threshold for 'like the first song'.
    for i in range(len(arr)):
        if arr[i] > 0.99:
            suggestions.append(i)
    
    print(suggestions)
    return suggestions
suggestions = compile_suggestions(arr)

[18125, 18294, 18312, 18747, 19510, 19523, 19656, 19664, 19940, 20288, 20289, 20296, 20304, 20352, 20377, 20378, 20465, 20539, 20540, 20670, 20676, 20916, 21010, 21177, 21208, 21301, 21388, 21428, 21514, 21557, 21605, 21618, 21689, 21728, 21729, 21869, 21883, 22158, 22163, 22225, 22239, 22332, 22415, 22452, 22514, 22535, 22536, 22548, 22645, 22853, 22882, 22928, 22950, 22969, 22977, 22979, 23031, 23071, 23188, 23271, 23328, 23343, 23450, 23487, 23508, 23555, 23585, 23592, 23607, 23680, 23735, 23744, 23854, 23915, 23921, 23997, 24010, 24053, 24075, 24231, 24266, 24397, 24457, 24654, 24692, 24776, 24784, 24826, 24958, 24969, 25017, 25105, 25121, 25137, 25233, 25272, 25357, 25476, 25508, 25553, 25580, 25585, 25586, 25587, 25600, 25659, 25692, 25724, 25751, 25759, 25761, 25764, 25794, 25811, 25910, 25924, 25988, 26069, 26122, 26139, 26150, 26159, 26174, 26191, 26207, 26282, 26305, 26326, 26338, 26346, 26466, 26667, 26711, 26865, 26980, 27005, 27150, 27255, 27264, 27308, 27371, 27401, 27452

In [105]:
len(suggestions)

2915

In [106]:
# Extracting out the end result information.
df_main = df_2[df_2.index.isin(suggestions)]

df_main.head()

Unnamed: 0,id,name,duration_ms,artists,energy,key,loudness,speechiness,acousticness,instrumentalness,...,zespol dzieciecy,zhongguo feng,zikir,zillertal,zim urban groove,zolo,zouglou,zouk,zydeco,standardized_year
18125,52mdbM1tueoC8UnYvZ9uTX,Shout to the Lord - Live,279507,0,0.305,9,0.75529,0.0252,0.223,0.0,...,0,0,0,0,0,0,0,0,0,0.390244
18312,2RzNpQEcatMIym0AFLJUsF,Muévelo,261133,0,0.838,7,0.870684,0.0422,0.0776,2.9e-05,...,0,0,0,0,0,0,0,0,0,0.390244
18747,3llKUN9le04V1enu8MHudF,Besame Mucho,324040,0,0.633,7,0.817195,0.0307,0.384,3e-06,...,0,0,0,0,0,0,0,0,0,0.414634
19510,4LSolq1CjLOVnya9zb0pU1,How Your Love Makes Me Feel,245400,0,0.787,0,0.825421,0.0313,0.351,0.0,...,0,0,0,0,0,0,0,0,0,0.414634
19523,0EgOQEj1fHUyoPAkkp03Hp,Hundred Mile High City,238200,0,0.94,11,0.892401,0.0458,0.00012,0.00357,...,0,0,0,0,0,0,0,0,0,0.414634


In [107]:
len(df_main) # There is one less for the last 3 features as the input song did not go through the extracted year's steps.
# df_final['standardized_year'].describe()

2231

In [108]:
df_main = df_main.iloc[1:]

df_main = df_main.drop('artists', axis=1)

id_artist_mapping = df.set_index('id')['artists'].to_dict()

# Add 'artist' column to df1 using the mapping
df_main['artists'] = df_main['id'].map(id_artist_mapping)
df_main.head()

Unnamed: 0,id,name,duration_ms,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,...,zhongguo feng,zikir,zillertal,zim urban groove,zolo,zouglou,zouk,zydeco,standardized_year,artists
18312,2RzNpQEcatMIym0AFLJUsF,Muévelo,261133,0.838,7,0.870684,0.0422,0.0776,2.9e-05,0.365,...,0,0,0,0,0,0,0,0,0.390244,['Los Piojos']
18747,3llKUN9le04V1enu8MHudF,Besame Mucho,324040,0.633,7,0.817195,0.0307,0.384,3e-06,0.201,...,0,0,0,0,0,0,0,0,0.414634,['Luis Miguel']
19510,4LSolq1CjLOVnya9zb0pU1,How Your Love Makes Me Feel,245400,0.787,0,0.825421,0.0313,0.351,0.0,0.065,...,0,0,0,0,0,0,0,0,0.414634,['Diamond Rio']
19523,0EgOQEj1fHUyoPAkkp03Hp,Hundred Mile High City,238200,0.94,11,0.892401,0.0458,0.00012,0.00357,0.667,...,0,0,0,0,0,0,0,0,0.414634,['Ocean Colour Scene']
19656,31bf9SEOppLU6lQ85d8om6,Ghetto Supastar (That is What You Are) (feat. ...,261133,0.653,1,0.821014,0.204,0.0335,5e-06,0.22,...,0,0,0,0,0,0,0,0,0.439024,"['Pras', 'ODB', 'Mýa']"


### Recommendations Output

In [109]:
# Generating recc_num of the matched songs at random.
def recommend(df_main, recc_num):
    recs = []
    if len(df_main) == 0:
        print("Sorry! There are no songs similar enough to " + input_song + "!")
    elif len(df_main) < recc_num:
        recc_num = len(df_main)
        print("Uh oh! We only found " + str(recc_num) + " recommendations!")
    print('Here are your recommendations!')
    for c in range(recc_num):
        recc = df_main.sample().to_dict()
        name = list(recc['name'].values())[0]
        artists = (list(recc['artists'].values())[0])[1:len(list(recc['artists'].values())[0])-1]
        year = str(list(recc['year'].values())[0])
        print(str(c + 1) + '. ' + name + ' by ' + artists + ' published in ' + year)
        df_main = df_main.drop(list(recc['id'])[0])
        recs.append((name, year))
    return recs
# May want to try a filter for repreventing duplicate songs in the list.
recs_for_user = recommend(df_main, recc_num) # End Product

Here are your recommendations!
1. Canzone - Live From Milan,Italy/1989 by 'Vasco Rossi' published in 1990
2. Strong Enough by 'Cher' published in 1998
3. Stupid Humans by 'Nobodys' published in 1996
4. Knock On Wood by 'Amii Stewart' published in 2012
5. Я Свободен by 'Кипелов' published in 2003
6. It's tha Subta by 'Subterranean' published in 1997
7. Pig of the Year 2006 by 'The Baboon Show' published in 2005
8. Madalena do Jucú / Ô Irene - Ao Vivo by 'Samba De Raiz' published in 2002
9. Has Your Man Got Soul by 'Milk & Sugar', 'Nicole Tyler' published in 2007
10. Formula Mágica da Paz (Ao Vivo) by "Racionais MC's" published in 2006


In [38]:
user_id = sp.current_user()["id"]
song_uris = []
for song in recs_for_user:
    result = sp.search(q=f"track:{song[0]} year:{song[1]}", type="track")
    print(result)
    try:
        uri = result["tracks"]["items"][0]["uri"]
        song_uris.append(uri)
    except IndexError:
        print(f"{song[0]} doesn't exist in Spotify. Skipped.")

playlist = sp.user_playlist_create(user=user_id, name=input_playlist, public=False)
sp.playlist_add_items(playlist_id=playlist["id"], items=song_uris)

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=track%3AHigh+and+Mighty+year%3A1986&type=track&offset=0&limit=10', 'items': [{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/14RXohtx6NiBGFTW8IdmAK'}, 'href': 'https://api.spotify.com/v1/artists/14RXohtx6NiBGFTW8IdmAK', 'id': '14RXohtx6NiBGFTW8IdmAK', 'name': 'John Scofield', 'type': 'artist', 'uri': 'spotify:artist:14RXohtx6NiBGFTW8IdmAK'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', '

{'snapshot_id': 'Miw4NmY2MGMzOGVjNDIzMWEwZWU4MDE3OTgwNzg5YTVkOTc5Mzc3Mzcx'}