### Pre-installs

In [1]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install sklearn
# %pip install -U scikit-learn
# %pip install spotipy

### Inputs

In [2]:
input_song = input("Enter your desired song name here!")
input_playlist = input("Enter your desired playlist name here!s")
recc_num = int(input("Enter how many songs you want to be recommended!"))
client_id = input("Enter your Spotify API Client Key here!")
client_secret = input("Enter your Spotify API Secret Key here!")
sp_dc = input("Enter your Spotify sp_dc here!")
sp_key = input("Enter your Spotify sp_key here!")

### CSV & Spotify Information

In [3]:
import pandas as pd
import ast

# Importing the datasets
dataset = pd.read_csv("data/tracks_with_genres_&_language.csv")
df = pd.read_csv("data/tracks.csv")

# Creating the dataframe
df_generated = pd.DataFrame(dataset)
df_generated['genres'] = df_generated['genres'].apply(ast.literal_eval)

In [4]:
df_generated.iloc[:,3:19].head()

Unnamed: 0,duration_ms,explicit,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,-11.101,1,0.0322,0.394,0.0,0.149,0.285,113.564,3
1,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,4
2,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,-10.226,0,0.0289,0.255,5e-06,0.163,0.588,104.536,4
3,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,-14.165,1,0.03,0.406,0.0,0.122,0.478,106.773,4
4,187333,0,['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3


In [5]:
# Spotify API Authentication Information
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
# api key
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id,
                                               client_secret=client_secret,
                                               redirect_uri="http://localhost/",
                                               scope="playlist-modify-private",
                                               show_dialog=True,
                                               cache_path="token.txt"
                                               ))

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

# # Assuming df_generated is your DataFrame
# # Adjust this part based on your actual DataFrame structure
# # For example, you might need to select specific columns or rows
# # or convert categorical variables to numerical values for visualization

# df_generated.columns
# df_graph = df_generated[['popularity', 'duration_ms', 'explicit',
#        'danceability', 'energy', 'key', 'loudness',
#        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
#        'valence', 'tempo', 'time_signature']]

# sns.set(style="white")  # Set the style of the plot

# # Create a heatmap using seaborn with annotated values for the entire correlation matrix
# plt.figure(figsize=(12, 10))  # Adjust the figure size as needed
# heatmap = sns.heatmap(df_graph.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)

# # Customize the appearance of the heatmap
# heatmap.set_title('Heatmap of df_generated', fontsize=16)

# plt.show()

### Processing the Input Song

In [7]:
# Using Spotify API to search for a song's information based on input and adding the necessary information in a DataFrame.
def search_track(track_name):
    # Search for the track
    results = sp.search(q=track_name, type='track')

    # Check if the track exists
    if results['tracks']['total'] > 0:
        # Get the first track from the results
        track = results['tracks']['items'][0]
        explicit = int(track["explicit"] == True)
        artists = []
        id_artists = []
        for i in range (len(track["artists"])):
            artists.append(track["artists"][i]["name"])
            id_artists.append(track["artists"][i]["id"])

        print(f'Found track: {track["name"]} by {track["artists"][0]["name"]} from the album {track["album"]["name"]}.')
        track_dict = {"id": track["id"], "name": track["name"], "popularity": track["popularity"], 
                      "duration_ms": track["duration_ms"], "explicit": explicit, "artists": str(artists),
                      "id_artists": str(id_artists), "release_date": track["album"]["release_date"]}
        return track_dict
    else:
        print('Track not found')
        return None

# Creating a Single Row DataFrame for the input song.
track_result = search_track(input_song)
td = pd.DataFrame(track_result, index=[0])
td

Found track: Uptown Funk (feat. Bruno Mars) by Mark Ronson from the album Uptown Special.


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"['Mark Ronson', 'Bruno Mars']","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12


In [8]:
# Obtaining Feature Data from song based on its song_id from previous function and adding them to a DataFrame.
def get_audio_features(track_result):
    song_id = track_result["id"]
    results = sp.audio_features(song_id)

    if results:
        return results[0]
    else:
        print(f'No audio features found for song ID: {song_id}')
        return None

audio_features = get_audio_features(track_result)
af = pd.DataFrame(audio_features, index=[0])
# Taking out the irrevelant features.
af_formatted = af.drop(["type", "id", "uri", "track_href", "analysis_url", "duration_ms"], axis=1)
# Merging both DataFrames to sync up with the dataset's layout.
td = pd.concat([td, af_formatted], axis=1)
td['year'] = td['release_date'].str.extract(r'(\d{4})').astype(int)

In [9]:
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"['Mark Ronson', 'Bruno Mars']","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,-7.223,1,0.0824,0.00801,8.2e-05,0.0344,0.928,114.988,4,2015


In [10]:
import ast

# Converting string array into regular array.
td['artists'] = td['artists'].apply(ast.literal_eval)

td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,-7.223,1,0.0824,0.00801,8.2e-05,0.0344,0.928,114.988,4,2015


In [11]:
# Viewing the input song features.
td['artists_count'] = len(td['artists'][0])
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year,artists_count
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,1,0.0824,0.00801,8.2e-05,0.0344,0.928,114.988,4,2015,2


In [12]:
# Extracting the artists from the input artists array.
expanded_artists = td['artists'].apply(lambda x: pd.Series(x))

# Rename the columns
expanded_artists.columns = [f"artist_{i+1}" for i in range(len(expanded_artists.columns))]

# Concatenate the expanded columns with the original DataFrame
td = pd.concat([td, expanded_artists], axis=1)
td.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year,artists_count,artist_1,artist_2
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,0.00801,8.2e-05,0.0344,0.928,114.988,4,2015,2,Mark Ronson,Bruno Mars


In [13]:
# Obtaining input artist(s) genre(s) through the Spotify API.
def get_artist_genres(artist_name):
    try:
        results = sp.search(q='artist:' + artist_name, type='artist')
        genres = results['artists']['items'][0]['genres']
        return genres if genres else []
    except IndexError:
        return []
    
for i in range(td['artists_count'][0]):
    td['genres' + str(i+1)] = td['artist_' + str(i+1)].apply(get_artist_genres)
    td['genres' + str(i+1)] = td['genres' + str(i+1)].fillna('[]')
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,liveness,valence,tempo,time_signature,year,artists_count,artist_1,artist_2,genres1,genres2
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,0.0344,0.928,114.988,4,2015,2,Mark Ronson,Bruno Mars,[pop soul],"[dance pop, pop]"


In [14]:
# Combining all genres into one list.
for i in range(td['artists_count'][0]):
    td['genres'] = td['genres' + str(i+1)] + td['genres' + str(i+1)]
    td.drop(['genres' + str(i+1)], axis=1, inplace=True)
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,year,artists_count,artist_1,artist_2,genres
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,8.2e-05,0.0344,0.928,114.988,4,2015,2,Mark Ronson,Bruno Mars,"[dance pop, pop, dance pop, pop]"


In [15]:
# Returning only the unique genres for the input.
def get_unique(row):
    return list(set(row))

td['genres'] = td['genres'].apply(get_unique)

In [16]:
# Determining the language from a Lyrics Scraping Library.
import spotify_lyrics_scraper as spotify

token = spotify.getToken(sp_dc, sp_key)
lyrics_data = spotify.getLyrics(token, songName=td['name'][0])
if lyrics_data['status'] == False:
        language = "null"
else:
    # Extract the language from the lyrics data
    language = lyrics_data['message']['lyrics']['language']

td['language'] = language
td

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,liveness,valence,tempo,time_signature,year,artists_count,artist_1,artist_2,genres,language
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,0.0344,0.928,114.988,4,2015,2,Mark Ronson,Bruno Mars,"[dance pop, pop]",en


In [17]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Many songs do not have lyrics enabled by spotify. So the alternative to that is to use their name as a benchmark for the language.
# Uses language detect library to identify the song name's expected language. Not exactly accurate but it is an acceptable alternative for now until NLP.
def replace_nan_with_language(row):
    if pd.isna(row['language']):  # Check if the value is NaN
        name = row['name']
        if isinstance(name, str) and len(name) > 0:  # Check if name is a non-empty string
            try:
                return detect(name)  # Use detect() output as the value if it's NaN
            except LangDetectException:
                pass  # Handle LangDetectException, e.g., return 'unknown' or None
        # Return None or any other default value if language detection fails or input text is empty
        return None  
    else:
        return row['language']  # Otherwise, keep the original value

td['language'] = td.apply(replace_nan_with_language, axis=1)

td.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,liveness,valence,tempo,time_signature,year,artists_count,artist_1,artist_2,genres,language
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,0.0344,0.928,114.988,4,2015,2,Mark Ronson,Bruno Mars,"[dance pop, pop]",en


In [18]:
# Creating a unique identifier.
td['song_name_artist'] = td['name'] + str(td['artists'])

In [19]:
# Defining the values to give priority to later.
language_prority = td['language'][0]
key_priority = td['key'][0]
genre_priority = td['genres'][0]

### Adding Input to Dataset and further processing

In [20]:
# Adding the Input Song to the Dataset DataFrame. Added to the very front.
new_df = pd.concat([td, df_generated], ignore_index=True)
new_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,time_signature,year,artists_count,artist_1,artist_2,genres,language,song_name_artist,artist_3,artist_4
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,4,2015,2,Mark Ronson,Bruno Mars,"[dance pop, pop]",en,Uptown Funk (feat. Bruno Mars)0 [Mark Ronso...,,
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,3,2008,1,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,You'll Never Walk Alone - Mono; 2002 Remaster[...,,
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,4,2020,1,The Toys,,[],en,A Lover's Concerto['The Toys'],,
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,4,2008,1,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,Ferry Cross the Mersey - Mono; 2002 Remaster['...,,
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,4,2008,1,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,Don't Let the Sun Catch You Crying (Main) - Mo...,,


In [21]:
new_df['langauge'] = new_df['language'].fillna("null")
new_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,year,artists_count,artist_1,artist_2,genres,language,song_name_artist,artist_3,artist_4,langauge
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,2015,2,Mark Ronson,Bruno Mars,"[dance pop, pop]",en,Uptown Funk (feat. Bruno Mars)0 [Mark Ronso...,,,en
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,2008,1,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,You'll Never Walk Alone - Mono; 2002 Remaster[...,,,en
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,2020,1,The Toys,,[],en,A Lover's Concerto['The Toys'],,,en
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,2008,1,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,Ferry Cross the Mersey - Mono; 2002 Remaster['...,,,en
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,2008,1,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,Don't Let the Sun Catch You Crying (Main) - Mo...,,,en


In [22]:
new_df['song_id_name'] = new_df['id'] + new_df['name']
print(new_df.shape)
new_df.drop_duplicates(subset=["song_id_name"], keep='first', inplace=True)
print(new_df.shape)

(186012, 31)
(186011, 31)


In [23]:
# Counting the number of genres in for each song.
def count_items_in_list(lst):
    return len(lst)

# Apply the function to the DataFrame column
new_df['genres_count'] = new_df['genres'].apply(count_items_in_list)
new_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,artist_1,artist_2,genres,language,song_name_artist,artist_3,artist_4,langauge,song_id_name,genres_count
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,Mark Ronson,Bruno Mars,"[dance pop, pop]",en,Uptown Funk (feat. Bruno Mars)0 [Mark Ronso...,,,en,32OlwWuMpZ6b0aN2RZOeMSUptown Funk (feat. Bruno...,2
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,You'll Never Walk Alone - Mono; 2002 Remaster[...,,,en,6catF1lDhNTjjGa2GxRQNNYou'll Never Walk Alone ...,9
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,The Toys,,[],en,A Lover's Concerto['The Toys'],,,en,6Pkt6qVikqPBt9bEQy8iTzA Lover's Concerto,0
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,Ferry Cross the Mersey - Mono; 2002 Remaster['...,,,en,4aSw1QJIMwYSoDEgzgdCJLFerry Cross the Mersey -...,9
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,Gerry & The Pacemakers,,"[adult standards, bubblegum pop, merseybeat, r...",en,Don't Let the Sun Catch You Crying (Main) - Mo...,,,en,0ZMMtH875IR2TfkyC4PolDDon't Let the Sun Catch ...,9


In [24]:
print(new_df.shape)
new_df = new_df[(new_df['genres_count'] != 0)]
print(new_df.shape)

(186011, 32)
(166251, 32)


### Supersetting genres

In [25]:
# Determining the frequency of the genre. Giving the input song's genre priority to go first.
genre_extract = new_df['genres'].reset_index()
genre_wo_index = genre_extract['genres']
tally = genre_wo_index.count()
genre_dict = {}

for i in range (tally):
    input = genre_wo_index.loc[i]
    if len(input) > 0:
        for n in input:
            if n in genre_dict.keys():
                genre_dict[n] += 1
            else:
                genre_dict[n] = 1

print(genre_dict)

{'dance pop': 6012, 'pop': 6880, 'adult standards': 1216, 'bubblegum pop': 416, 'merseybeat': 529, 'rock-and-roll': 650, 'british invasion': 495, 'classic uk pop': 704, 'brill building pop': 763, 'rockabilly': 550, 'folk rock': 933, 'easy listening': 126, 'lounge': 283, 'british blues': 265, 'canadian blues': 54, 'singer-songwriter': 312, 'lilith': 307, 'canadian singer-songwriter': 107, 'folk': 393, 'rock': 3193, 'mellow gold': 1141, 'jazz': 304, 'swing': 56, 'big band': 94, 'jazz piano': 73, 'stride': 75, 'vocal jazz': 355, 'cool jazz': 274, 'harlem renaissance': 23, 'jazz saxophone': 71, 'contemporary jazz': 73, 'jazz quartet': 20, 'jazz fusion': 297, 'swedish jazz': 134, 'free jazz': 58, 'avant-garde jazz': 49, 'contemporary post-bop': 57, 'italian jazz': 15, 'soundtrack': 456, 'italian soundtrack': 28, 'vintage italian soundtrack': 34, 'classic soundtrack': 89, 'jazz trumpet': 176, 'hard bop': 204, 'roots rock': 250, 'psychedelic rock': 592, 'classic rock': 1358, 'acid rock': 67, 

In [26]:
genres_df = pd.Series(genre_dict)
genres_df.head()

dance pop          6012
pop                6880
adult standards    1216
bubblegum pop       416
merseybeat          529
dtype: int64

In [27]:
# Summary Statistics of genres.
genres_df.T.describe()

count    4229.00000
mean      130.13171
std       393.30760
min         1.00000
25%         4.00000
50%        19.00000
75%        85.00000
max      6880.00000
dtype: float64

In [28]:
# genres_df.to_csv("data/genre_frequency.csv")

In [29]:
# Creating the genre supersets, with priority to the input song's genre.
# Since the input song is the first one to be run, its genre will be able to start the classification of other sub-genres under it.

# This can be further improved. Allowing for later genres to take over earlier genres as the superset if it more general.
# E.g.: Detroit Hip Hop is found before general hip hop. Currently they will be seperated into 2 seperate genres because the first word 'detroit' is
# not 'hip hop' and is thus overlooked.

def generate_superset_mapping(genre_dict):
    superset_mapping = {}

    for genre, frequency in genre_dict.items():
        subset_added = False
        for superset in list(superset_mapping.keys()):
            if genre in superset:
                superset_mapping[genre] = [genre]
                superset_mapping[genre].extend(superset_mapping[superset])
                del superset_mapping[superset]
                subset_added = True
                break
            elif superset in genre:
                if superset in superset_mapping:
                    superset_mapping[superset].append(genre)
                    subset_added = True
                    break
        if not subset_added:
            superset_mapping[genre] = [genre]

    return superset_mapping

# Generate superset mapping
superset_mapping = generate_superset_mapping(genre_dict)

# Output the superset mapping
for superset, subsets in superset_mapping.items():
    print(f"{superset}: {subsets}")


pop: ['pop', 'dance pop', 'bubblegum pop', 'classic uk pop', 'brill building pop', 'sunshine pop', 'baroque pop', 'pop rock', 'post-teen pop', 'pop rap', 'new wave pop', 'finnish dance pop', 'finnish pop', 'uk pop punk', 'europop', 'canadian pop', 'barbadian pop', 'latin pop', 'acoustic pop', 'spanish pop rock', 'mexican pop', 'latin arena pop', 'spanish pop', 'bow pop', 'pop nacional', 'socal pop punk', 'britpop', 'sophisti-pop', 'hip pop', 'south african pop', 'south african pop dance', 'bahamian pop', 'puerto rican pop', 'power-pop punk', 'deep power-pop punk', 'canadian pop punk', 'antiviral pop', 'j-pop girl group', 'australian pop', 'art pop', 'indie pop', 'pop punk', 'synthpop', 'norwegian pop', 'candy pop', 'ambient pop', 'glitch pop', 'french pop', 'dream pop', 'italian adult pop', 'indonesian pop', 'electropop', 'german pop', 'german pop rock', 'tatar pop', 'collage pop', 'pop r&b', 'belarusian pop', 'pop rock brasileiro', 'c-pop', 'classic mandopop', 'mandopop', 'cantopop', 

In [30]:
# Supersetting the weighted genres for later.

superset_genre_priority = []
if len(genre_priority) != 0:
    for i in genre_priority:
        for n in superset_mapping:
            if i in n:
                superset_genre_priority.append(n)
print(superset_genre_priority)

['pop']


In [31]:
# Now with the superset mapped out, apply it onto the all the songs to determine their supersetted genre(s).
# This can be improved. Instead of only returning the first superset genre, it should return multiple in an array.

import pandas as pd

new_df['supersetted_genres'] = new_df['genres'].apply(lambda genres: [m for n in genres for m in superset_mapping if n in m])

In [32]:
new_df['supersetted_genres'].value_counts().count()

5998

In [33]:
new_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,artist_2,genres,language,song_name_artist,artist_3,artist_4,langauge,song_id_name,genres_count,supersetted_genres
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),86,269666,1,"[Mark Ronson, Bruno Mars]","['3hv9jJF3adDNsBSIQDqcjp', '0du5cEVh5yTK9QJze8...",2015-01-12,0.856,0.609,...,Bruno Mars,"[dance pop, pop]",en,Uptown Funk (feat. Bruno Mars)0 [Mark Ronso...,,,en,32OlwWuMpZ6b0aN2RZOeMSUptown Funk (feat. Bruno...,2,[pop]
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,,"[adult standards, bubblegum pop, merseybeat, r...",en,You'll Never Walk Alone - Mono; 2002 Remaster[...,,,en,6catF1lDhNTjjGa2GxRQNNYou'll Never Walk Alone ...,9,"[adult standards, merseybeat, british invasion..."
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,,"[adult standards, bubblegum pop, merseybeat, r...",en,Ferry Cross the Mersey - Mono; 2002 Remaster['...,,,en,4aSw1QJIMwYSoDEgzgdCJLFerry Cross the Mersey -...,9,"[adult standards, merseybeat, british invasion..."
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,,"[adult standards, bubblegum pop, merseybeat, r...",en,Don't Let the Sun Catch You Crying (Main) - Mo...,,,en,0ZMMtH875IR2TfkyC4PolDDon't Let the Sun Catch ...,9,"[adult standards, merseybeat, british invasion..."
5,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,,['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,...,,"[easy listening, adult standards, lounge]",en,The September Of My Years - Live At The Sands ...,,,en,1hx7X9cMXHWJjknb9O6AvaThe September Of My Year...,3,"[easy listening, adult standards, lounge]"


### Encoding

In [34]:
# One-hot Encoding of all the superset genres of each song in the dataframe.

genres_dummies = pd.get_dummies(new_df['supersetted_genres'].apply(pd.Series).stack(), prefix="genre").groupby(level=0).sum()

# Concatenate the binary columns with the original DataFrame
df_2 = pd.concat([new_df, genres_dummies], axis=1)

In [35]:
df_2.iloc[:,0:20].describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0,166251.0
mean,40.123001,232836.4,0.10729,0.606616,0.653451,5.330061,-7.320542,0.609121,0.091714,0.3,0.063975,0.207367,0.535137,121.775868,3.929324
std,17.117062,101313.5,0.309482,0.157524,0.216638,3.565041,3.642162,0.487949,0.112539,0.292923,0.206059,0.183005,0.249318,29.153115,0.361896
min,0.0,4937.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,191160.0,0.0,0.504,0.511,2.0,-8.686,0.0,0.0342,0.0392,0.0,0.0969,0.335,98.006,4.0
50%,41.0,223713.0,0.0,0.617,0.681,5.0,-6.603,1.0,0.0478,0.199,2e-06,0.131,0.536,121.927,4.0
75%,52.0,262427.0,0.0,0.721,0.829,9.0,-5.028,1.0,0.0928,0.514,0.000489,0.265,0.741,139.952,4.0
max,100.0,4995083.0,1.0,0.991,1.0,11.0,2.854,1.0,0.966,0.996,1.0,1.0,1.0,220.099,5.0


In [36]:
# Determining how many keys there are.
df_2.groupby('key').count()

Unnamed: 0_level_0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,genre_zen,genre_zespol dzieciecy,genre_zhongguo feng,genre_zikir,genre_zillertal,genre_zim urban groove,genre_zolo,genre_zouglou,genre_zouk,genre_zurich indie
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,19586,19586,19586,19586,19586,1,19586,19586,19586,19586,...,11433,11433,11433,11433,11433,11433,11433,11433,11433,11433
1,14502,14502,14502,14502,14502,0,14502,14502,14502,14502,...,7742,7742,7742,7742,7742,7742,7742,7742,7742,7742
2,17337,17337,17337,17337,17337,0,17337,17337,17337,17337,...,9871,9871,9871,9871,9871,9871,9871,9871,9871,9871
3,5024,5024,5024,5024,5024,0,5024,5024,5024,5024,...,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872
4,13626,13626,13626,13626,13626,0,13626,13626,13626,13626,...,7601,7601,7601,7601,7601,7601,7601,7601,7601,7601
5,13199,13199,13199,13199,13199,0,13199,13199,13199,13199,...,7728,7728,7728,7728,7728,7728,7728,7728,7728,7728
6,11252,11252,11252,11252,11252,0,11252,11252,11252,11252,...,6147,6147,6147,6147,6147,6147,6147,6147,6147,6147
7,19669,19669,19669,19669,19669,0,19669,19669,19669,19669,...,11357,11357,11357,11357,11357,11357,11357,11357,11357,11357
8,10079,10079,10079,10079,10079,0,10079,10079,10079,10079,...,5550,5550,5550,5550,5550,5550,5550,5550,5550,5550
9,17972,17972,17972,17972,17972,0,17972,17972,17972,17972,...,10046,10046,10046,10046,10046,10046,10046,10046,10046,10046


In [37]:
df_2['key'].max()

11

In [38]:
# One-hot encoding the keys.
key_encoded = pd.get_dummies(df_2['key'], columns=[i for i in range(12)], prefix='key').astype(int)

In [39]:
key_encoded.head()

Unnamed: 0,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0,0


In [40]:
# Adding keys encoding into the dataset.
df_2 = pd.concat([df_2, key_encoded], axis=1)

In [41]:
print(df_2.shape)
df_2.drop_duplicates(subset=["song_id_name"], keep='first', inplace=True)
print(df_2.shape)

(166251, 1510)
(166251, 1510)


In [42]:
# Checking all the languages.
df_2['langauge'].value_counts().keys()

Index(['en', 'es', 'de', 'pt', 'id', 'it', 'tl', 'tr', 'ko', 'fr', 'fi', 'ja',
       'ca', 'nl', 'sw', 'th', 'ro', 'no', 'et', 'so', 'hu', 'he', 'ru', 'pl',
       'sv', 'da', 'af', 'lt', 'hr', 'sl', 'cy', 'sk', 'zh-cn', 'cs', 'lv',
       'bg', 'zh-tw', 'vi', 'null', 'mk', 'sq', 'uk', 'iw', 'ar', 'el', 'z1',
       'hi', 'zu', 'xh', 'tt', 'ms', 'fa', 'ta', 'mi', 'st', 'sn', 'sm', 'gn',
       'rn', 'qu', 'nn', 'eu', 'tn', 'az', 'am', 'ht', 'ln', 'la', 'ia', 'sr',
       'ne', 'is', 'br', 'co', 'm7', 'ug', 'gl', 'ur'],
      dtype='object', name='langauge')

In [43]:
df_2['language'].head()

0    en
1    en
3    en
4    en
5    en
Name: language, dtype: object

In [44]:
# Encoding all the languages.
df_2['language'] = df_2['language'].fillna('null')
language_encoded = pd.get_dummies(df_2['language'], prefix='lang').astype(int)
language_encoded.head()

Unnamed: 0,lang_af,lang_am,lang_ar,lang_az,lang_bg,lang_br,lang_ca,lang_co,lang_cs,lang_cy,...,lang_tt,lang_ug,lang_uk,lang_ur,lang_vi,lang_xh,lang_z1,lang_zh-cn,lang_zh-tw,lang_zu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
language_encoded = language_encoded.drop(['lang_null'], axis=1)
language_encoded.head()

Unnamed: 0,lang_af,lang_am,lang_ar,lang_az,lang_bg,lang_br,lang_ca,lang_co,lang_cs,lang_cy,...,lang_tt,lang_ug,lang_uk,lang_ur,lang_vi,lang_xh,lang_z1,lang_zh-cn,lang_zh-tw,lang_zu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Adding language encoding into the dataset.
df_2 = pd.concat([df_2, language_encoded], axis=1)

In [48]:
print(df_2.shape)
df_2.drop_duplicates(subset=["song_id_name"], keep='first', inplace=True)
print(df_2.shape)

(166251, 1587)
(166251, 1587)


In [49]:
# Removing unnecessary features.
df_2 = df_2.drop(['id_artists', 'release_date', 'popularity', 'key', 'artists_count', 'song_name_artist', 'artist_1', 'artist_2', 'artist_3', 'artist_4', 'genres', 'supersetted_genres', 'explicit', 'time_signature', 'language'], axis=1)
df_2.head()

Unnamed: 0,id,name,duration_ms,artists,danceability,energy,loudness,mode,speechiness,acousticness,...,lang_tt,lang_ug,lang_uk,lang_ur,lang_vi,lang_xh,lang_z1,lang_zh-cn,lang_zh-tw,lang_zu
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),269666,"[Mark Ronson, Bruno Mars]",0.856,0.609,-7.223,1,0.0824,0.00801,...,0,0,0,0,0,0,0,0,0,0
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,160187,,0.484,0.265,-11.101,1,0.0322,0.394,...,0,0,0,0,0,0,0,0,0,0
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,141987,,0.405,0.365,-10.226,0,0.0289,0.255,...,0,0,0,0,0,0,0,0,0,0
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,157093,,0.477,0.352,-14.165,1,0.03,0.406,...,0,0,0,0,0,0,0,0,0,0
5,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,187333,,0.319,0.201,-17.796,1,0.0623,0.887,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Filling in any null values. (In case)

df_2 = df_2.fillna(0)
df_2.head()

Unnamed: 0,id,name,duration_ms,artists,danceability,energy,loudness,mode,speechiness,acousticness,...,lang_tt,lang_ug,lang_uk,lang_ur,lang_vi,lang_xh,lang_z1,lang_zh-cn,lang_zh-tw,lang_zu
0,32OlwWuMpZ6b0aN2RZOeMS,Uptown Funk (feat. Bruno Mars),269666,"[Mark Ronson, Bruno Mars]",0.856,0.609,-7.223,1,0.0824,0.00801,...,0,0,0,0,0,0,0,0,0,0
1,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,160187,0,0.484,0.265,-11.101,1,0.0322,0.394,...,0,0,0,0,0,0,0,0,0,0
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,141987,0,0.405,0.365,-10.226,0,0.0289,0.255,...,0,0,0,0,0,0,0,0,0,0
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,157093,0,0.477,0.352,-14.165,1,0.03,0.406,...,0,0,0,0,0,0,0,0,0,0
5,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,187333,0,0.319,0.201,-17.796,1,0.0623,0.887,...,0,0,0,0,0,0,0,0,0,0


In [51]:
df_2.shape

(166251, 1572)

### Weighting

In [52]:
numericals = {'danceability': 0.15, 'energy': 0.15, 'loudness': 0.1, 'mode': 0.05, 'speechiness': 0.05, 'acousticness': 0.1, 
    'instrumentalness': 0.1, 'liveness': 0.05, 'valence': 0.15, 'tempo': 0.1}
categoricals = {'genre_': 0.4, 'lang_': 0.3, 'key_': 0.2}
numericals_key = numericals.keys()
categoricals_key = categoricals.keys()

In [53]:
from sklearn.preprocessing import MinMaxScaler
# Normalize numerical features
numerical_features = ['loudness', 'tempo']
scaler = MinMaxScaler()
df_2[numerical_features] = scaler.fit_transform(df_2[numerical_features])

# Standardize Year
df_2['standardized_year'] = scaler.fit_transform(df_2[['year']])


In [54]:
for i in numericals_key:
    df_2[i] = df_2[i] * numericals[i]

In [55]:
for n in categoricals_key:
    for z in df_2.columns:
        if n in z:
            df_2[z] = df_2[z] * categoricals[n]

In [56]:
# # Applying weight to input language.
# df_2['lang_' + language_prority] = df_2['lang_' + language_prority] * 10
# df_2[['name','lang_' + language_prority]].head()

In [57]:
# # Reducing weight from non-input language.
# for i in df_2.columns:
#     if 'lang_' in i:
#         if i != ('lang_' + language_prority):
#             df_2[i] = df_2[i] / 10
# df_2.head()

In [58]:
# # Applying weight to input key.
# df_2['key_' + str(key_priority)] = df_2['key_' + str(key_priority)] * 10
# df_2[['name','key_' + str(key_priority)]].head()

In [59]:
# # Reducing weight from non-input key.
# for i in df_2.columns:
#     if 'key_' in i:
#         if i != ('key_' + str(key_priority)):
#             df_2[i] = df_2[i] / 10
# df_2['key_7'].head()

In [60]:
# # Applying weight to input genre(s).
# superset_genre_priority = list(set(superset_genre_priority))

# if len(superset_genre_priority) != 0:
#     for i in superset_genre_priority:
#         df_2[i] = df_2[i] * 10
# df_2[superset_genre_priority].head()

In [61]:
# # Reducing weight from non-input genre(s).
# for n in superset_genre_priority:
#     for i in df_2.columns:
#         if i in superset_mapping.keys():
#             if i != (n):
#                 df_2[i] = df_2[i] / 10
# df_2['pop'].head()

### Cosine Similarity & Output Extraction

In [62]:
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Feature selection, removing non-useful rows.
def processing(df_2):
    # Create a feature matrix
    feature_matrix = df_2.drop(['id', 'name', 'artists', 'song_id_name', 'langauge', 'year'], axis=1)

    # Apply normalization to the entire feature matrix
    feature_matrix = scaler.fit_transform(feature_matrix)

    # This is the input song. We are isolating it from the dataframe first.
    input_song = feature_matrix[0]
    
    arr = []

    # Returns comparison value of first song in dataset to all others in an array.
    # DataFrame can be customized before Feature Selection to change which song the subject of comparison should be.
    for i in range(1, len(feature_matrix)):
        # Take the i-th song from the feature matrix
        compare_song = feature_matrix[i]
        # Perform cosine similarity.
        cosine_sim = cosine_similarity([input_song], [compare_song])
        cs_value = cosine_sim[0][0]  # Take the single similarity value between the input song and the i-th song.
        arr.append(cs_value)

    return arr

arr = processing(df_2)
arr

[0.6580727742816908,
 0.4576013466324103,
 0.5468273377943134,
 0.4948151390456008,
 0.3757246714504691,
 0.6153388503070858,
 0.595293963749153,
 0.5340595774360716,
 0.4517077278501669,
 0.6042416868793036,
 0.5486330896157751,
 0.5478725459732297,
 0.5961950878075698,
 0.7399010994447908,
 0.49598630858000725,
 0.48771523948108,
 0.4972586138221713,
 0.5197259439057597,
 0.37808258990337873,
 0.3946999606966926,
 0.3932225834228851,
 0.5991437605803194,
 0.6044037316472338,
 0.3960575743659277,
 0.5630278187547897,
 0.39031087116821184,
 0.4814540217071103,
 0.37574070816845095,
 0.3138555469447981,
 0.3780145688937696,
 0.471089825809436,
 0.5151870288005138,
 0.6068773932888729,
 0.5840684184806882,
 0.5884309951507221,
 0.504882334789239,
 0.6115726239661674,
 0.575108475224466,
 0.46650133093876944,
 0.6092674782851524,
 0.46644694098343553,
 0.5519219827398575,
 0.630768102006065,
 0.5212175858896378,
 0.6192843682463398,
 0.5194325593942632,
 0.4783205001692113,
 0.50671671272

In [63]:
max(arr[1:])

0.9990901382568249

In [74]:
def compile_suggestions(arr):
    suggestions = []
    
    # Compiling all song ids that are above the pre-determined threshold for 'like the first song'.
    for i in range(len(arr)):
        if arr[i] > 0.9:
            suggestions.append(i)
    
    print(suggestions)
    return suggestions
suggestions = compile_suggestions(arr)

[422, 1621, 2072, 2728, 3270, 3533, 3555, 3672, 4290, 4605, 5012, 5163, 5770, 6489, 6664, 6866, 7244, 7564, 7654, 7707, 7709, 7761, 7764, 7776, 7840, 7841, 7888, 8141, 8149, 8156, 8188, 8349, 8410, 8468, 8599, 8690, 8811, 8826, 8892, 8912, 8921, 9087, 9369, 9439, 9500, 9548, 9766, 9778, 9908, 9952, 10022, 10130, 10135, 10210, 10283, 10363, 10454, 10630, 10667, 10684, 10868, 10907, 11109, 11156, 11264, 11267, 11277, 11379, 12080, 12093, 12315, 12321, 12841, 12848, 12906, 12908, 12971, 12978, 12987, 13074, 13142, 13234, 13467, 13469, 13597, 13764, 13922, 13930, 14210, 14313, 14395, 14403, 14483, 14522, 14642, 14677, 14730, 14764, 14875, 14939, 15075, 15079, 15142, 15270, 15277, 15285, 15294, 15352, 15676, 15749, 15826, 16144, 16225, 16242, 16767, 16773, 17536, 18074, 18138, 18259, 18262, 18282, 18305, 18383, 18389, 18556, 18584, 18767, 18890, 18947, 19006, 19203, 19815, 20053, 20605, 20971, 21573, 21728, 22175, 22214, 22217, 22247, 22304, 22646, 22886, 23266, 23524, 23573, 23777, 24335, 

In [75]:
len(suggestions)

472

In [76]:
# Extracting out the end result information.
df_main = df_2[df_2.index.isin(suggestions)]

df_main.head()

Unnamed: 0,id,name,duration_ms,artists,danceability,energy,loudness,mode,speechiness,acousticness,...,lang_ug,lang_uk,lang_ur,lang_vi,lang_xh,lang_z1,lang_zh-cn,lang_zh-tw,lang_zu,standardized_year
422,51c94ac31swyDQj9B3Lzs3,Change (In the House of Flies),299533,0,0.04035,0.12795,0.088066,0.0,0.00427,2.5e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2728,6p0ecVFjeSu09Ncb1bUEGR,Scattered Days - 2002 Mix Version,235093,0,0.10035,0.07065,0.085094,0.0,0.00165,0.0382,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238
3270,6sy1E0xbHOhUIShn8RrF8B,If You're Not Scared,197539,0,0.06945,0.11805,0.082351,0.0,0.00176,0.00215,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428571
3533,48jd71gnyIlLdXOYX2qJ6p,Mandy,199320,0,0.06705,0.0954,0.087377,0.05,0.00139,0.0254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857
3555,0FsJ5CUIcAkmuZ3bhwrBH5,Stranger in Blue Suede Shoes - 2003 Remaster,204841,0,0.10815,0.12345,0.083301,0.05,0.003425,0.00599,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429


In [67]:
len(df_main) # There is one less for the last 3 features as the input song did not go through the extracted year's steps.
# df_final['standardized_year'].describe()

425

In [68]:
df_main = df_main.iloc[1:]

df_main = df_main.drop('artists', axis=1)

id_artist_mapping = df.set_index('id')['artists'].to_dict()

# Add 'artist' column to df1 using the mapping
df_main['artists'] = df_main['id'].map(id_artist_mapping)
df_main.head()

  df_main['artists'] = df_main['id'].map(id_artist_mapping)


Unnamed: 0,id,name,duration_ms,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,lang_uk,lang_ur,lang_vi,lang_xh,lang_z1,lang_zh-cn,lang_zh-tw,lang_zu,standardized_year,artists
2728,6p0ecVFjeSu09Ncb1bUEGR,Scattered Days - 2002 Mix Version,235093,0.10035,0.07065,0.085094,0.0,0.00165,0.0382,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,['강수지']
3270,6sy1E0xbHOhUIShn8RrF8B,If You're Not Scared,197539,0.06945,0.11805,0.082351,0.0,0.00176,0.00215,5.41e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428571,"[""K's Choice""]"
3533,48jd71gnyIlLdXOYX2qJ6p,Mandy,199320,0.06705,0.0954,0.087377,0.05,0.00139,0.0254,9.25e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,['Westlife']
3555,0FsJ5CUIcAkmuZ3bhwrBH5,Stranger in Blue Suede Shoes - 2003 Remaster,204841,0.10815,0.12345,0.083301,0.05,0.003425,0.00599,0.079,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429,['Kevin Ayers']
3672,4b3XTlyKkHQWayiE1Z4KlQ,The Things I Should Have Said - 2003 Digital R...,243547,0.0804,0.0546,0.077588,0.05,0.00168,0.0165,1.02e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.52381,['Lindisfarne']


### Recommendations Output

In [69]:
# Generating recc_num of the matched songs at random.
def recommend(df_main, recc_num):
    recs = []
    if len(df_main) == 0:
        print("Sorry! There are no songs similar enough to " + input_song + "!")
    elif len(df_main) < recc_num:
        recc_num = len(df_main)
        print("Uh oh! We only found " + str(recc_num) + " recommendations!")
    print('Here are your recommendations!')
    for c in range(recc_num):
        recc = df_main.sample().to_dict()
        name = list(recc['name'].values())[0]
        artists = (list(recc['artists'].values())[0])[1:len(list(recc['artists'].values())[0])-1]
        year = str(list(recc['year'].values())[0])
        print(str(c + 1) + '. ' + name + ' by ' + artists + ' published in ' + year)
        df_main = df_main.drop(list(recc['id'])[0])
        recs.append((name, year))
    return recs
# May want to try a filter for repreventing duplicate songs in the list.
recs_for_user = recommend(df_main, recc_num) # End Product

Here are your recommendations!
1. HandClap by 'Fitz and The Tantrums' published in 2016
2. Mandy by 'Westlife' published in 2003
3. MegaMan by 'Lil Wayne' published in 2011
4. Mamacita (아야야) by 'SUPER JUNIOR' published in 2014
5. Agalmata by 'Notis Sfakianakis' published in 2002
6. My Tears Are Becoming A Sea by 'M83' published in 2011
7. Lambreta by 'António Zambujo' published in 2012
8. Non fare la puttana by 'Fabri Fibra' published in 2004
9. Dreams in summer night by 'IU' published in 2014
10. Kane Dou by 'Ipohthonios' published in 2008
11. Ai Ai Ai - Felguk & Cat Dealers Remix by 'Vanessa Da Mata', 'Felguk', 'Cat Dealers' published in 2018
12. A State Of Trance Year Mix 2020 (Mixed) - Road To 1000 - Outro by 'Armin van Buuren' published in 2020
13. Szó Fel by 'Hősök' published in 2011
14. Morphium by 'Carlos Perón' published in 2020
15. Get Outta My Dreams, Get Into My Car by 'Billy Ocean' published in 2010


In [70]:
user_id = sp.current_user()["id"]
song_uris = []
for song in recs_for_user:
    result = sp.search(q=f"track:{song[0]} year:{format(float(song[1]), '.0f')}", type="track")
    print(result)
    try:
        uri = result["tracks"]["items"][0]["uri"]
        song_uris.append(uri)
    except IndexError:
        print(f"{song[0]} doesn't exist in Spotify. Skipped.")

playlist = sp.user_playlist_create(user=user_id, name=input_playlist, public=False)
sp.playlist_add_items(playlist_id=playlist["id"], items=song_uris)

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=track%3AHandClap+year%3A2016&type=track&offset=0&limit=10', 'items': [{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4AcHt3JxKy59IX7JNNlZn4'}, 'href': 'https://api.spotify.com/v1/artists/4AcHt3JxKy59IX7JNNlZn4', 'id': '4AcHt3JxKy59IX7JNNlZn4', 'name': 'Fitz and The Tantrums', 'type': 'artist', 'uri': 'spotify:artist:4AcHt3JxKy59IX7JNNlZn4'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 

{'snapshot_id': 'MiwyNWIwOTBiMTI4ZGY2ZjA4ZDc3OTkwMTA0OWYyZmY2YzhhOWViYWFh'}