## Import the Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spotipy
import yaml
from spotipy.oauth2 import SpotifyOAuth
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

## Extract the Dataset

In [2]:
# Load the previously pickled Spotify exports: top artists, top tracks,
# the saved playlist tracks and the recommendation tracks.
# NOTE(review): unpickling can execute arbitrary code - only load files you created yourself.
(df_top_artist,
 df_top_tracks,
 df_playlist_tracks,
 df_recommendation) = map(pd.read_pickle, (
    'spotify/top_artists.pkl',
    'spotify/top_tracks.pkl',
    'spotify/playlist_tracks.pkl',
    'spotify/recommendation_tracks.pkl',
))

## Data Preprocessing

In [3]:
# Rescale the popularity rating from 0-100 down to 0-1
df_top_tracks['popularity'] = df_top_tracks['popularity'].div(100)
df_playlist_tracks['popularity'] = df_playlist_tracks['popularity'].div(100)

# Keep only the first occurrence of every song title in the playlist
df_playlist_tracks = df_playlist_tracks.drop_duplicates(subset=['name'], keep='first')

# Discard the columns that are not needed for the recommenders
df_top_artist = df_top_artist.drop(columns=['uri', 'type'])
df_top_tracks = df_top_tracks.drop(columns=[
    'type', 'is_local', 'album_artist_id', 'album_artist_name', 'album_tracks',
    'album_type', 'album_genres', 'uri', 'track_href', 'analysis_url',
])
df_playlist_tracks = df_playlist_tracks.drop(columns=[
    'type', 'is_local', 'album_artist_id', 'album_artist_name',
    'album_tracks', 'album_type', 'album_genres', 'playlist_id',
    'playlist_name', 'playlist_tracks', 'added_at', 'added_by', 'uri',
    'track_href', 'analysis_url',
])

In [4]:
# Build the combined track table in one pipeline:
#   1. stack the top tracks on top of the playlist tracks,
#   2. keep the first row for every song title,
#   3. replace missing values with 0,
#   4. renumber the rows (the previous labels are kept in an 'index' column).
df_tracks = (
    pd.concat([df_top_tracks, df_playlist_tracks], axis=0)
    .drop_duplicates(subset=['name'], keep='first')
    .fillna(0)
    .reset_index()
)

## Collaborative Recommendation

In [13]:
# Select the numeric columns and leave out the ones that should not act as features
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df_tracks_numeric = (
    df_tracks
    .select_dtypes(include=numerics)
    .drop(columns=['index', 'duration_ms', 'disc_number', 'track_number'])
)

# Min-max scale every feature and store the result as a CSR sparse matrix
standardization = MinMaxScaler()
x_val = standardization.fit_transform(df_tracks_numeric.values)
df_tracks_numeric = csr_matrix(x_val)

# Fit the K Nearest Neighbors model using cosine distance
knn = NearestNeighbors(metric='cosine', n_neighbors=15, algorithm='auto')
knn.fit(df_tracks_numeric)

NearestNeighbors(metric='cosine', n_neighbors=15)

In [39]:
#Get the recommendation from some of the songs using collaborative recommendation
def recommend_songs_collaborative(song_name,n_songs):
    """Recommend the tracks closest to a seed song according to the fitted KNN model.

    The seed is the first row of ``df_tracks`` whose ``name`` contains
    ``song_name``. Its row of ``df_tracks_numeric`` is used to query ``knn``.

    Parameters
    ----------
    song_name : str
        Text to look for inside the track name. It is matched literally
        (not as a regular expression) and case-sensitively.
    n_songs : int
        Number of recommendations wanted. Fewer rows are returned when the
        data set does not hold that many other tracks.

    Returns
    -------
    pandas.DataFrame or str
        A frame indexed from 1 with the columns ``Title`` and ``Distance``,
        ordered from the closest track to the farthest, or the message
        ``"No songs found. Please check your input"`` when nothing matches.
    """
    not_found = "No songs found. Please check your input"
    if not song_name:
        return not_found

    # regex=False: titles frequently contain '(', '?', '+', ... which must be taken literally.
    # na=False: names that are not strings (e.g. the 0 written by fillna) simply do not match.
    matches = df_tracks['name'].str.contains(song_name, regex=False, na=False)
    if not matches.any():
        return not_found

    # Row position of the seed track; it addresses df_tracks_numeric as well
    seed_pos = int(np.flatnonzero(matches.to_numpy(dtype=bool))[0])

    # Ask for one extra neighbour because the seed itself is normally returned,
    # but never for more neighbours than there are rows.
    n_wanted = max(int(n_songs), 0)
    n_query = min(n_wanted + 1, df_tracks_numeric.shape[0])
    distances, indices = knn.kneighbors(df_tracks_numeric[seed_pos], n_neighbors=n_query)

    # ravel() (unlike squeeze()) still yields a sequence when only one neighbour comes back
    neighbours = sorted(zip(indices.ravel().tolist(), distances.ravel().tolist()),
                        key=lambda pair: pair[1])

    # Remove the seed by position: with tied distances it is not guaranteed to be listed first
    neighbours = [pair for pair in neighbours if pair[0] != seed_pos][:n_wanted]

    titles = df_tracks['name']
    recommend_list = [{'Title': titles.iloc[pos], 'Distance': dist} for pos, dist in neighbours]
    return pd.DataFrame(recommend_list,
                        columns=['Title', 'Distance'],
                        index=range(1, len(recommend_list) + 1))

# Smoke test: ask the collaborative recommender for 10 songs
df_recommended_songs = recommend_songs_collaborative(song_name='Careless Whisper', n_songs=10)

## Content Recommendation

### TF-IDF Vectorizer

In [15]:
# Prepare the text fields that are fed to the TF-IDF Vectorizer

# Join each track's genres into a single space-separated string
df_tracks['genres_combine'] = df_tracks['genres'].map(' '.join)

# Lower-case the artist and album names and strip every whitespace character,
# so that each name collapses into one token
df_tracks['artist_name'] = df_tracks['artist_name'].map(lambda text: ''.join(text.lower().split()))
df_tracks['album_name'] = df_tracks['album_name'].map(lambda text: ''.join(text.lower().split()))

# Concatenate genres, artist and album into the 'overview' text of each track
df_tracks['overview'] = (
    df_tracks['genres_combine']
    + ' ' + df_tracks['artist_name']
    + ' ' + df_tracks['album_name']
)
df_content_recommendation = df_tracks[['name', 'overview']]

In [16]:
# Configure the TF-IDF Vectorizer: word unigrams and bigrams, English stop words
# removed, document-frequency limits of 0.003 and 0.5, at most 5000 features
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.003,
    max_df=0.5,
    max_features=5000,
)

# Learn the vocabulary and turn every track overview into a TF-IDF row
track_matrix = vectorizer.fit_transform(df_tracks['overview'])

In [17]:
# Cosine similarity between the TF-IDF rows of every pair of tracks
cos_sim = cosine_similarity(track_matrix)

# Track titles, used later to turn row positions back into song names
track_title = df_tracks['name']

In [18]:
#Get the recommendation from some of the songs using content recommendation
def recommend_songs_content(song_name,n_songs):
    """Recommend the tracks whose TF-IDF overview is most similar to a seed song.

    The seed is the first row of ``df_tracks`` whose ``name`` contains
    ``song_name``; its row of ``cos_sim`` supplies the similarity scores.

    Parameters
    ----------
    song_name : str
        Text to look for inside the track name. It is matched literally
        (not as a regular expression) and case-sensitively.
    n_songs : int
        Number of recommendations wanted. The seed itself is never part of
        the result, so up to ``n_songs`` other tracks are returned.

    Returns
    -------
    pandas.DataFrame or str
        A frame with the columns ``Title`` and ``Score`` ordered from the
        most to the least similar track, or the message
        ``"No songs found. Please check your input"`` when ``song_name`` is
        empty or matches no track.
    """
    not_found = "No songs found. Please check your input"
    if not song_name:
        return not_found

    # regex=False: titles frequently contain '(', '?', '+', ... which must be taken literally.
    # na=False: names that are not strings (e.g. the 0 written by fillna) simply do not match.
    matches = df_tracks['name'].str.contains(song_name, regex=False, na=False)
    if not matches.any():
        # Previously only the length of song_name was checked, so an unknown
        # title crashed with an IndexError instead of reaching this message.
        return not_found

    seed_pos = int(np.flatnonzero(matches.to_numpy(dtype=bool))[0])
    scores = np.asarray(cos_sim[seed_pos])

    # Highest similarity first; the stable sort keeps the original row order for ties
    order = np.argsort(-scores, kind='stable')

    # Drop the seed BEFORE truncating, otherwise one recommendation is lost
    order = order[order != seed_pos][:max(int(n_songs), 0)]

    return pd.DataFrame({
        'Title': track_title.iloc[order].reset_index(drop=True),
        'Score': pd.Series(scores[order]),
    })
    
# Smoke test: ask the content recommender for 20 songs
df_recommended_songs_content = recommend_songs_content(song_name='Careless Whisper', n_songs=20)

## Hybrid Recommendation

In [38]:
#Create a recommendation by combining content recommendation and collaborative recommendation
def recommend_songs_hybrid(song_name, n, content_weight=2, collaborative_weight=1):
    """Rank tracks by a weighted mix of content similarity and KNN closeness.

    The seed is the first row of ``df_tracks`` whose ``name`` contains
    ``song_name``. Every other track gets

    * a content score: its entry in the seed's row of ``cos_sim``;
    * a collaborative score: the inverse of its ``knn`` distance to the seed,
      divided by the largest inverse so that the closest track scores 1.

    Parameters
    ----------
    song_name : str
        Text to look for inside the track name. It is matched literally
        (not as a regular expression) and case-sensitively.
    n : int
        Number of rows to return.
    content_weight : float, default 2
        Multiplier applied to the content score.
    collaborative_weight : float, default 1
        Multiplier applied to the collaborative score.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title`` and ``Combined_Score``, best track first. The frame
        is empty when ``song_name`` is empty or matches no track.
    """
    no_result = pd.DataFrame(columns=['Title', 'Combined_Score'])
    if not song_name:
        return no_result

    # regex=False: titles frequently contain '(', '?', '+', ... which must be taken literally.
    # na=False: names that are not strings (e.g. the 0 written by fillna) simply do not match.
    matches = df_tracks['name'].str.contains(song_name, regex=False, na=False)
    if not matches.any():
        return no_result

    # Row position of the seed; it addresses cos_sim and df_tracks_numeric alike
    seed_pos = int(np.flatnonzero(matches.to_numpy(dtype=bool))[0])
    n_tracks = len(df_tracks)

    #Content Recommendation
    df_recommended = pd.DataFrame({'Title': track_title.to_numpy(),
                                   'Score': np.asarray(cos_sim[seed_pos])})

    #Collaborative Recommendation
    distances, indices = knn.kneighbors(df_tracks_numeric[seed_pos], n_neighbors=n_tracks)
    # Align the distances with the row positions instead of joining on the title
    distance_by_pos = pd.Series(distances.ravel(), index=indices.ravel())
    df_recommended['Distance'] = distance_by_pos.reindex(df_recommended.index).to_numpy()

    # Remove the seed by position (with tied distances it is not necessarily the
    # first neighbour) and order the rest by content score, best first.
    df_recommended = (df_recommended
                      .drop(index=seed_pos)
                      .dropna(subset=['Distance'])
                      .sort_values(by='Score', ascending=False, kind='stable')
                      .reset_index(drop=True))

    #Standardize the value of distance as the lower score indicate the better tracks in distance
    # A distance of exactly 0 (identical features) used to produce inf here and
    # NaN/0 after the normalisation; clipping keeps the inverse finite.
    closeness = 1 / df_recommended['Distance'].clip(lower=np.finfo(float).eps)
    df_recommended['Distance'] = closeness / closeness.max()

    #Calculate the combined score from respective weight of recommendation
    df_recommended['Combined_Score'] = (df_recommended['Score'] * content_weight
                                        + df_recommended['Distance'] * collaborative_weight)
    df_recommended = df_recommended.sort_values(by='Combined_Score', ascending=False, kind='stable')
    df_recommended = df_recommended.drop(['Score','Distance'], axis=1)
    return df_recommended[:n]
    
# Smoke test: ask the hybrid recommender for 10 songs
df_recommended = recommend_songs_hybrid(song_name='Careless Whisper', n=10)

## Create A Playlist

In [36]:
#Create a playlist with a list of songs (maximum 20)
def create_playlist(songs, recommender, n_songs):
    """Create a private Spotify playlist filled with recommendations for ``songs``.

    Parameters
    ----------
    songs : list of str
        Seed song names, at most 20.
    recommender : callable
        Called as ``recommender(song, n_songs)`` for every seed. It is expected
        to return a DataFrame with a ``Title`` column; any other return value
        (for example a "not found" message) or an empty frame is skipped.
    n_songs : int
        Number of recommendations requested per seed.

    Returns
    -------
    str or None
        A message when more than 20 seeds are given or when no recommendation
        could be produced; otherwise ``None`` after the playlist was created.

    Notes
    -----
    Credentials and the user name are read from ``spotify/spotify.yml``
    (keys ``client_id``, ``client_secret``, ``redirect_url``, ``user``).
    """
    if len(songs) > 20:
        return 'Please choose a maximum of 20 songs'

    # Collect the frames first and concatenate once instead of growing a frame in the loop
    frames = []
    for song in songs:
        df_song = recommender(song, n_songs)
        if isinstance(df_song, pd.DataFrame) and not df_song.empty:
            frames.append(df_song)
    if not frames:
        # Nothing to add: do not create an empty playlist on the account
        return 'No recommendations found. Please check your input'

    df_new_playlist = pd.concat(frames, axis = 0, ignore_index = True)

    # The same track can be recommended for several seeds; add it only once
    df_new_playlist = df_new_playlist.drop_duplicates(subset = 'Title', keep = 'first')

    #Combine the new playlist full of recommendation with original playlist track
    df_new_playlist = pd.merge(df_new_playlist, df_tracks, how='left', left_on=['Title'], right_on=['name'])

    # Keep usable ids only (missing ids are NaN after the merge or 0 after fillna), in order, without repeats
    track_ids = list(dict.fromkeys(
        track_id for track_id in df_new_playlist['id']
        if isinstance(track_id, str) and track_id
    ))

    #Access the Spotify Client ID and URI
    with open('spotify/spotify.yml') as spotify:
        spotify_details = yaml.safe_load(spotify)

    #Define the scope 
    scope = 'playlist-modify-private'

    #Access the Spotify API
    sp = spotipy.Spotify(auth_manager = SpotifyOAuth(
        client_id = spotify_details['client_id'],
        client_secret = spotify_details['client_secret'],
        redirect_uri = spotify_details['redirect_url'],
        scope = scope))

    # Create a new playlist for tracks to add - you may also add these tracks to your source playlist and proceed
    new_playlist = sp.user_playlist_create(user = spotify_details['user'],
                                           name = "Machine Learning Recommendation System",
                                           public = False,
                                           collaborative = False,
                                           description = "Created with automated own recommender systems",
                                          )

    #Add the tracks into the playlist
    # Send the ids in batches (the Spotify endpoint accepts up to 100 items per
    # request) instead of issuing one request per track.
    batch_size = 100
    for start in range(0, len(track_ids), batch_size):
        sp.user_playlist_add_tracks(user = spotify_details['user'],
                                    playlist_id = new_playlist['id'],
                                    tracks = track_ids[start:start + batch_size])

In [37]:
# Favourite songs that seed the recommendations for the new playlist
combine_songs = [
    'Careless Whisper',
    'Boulevard of Broken Dreams',
    'Sunflower',
    'Use Somebody',
]

# Build the playlist from 10 hybrid recommendations per seed song
create_playlist(songs=combine_songs, recommender=recommend_songs_hybrid, n_songs=10)