## Content Based Filtering

In [24]:
# Import modules
import pandas as pd
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# set working directory
os.chdir('/Users/danielliu/TuneBuild')
os.getcwd()

'/Users/danielliu/TuneBuild'

In [26]:
# Loading a temporary example playlist (Would replace with user songs)
user_songs_df = pd.read_parquet('data/intermediate/test.parquet')
user_songs_df.head()

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms_x,album_name,name,danceability,...,id,uri,track_href,analysis_url,duration_ms_y,time_signature,artist_pop,genres,track_pop,artists_song
413,59,The Killers,7oK9VyNzrYvRFo7nQEYkWN,spotify:artist:0C0XlULifJtAgn6ZNCW2eu,Mr. Brightside,spotify:album:4undIeGmofnAYKhnDclN1w,222586,Hot Fuss,Mom's playlist,0.356,...,7oK9VyNzrYvRFo7nQEYkWN,spotify:track:7oK9VyNzrYvRFo7nQEYkWN,https://api.spotify.com/v1/tracks/7oK9VyNzrYvR...,https://api.spotify.com/v1/audio-analysis/7oK9...,222587,4,80,alternative_rock dance_rock modern_rock perman...,78,The KillersMr. Brightside
1234,18,Rihanna,6qn9YLKt13AGvpq9jfO8py,spotify:artist:5pKCCKE2ajJHZ9KAiaK11H,We Found Love,spotify:album:2g1EakEaW7fPTZC6vBmBCn,215226,Talk That Talk,Mom's playlist,0.734,...,6qn9YLKt13AGvpq9jfO8py,spotify:track:6qn9YLKt13AGvpq9jfO8py,https://api.spotify.com/v1/tracks/6qn9YLKt13AG...,https://api.spotify.com/v1/audio-analysis/6qn9...,215227,4,90,barbadian_pop dance_pop pop pop_rap urban_cont...,77,RihannaWe Found Love
1363,32,American Authors,5j9iuo3tMmQIfnEEQOOjxh,spotify:artist:0MlOPi3zIDMVrfA9R04Fe3,Best Day Of My Life,spotify:album:2AAVQqcejMEgNpdg2raPYE,194240,"Oh, What A Life",Mom's playlist,0.67,...,5j9iuo3tMmQIfnEEQOOjxh,spotify:track:5j9iuo3tMmQIfnEEQOOjxh,https://api.spotify.com/v1/tracks/5j9iuo3tMmQI...,https://api.spotify.com/v1/audio-analysis/5j9i...,194240,4,70,indie_poptimism modern_alternative_rock modern...,0,American AuthorsBest Day Of My Life
1579,38,Clean Bandit,5HuqzFfq2ulY1iBAW5CxLe,spotify:artist:6MDME20pz9RveH9rEXvrOM,Rather Be (feat. Jess Glynne),spotify:album:2xVeccmEU0zklK4XSKiDCW,227833,I Cry When I Laugh,Mom's playlist,0.799,...,5HuqzFfq2ulY1iBAW5CxLe,spotify:track:5HuqzFfq2ulY1iBAW5CxLe,https://api.spotify.com/v1/tracks/5HuqzFfq2ulY...,https://api.spotify.com/v1/audio-analysis/5Huq...,227833,4,80,dance_pop edm pop pop_dance tropical_house uk_...,53,Clean BanditRather Be (feat. Jess Glynne)
1732,17,Sia,4VrWlk8IQxevMvERoX08iC,spotify:artist:5WUlDfRSoLAfcVSX1WnrxN,Chandelier,spotify:album:3xFSl9lIRaYXIYkIn3OIl9,216120,1000 Forms Of Fear,Mom's playlist,0.399,...,4VrWlk8IQxevMvERoX08iC,spotify:track:4VrWlk8IQxevMvERoX08iC,https://api.spotify.com/v1/tracks/4VrWlk8IQxev...,https://api.spotify.com/v1/audio-analysis/4VrW...,216120,5,89,australian_dance australian_pop pop,81,SiaChandelier


In [27]:
# Load in the whole feature set
all_song_features = pd.read_csv('data/intermediate/all_song_features.csv')
all_song_features.drop(columns=['artist_name', 'track_name'], inplace=True)

In [28]:
# Generate the playlist features -> Would otherwise when loading in actual user data, get the features myself using Spotify API
def generate_playlist_feature(all_song_features, user_songs_df):
    """
    Summarizes a users songs into a single vector

    Args:
        complete_feature_set (_type_): _description_
        user_songs_df (_type_): _description_

    Returns:
        _type_: _description_
    """
    # Get all features for user songs
    user_songs_features_df = all_song_features[all_song_features['id'].isin(user_songs_df['id'].values)]
    user_songs_features_df_final = user_songs_features_df.drop(columns=['id', 'genres'])
    # Get all features for non user songs
    non_user_songs_features_df = all_song_features[~all_song_features['id'].isin(user_songs_df['id'].values)]

    return user_songs_features_df_final.sum(axis=0), non_user_songs_features_df

In [29]:
# Generate the features
user_songs_features_vector, non_user_songs_features_df = generate_playlist_feature(all_song_features, user_songs_df)

In [34]:
user_songs_features_vector

danceability             47.098178
energy                   59.966000
key                      32.545455
loudness                 67.823981
mode                     60.000000
                           ...    
genre|zambian_hip_hop     0.000000
genre|zhongguo_feng       0.000000
genre|zolo                0.000000
genre|zouk                0.000000
genre|zouk_riddim         0.000000
Length: 2159, dtype: float64

In [35]:
non_user_songs_features_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,genre|yacht_rock,genre|ye_ye,genre|yodeling,genre|york_indie,genre|zambian_hip_hop,genre|zhongguo_feng,genre|zolo,genre|zouk,genre|zouk_riddim,id
0,0.914980,0.813,0.363636,0.842733,0.0,0.125780,0.031225,0.007005,0.0471,0.811623,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0UaMYEvWZi0ZqiDOoHU3YI
1,0.783401,0.838,0.454545,0.893573,0.0,0.118503,0.025000,0.025126,0.2420,0.925852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6I9VzXrHxO9rA9A5euc8Ak
2,0.672065,0.758,0.181818,0.851050,0.0,0.218295,0.002390,0.000000,0.0598,0.702405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0WqIKmW4BTrj3eJFmnCKMv
3,0.902834,0.714,0.363636,0.859462,0.0,0.146570,0.201807,0.000235,0.0521,0.818637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1AWQoqb9bSvzTjaLralEkT
4,0.863360,0.606,0.000000,0.882707,1.0,0.074116,0.056325,0.000000,0.3130,0.655311,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1lzr43nnXAijIGYnCT8M8H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34242,0.677126,0.228,0.181818,0.762849,1.0,0.071726,0.795181,0.065327,0.0944,0.402806,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3uCHI1gfOUL5j5swEh0TcH
34243,0.498988,0.727,0.090909,0.875777,1.0,0.225572,0.087651,0.000000,0.1290,0.289579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0P1oO2gREMYUCoOkzYAyFu
34244,0.710526,0.524,0.636364,0.785298,1.0,0.082432,0.333333,0.055578,0.2980,0.265531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2oM4BuruDnEvk59IvIXCwn
34245,0.515182,0.286,0.727273,0.721378,1.0,0.127859,0.403614,0.000012,0.1310,0.259519,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4Ri5TTUgjM96tbQZd5Ua7V


Apply cosine similarity

In [37]:
song_df = pd.read_csv('data/intermediate/song_df.csv')
song_df.head()

Unnamed: 0,id,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genres,artist_pop,track_pop
0,0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4,-7.105,0,0.121,0.0311,0.00697,0.0471,0.81,125.461,"['dance_pop', 'hip_hop', 'hip_pop', 'pop', 'po...",74,69
1,6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0.774,0.838,5,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,"['dance_pop', 'pop', 'post-teen_pop']",84,83
2,0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0.664,0.758,2,-6.583,0,0.21,0.00238,0.0,0.0598,0.701,99.259,"['dance_pop', 'pop', 'r&b']",86,25
3,1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0.892,0.714,4,-6.055,0,0.141,0.201,0.000234,0.0521,0.817,100.972,"['dance_pop', 'pop']",82,79
4,1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0.853,0.606,0,-4.596,1,0.0713,0.0561,0.0,0.313,0.654,94.759,"['pop_rap', 'reggae_fusion']",75,2


In [32]:
# Keep only songs what aren't user songs to potentially recommend
non_user_songs_complete = song_df[song_df['id'].isin(non_user_songs_features_df['id'].values)]
# Find cosine similarity between non-user songs and complete song set
non_user_songs_features_arr = non_user_songs_features_df.drop(['id', 'genres'], axis = 1).values   # Convert it into an array

In [33]:
# non_user_songs_features_df.drop(['id', 'genres'], axis = 1)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,genre|xtra_raw,genre|yacht_rock,genre|ye_ye,genre|yodeling,genre|york_indie,genre|zambian_hip_hop,genre|zhongguo_feng,genre|zolo,genre|zouk,genre|zouk_riddim
0,0.914980,0.813,0.363636,0.842733,0.0,0.125780,0.031225,0.007005,0.0471,0.811623,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.783401,0.838,0.454545,0.893573,0.0,0.118503,0.025000,0.025126,0.2420,0.925852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.672065,0.758,0.181818,0.851050,0.0,0.218295,0.002390,0.000000,0.0598,0.702405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.902834,0.714,0.363636,0.859462,0.0,0.146570,0.201807,0.000235,0.0521,0.818637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.863360,0.606,0.000000,0.882707,1.0,0.074116,0.056325,0.000000,0.3130,0.655311,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34242,0.677126,0.228,0.181818,0.762849,1.0,0.071726,0.795181,0.065327,0.0944,0.402806,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34243,0.498988,0.727,0.090909,0.875777,1.0,0.225572,0.087651,0.000000,0.1290,0.289579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34244,0.710526,0.524,0.636364,0.785298,1.0,0.082432,0.333333,0.055578,0.2980,0.265531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34245,0.515182,0.286,0.727273,0.721378,1.0,0.127859,0.403614,0.000012,0.1310,0.259519,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
cosine_scores = cosine_similarity(non_user_songs_features_arr, user_songs_features_vector.values.reshape(1, -1))[:, 0]

In [13]:
# An array where each value represents the similarity between the summarized user songs vector
# and each of songs in the complete songs list
cosine_scores

array([0.81444823, 0.84134303, 0.81707144, ..., 0.85058829, 0.83244994,
       0.76693122])

In [14]:
# Assign cosine scores to non user songs df
non_user_songs_complete['cosine_score'] = cosine_scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_user_songs_complete['cosine_score'] = cosine_scores


In [15]:
# The recommended playlist based on cosine similarity.
non_user_songs_complete.sort_values('cosine_score', ascending=False).head(30)

Unnamed: 0,id,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genres,artist_pop,track_pop,cosine_score
17819,5iMg31WQw8WdCHrvYWhloq,Jesse L. Martin,Today 4 U,0.724,0.94,7,-3.155,1,0.0507,0.043,4.3e-05,0.142,0.684,132.006,['unknown'],53,49,0.970281
32493,2bopUyK6ll0CZFw03TDKAY,Superfruit,Imaginary Parties,0.616,0.708,7,-4.011,1,0.224,0.0409,0.0,0.143,0.599,177.945,['unknown'],63,43,0.9664
20288,1Z5L1sgdPUPfcuxsfaYzcB,Outasight,Tonight Is The Night,0.626,0.784,7,-3.874,1,0.0515,0.0372,0.0,0.182,0.555,120.031,['unknown'],44,47,0.963516
9708,4QKhhlgwDnXBwk23S0dzJd,R. Kelly,Ignition - Viceroy Remix,0.612,0.865,1,-3.739,1,0.0635,0.0982,0.000163,0.0835,0.772,135.99,['unknown'],70,39,0.962776
22634,4FoneIhLnjo0KQwZ3sNmxx,Cherub,Hold Me,0.609,0.793,8,-5.212,1,0.12,0.0371,0.000449,0.166,0.719,192.057,['unknown'],60,36,0.962732
4574,3UAXVxOVqegxni5snW3Lht,R. Kelly,Step In the Name of Love,0.747,0.581,6,-9.9,1,0.0347,0.265,0.000421,0.0756,0.537,92.983,['unknown'],70,43,0.96181
10343,713bTySu02xdpMMap8UPQv,Cloud 5,No Behavior Shell Down,0.755,0.914,5,-3.873,1,0.0357,0.132,0.0,0.105,0.722,128.004,['unknown'],38,38,0.961127
21967,6K5xY7EabiBLjd2HuBQIak,Who Is Fancy,Goodbye,0.717,0.692,5,-6.735,1,0.0466,0.0635,0.00049,0.0923,0.602,114.997,['unknown'],41,51,0.961043
14855,71eRCfoq3g4qeLNcR75Hig,Wild Cub,Thunder Clatter,0.6,0.968,6,-2.444,1,0.0329,0.301,2e-06,0.19,0.635,123.02,['unknown'],40,44,0.960633
6777,6U4VqEHy4n5VeiH4pQPL24,Dwayne Johnson,You're Welcome,0.788,0.745,5,-7.072,1,0.258,0.29,0.0,0.0373,0.672,135.278,['unknown'],71,73,0.960455


Notice: The above doesn't contain a good collection of songs. Perhaps it isn't great to pull from all their songs. Consider this, maybe better just for individual playlists (i.e. do it for each one the above process) and create an entire playlist in the end combining say 5 songs from each of the playlists they have on their spotify and recommended songs based on those playlists. This is where creating classes is particularly useful.

Also, rename variables.  

From here, I can create playlists on their spotipy.