In [38]:
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations

spotify = pd.read_csv('top_songs.csv')
print(spotify.columns)

Index(['Pop Artist', 'Title', 'Artist', 'Genre', 'Year',
       'Beats Per Minute (BPM)', 'Energy', 'Danceability', 'Loudness (dB)',
       'Liveness', 'Valence', 'Length (Duration)', 'Acousticness',
       'Speechiness', 'Pop Track'],
      dtype='object')


In [39]:
# Selecting useful columns
def select_cols(df):
  return df[['Artist', 'Title', 'Danceability', 'Energy', 'Loudness (dB)', 'Speechiness', 'Acousticness', 'Liveness', 'Valence', 'Pop Artist', 'Genre', 'Pop Track']]
songDF = select_cols(spotify)
songDF.head()

Unnamed: 0,Artist,Title,Danceability,Energy,Loudness (dB),Speechiness,Acousticness,Liveness,Valence,Pop Artist,Genre,Pop Track
0,Norah Jones,Sunrise,53,30,-14,3,94,11,68,1,adult standards,71
1,Deep Purple,Black Night,50,79,-11,7,17,17,81,2,album rock,39
2,Gorillaz,Clint Eastwood,66,69,-9,17,2,7,52,3,alternative hip hop,69
3,Foo Fighters,The Pretender,43,96,-4,4,0,3,37,4,alternative metal,76
4,Bruce Springsteen,Waitin' On A Sunny Day,58,82,-5,3,1,10,87,5,classic rock,59


In [40]:
# Dropping gaps
songDF = songDF.drop(axis=0, index=songDF[songDF['Danceability']==''].index)
for i in ['Danceability', 'Energy', 'Loudness (dB)', 'Speechiness', 'Acousticness', 'Liveness', 'Valence']:
    songDF[i] = songDF[i].astype('float64')
songDF['Pop Track'] = songDF['Pop Track'].astype('int64')
print(songDF.dtypes)

Artist            object
Title             object
Danceability     float64
Energy           float64
Loudness (dB)    float64
Speechiness      float64
Acousticness     float64
Liveness         float64
Valence          float64
Pop Artist         int64
Genre             object
Pop Track          int64
dtype: object


In [41]:
# Convert data
def genre_preprocess(df):
  df['Genre_list'] = df['Genre'].apply(lambda x: x.split(" "))
  return df

songDF = genre_preprocess(songDF)
songDF['Genre_list'].head()

0         [adult, standards]
1              [album, rock]
2    [alternative, hip, hop]
3       [alternative, metal]
4            [classic, rock]
Name: Genre_list, dtype: object

In [42]:
def playlist_process(df):
    df = select_cols(df)
    return df

In [43]:
# sentiment analysis
def sentiment_analysis(df, text_col):
  def getSubjectivity(text):
   return TextBlob(text).sentiment.subjectivity

  def getPolarity(text):
    return TextBlob(text).sentiment.polarity

  def getAnalysis(score, task="polarity"):
    if task == "subjectivity":
      if score < 1/4:
        return "low"
      elif score > 1/4:
        return "high"
      else:
        return "medium"
    else:
      if score < 0:
        return 'Negative'
      elif score == 0:
        return 'Neutral'
      else:
        return 'Positive'

  df['subjectivity'] = df[text_col].apply(getSubjectivity).apply(lambda x: getAnalysis(x,"subjectivity"))
  df['polarity'] = df[text_col].apply(getPolarity).apply(getAnalysis)
  return df

sentiment = sentiment_analysis(songDF, "Title")
sentiment.head()



Unnamed: 0,Artist,Title,Danceability,Energy,Loudness (dB),Speechiness,Acousticness,Liveness,Valence,Pop Artist,Genre,Pop Track,Genre_list,subjectivity,polarity
0,Norah Jones,Sunrise,53.0,30.0,-14.0,3.0,94.0,11.0,68.0,1,adult standards,71,"[adult, standards]",low,Neutral
1,Deep Purple,Black Night,50.0,79.0,-11.0,7.0,17.0,17.0,81.0,2,album rock,39,"[album, rock]",high,Negative
2,Gorillaz,Clint Eastwood,66.0,69.0,-9.0,17.0,2.0,7.0,52.0,3,alternative hip hop,69,"[alternative, hip, hop]",low,Neutral
3,Foo Fighters,The Pretender,43.0,96.0,-4.0,4.0,0.0,3.0,37.0,4,alternative metal,76,"[alternative, metal]",low,Neutral
4,Bruce Springsteen,Waitin' On A Sunny Day,58.0,82.0,-5.0,3.0,1.0,10.0,87.0,5,classic rock,59,"[classic, rock]",low,Neutral


In [44]:
# One-hot encoding for new features
def one_hot(df, column, new_name):
  tf_df = pd.get_dummies(df[column])
  feature_names = tf_df.columns
  tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
  tf_df.reset_index(drop=True, inplace=True)
  return tf_df

In [84]:
# Function to build entire feature set
def create_set(df, float_cols):
  
  # TF-IDF implementation
  tfidf = TfidfVectorizer()
  tfidf_matrix =  tfidf.fit_transform(df['Genre_list'].apply(lambda x: " ".join(x)))
  genre_df = pd.DataFrame(tfidf_matrix.toarray())
  genre_df.columns = ['Genre' + "|" + i for i in tfidf.get_feature_names_out()]
  genre_df.reset_index(drop=True, inplace=True)

  # Sentiment analysis
  df = sentiment_analysis(df, "Title")

  # One-hot encoding
  subject_one_hot = one_hot(df, 'subjectivity', 'subject') 
  polar_one_hot = one_hot(df, 'polarity', 'polar') 

  # Scale float columns
  pop = df[["Pop Artist","Pop Track"]].reset_index(drop = True)
  scaler = MinMaxScaler()
  pop_scaled = pd.DataFrame(scaler.fit_transform(pop), columns=pop.columns) 

  floats = df[float_cols].reset_index(drop = True)
  scaler = MinMaxScaler()
  floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) 

  # Concanenate all features
  final = pd.concat([genre_df, pop_scaled, floats_scaled, subject_one_hot, polar_one_hot], axis = 1) 
  
  final['Title'] = df['Title'].values

  return final

In [85]:
# Generate feature
float_cols = songDF.dtypes[songDF.dtypes == 'float64'].index.values

complete_feature_set = create_set(songDF, float_cols=float_cols)
complete_feature_set.head()

Unnamed: 0,Genre|acid,Genre|acoustic,Genre|adult,Genre|afropop,Genre|alaska,Genre|album,Genre|alternative,Genre|ambient,Genre|americana,Genre|and,...,Acousticness,Liveness,Valence,subject|high,subject|low,subject|medium,polar|Negative,polar|Neutral,polar|Positive,Title
0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.949495,0.092784,0.677083,0,1,0,0,1,0,Sunrise
1,0.0,0.0,0.0,0.0,0.0,0.812795,0.0,0.0,0.0,0.0,...,0.171717,0.154639,0.8125,1,0,0,1,0,0,Black Night
2,0.0,0.0,0.0,0.0,0.0,0.0,0.417048,0.0,0.0,0.0,...,0.020202,0.051546,0.510417,0,1,0,0,1,0,Clint Eastwood
3,0.0,0.0,0.0,0.0,0.0,0.0,0.638244,0.0,0.0,0.0,...,0.0,0.010309,0.354167,0,1,0,0,1,0,The Pretender
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.010101,0.082474,0.875,0,1,0,0,1,0,Waitin' On A Sunny Day


In [86]:
# Test playlist
spotify_test = pd.read_csv('my_playlist.csv')
spotify_test = playlist_process(spotify_test)
spotify_test['Genre_list'] = spotify_test['Genre']
spotify_test['Genre_list'] = spotify_test.loc[:, 'Genre']
spotify_test.head()

Unnamed: 0,Artist,Title,Danceability,Energy,Loudness (dB),Speechiness,Acousticness,Liveness,Valence,Pop Artist,Genre,Pop Track,Genre_list
0,The Majority,One Third,0.495,0.761,-7.496,0.0,0.0934,0.373,0.626,61.0,['freakbeat'],3,['freakbeat']
1,This Cold Night,The Man with Fire in His Mouth,0.532,0.887,-6.2,0.579,0.00908,0.0739,0.811,56.0,['dark post-punk'],14,['dark post-punk']
2,Motorama,Red Drop,0.449,0.827,-6.883,0.835,0.317,0.443,0.877,29.0,"['deep gothic', 'post-punk']",50,"['deep gothic', 'post-punk']"
3,David Bowie,China Girl - 2002 Remaster,0.646,0.633,-10.09,0.0282,0.0155,0.107,0.874,22.0,['art rock'],58,['art rock']
4,Washed Out,Belong,0.508,0.921,-2.664,0.13,0.019,0.381,0.638,48.0,['alternative dance'],30,['alternative dance']


In [87]:
spotify_test[["Artist","Title"]][:10]

Unnamed: 0,Artist,Title
0,The Majority,One Third
1,This Cold Night,The Man with Fire in His Mouth
2,Motorama,Red Drop
3,David Bowie,China Girl - 2002 Remaster
4,Washed Out,Belong
5,Wavves,How Are You
6,Sexy Sushi,Sex Appeal
7,Grandmaster Flash,White Lines (Don't Do It)
8,Semisonic,Secret Smile
9,Madness,Our House


In [63]:
# Extract features
def generate_feature(complete_feature_set, playlist_df):
    complete_feature_set_playlist = complete_feature_set[complete_feature_set['Title'].isin(playlist_df['Title'].values)]

    complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['Title'].isin(playlist_df['Title'].values)]
    complete_feature_set_playlist_final = complete_feature_set_playlist.drop(columns = "Title")
    return complete_feature_set_playlist_final.sum(axis = 0), complete_feature_set_nonplaylist

In [64]:
# Generating the features
complete_feature_set_playlist_vector, complete_feature_set_nonplaylist = generate_feature(complete_feature_set, spotify_test)

In [65]:
# Non-playlist features
complete_feature_set_nonplaylist.head()

Unnamed: 0,Genre|acid,Genre|acoustic,Genre|adult,Genre|afropop,Genre|alaska,Genre|album,Genre|alternative,Genre|ambient,Genre|americana,Genre|and,...,Acousticness,Liveness,Valence,subject|high,subject|low,subject|medium,polar|Negative,polar|Neutral,polar|Positive,Title
0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.949495,0.092784,0.677083,0,1,0,0,1,0,Sunrise
1,0.0,0.0,0.0,0.0,0.0,0.812795,0.0,0.0,0.0,0.0,...,0.171717,0.154639,0.8125,1,0,0,1,0,0,Black Night
2,0.0,0.0,0.0,0.0,0.0,0.0,0.417048,0.0,0.0,0.0,...,0.020202,0.051546,0.510417,0,1,0,0,1,0,Clint Eastwood
3,0.0,0.0,0.0,0.0,0.0,0.0,0.638244,0.0,0.0,0.0,...,0.0,0.010309,0.354167,0,1,0,0,1,0,The Pretender
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.010101,0.082474,0.875,0,1,0,0,1,0,Waitin' On A Sunny Day


In [79]:
def generate_playlist_recom(df, features, nonplaylist_features):
    non_playlist_df = df[df['Title'].isin(nonplaylist_features['Title'].values)]

    # Find cosine similarity
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('Title', axis = 1).values, features.values.reshape(1, -1))[:,0]
    non_playlist_df_top_50 = non_playlist_df.sort_values('sim',ascending = False).head(50)
    
    return non_playlist_df_top_50

In [83]:
# Genreate top 30 recommendation
recommend = generate_playlist_recom(songDF, complete_feature_set_playlist_vector, complete_feature_set_nonplaylist)
print("Based on your likes:")
recommend.head(30)

Based on your likes:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('Title', axis = 1).values, features.values.reshape(1, -1))[:,0]


Unnamed: 0,Artist,Title,Danceability,Energy,Loudness (dB),Speechiness,Acousticness,Liveness,Valence,Pop Artist,Genre,Pop Track,Genre_list,subjectivity,polarity,sim
1233,Supertramp,It's Raining Again,60.0,67.0,-7.0,3.0,62.0,26.0,56.0,1234,album rock,60,"[album, rock]",low,Neutral,0.96099
995,Peter Frampton,Show Me The Way,70.0,73.0,-8.0,3.0,43.0,10.0,64.0,996,album rock,66,"[album, rock]",low,Neutral,0.960766
1020,Electric Light Orchestra,Livin' Thing,55.0,65.0,-7.0,3.0,58.0,12.0,37.0,1021,album rock,70,"[album, rock]",low,Neutral,0.960742
1127,Supertramp,Breakfast In America - Remastered,53.0,66.0,-6.0,3.0,43.0,7.0,76.0,1128,album rock,70,"[album, rock]",low,Neutral,0.959949
967,Led Zeppelin,Kashmir - 1990 Remaster,48.0,54.0,-11.0,4.0,49.0,16.0,58.0,968,album rock,69,"[album, rock]",low,Neutral,0.959158
1044,Peter Gabriel,Solsbury Hill,68.0,63.0,-9.0,3.0,36.0,31.0,48.0,1045,album rock,60,"[album, rock]",low,Neutral,0.959025
1158,Dire Straits,Romeo and Juliet,60.0,57.0,-8.0,3.0,44.0,8.0,49.0,1159,album rock,54,"[album, rock]",low,Neutral,0.95874
1376,Bruce Hornsby,The Way It Is,58.0,69.0,-12.0,3.0,61.0,14.0,53.0,1377,album rock,66,"[album, rock]",low,Neutral,0.955644
897,Thin Lizzy,Whiskey In The Jar,56.0,69.0,-11.0,4.0,53.0,20.0,75.0,898,album rock,63,"[album, rock]",low,Neutral,0.955104
912,The Rolling Stones,Angie,43.0,55.0,-6.0,3.0,67.0,11.0,41.0,913,album rock,72,"[album, rock]",low,Neutral,0.955092
