# Spotify Recommendation Algorithm

### Spotify Authentication

In [1]:
# Import libraries
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn as skl
# `import sklearn` alone does not load its submodules, so the ones accessed
# through the `skl.` prefix further down are imported explicitly here.
import sklearn.ensemble
import sklearn.metrics
import sklearn.neighbors
import sklearn.tree
import spotipy
from spotipy.exceptions import SpotifyException
from spotipy.oauth2 import SpotifyClientCredentials

%matplotlib inline

  import pandas.util.testing as tm


In [2]:
# Set account info.
# The client ID and secret are read from the environment so that no secret is
# stored in the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): any secret that was previously committed should be rotated.
cid = os.environ['SPOTIPY_CLIENT_ID']
secret = os.environ['SPOTIPY_CLIENT_SECRET']
username = 'francescab13'

# Connect and create Spotify instance
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Data Gathering

#### Retrieve track IDs from 'Like' and 'Dislike' playlists

In [3]:
# Page through the 'Likes' playlist, keeping each page of results
pl_id = 'spotify:playlist:2O6XH1ip37KOllmc1KoYEs'
good_ids = []
offset = 0

while True:
    response = sp.playlist_tracks(pl_id, offset=offset, fields='items.track.id,total')
    page_items = response['items']
    good_ids.append(page_items)
    offset += len(page_items)

    # An empty page means every track has been fetched
    if not page_items:
        break

# Collapse the list of pages into one flat list of track records
good_flatten = [item for sublist in good_ids for item in sublist]

# Check good track ID list
good_flatten[0:5]

[{'track': {'id': '75Q69chmd8CEZbVsA4CDMm'}},
 {'track': {'id': '38kjIfRtXsUxXyzhsKwX7i'}},
 {'track': {'id': '1YT8xkroYGNLGR4qhuWLC4'}},
 {'track': {'id': '76gYk9g0bZj47NyIKzjLF6'}},
 {'track': {'id': '7tvuLLroI0n6uYBWuFig5d'}}]

In [4]:
# Page through the 'Dislikes' playlist, keeping each page of results
pl_id = 'spotify:playlist:58KlzYsGNQoujtrQc2CU5d'
bad_ids = []
offset = 0

while True:
    response = sp.playlist_tracks(pl_id, offset=offset, fields='items.track.id,total')
    page_items = response['items']
    bad_ids.append(page_items)
    offset += len(page_items)

    # An empty page means every track has been fetched
    if not page_items:
        break

# Collapse the list of pages into one flat list of track records
bad_flatten = [item for sublist in bad_ids for item in sublist]

# Check bad track ID list
bad_flatten[0:5]

[{'track': {'id': '1YwNlWLf8auhazSQUDQLFU'}},
 {'track': {'id': '1xShPgQbOUa98avWJQFDBY'}},
 {'track': {'id': '3GREm6zSHwKZsJxl0hqbAQ'}},
 {'track': {'id': '0C6EIiQu8CS4eYtOCMEiAd'}},
 {'track': {'id': '0puf9yIluy9W0vpMEUoAnN'}}]

#### Get track characteristic data

In [None]:
# Compile list of 'good' track IDs.
# Entries without a track object or without an ID are skipped rather than
# raising on the nested lookup.
good_id_list = [
    entry['track']['id']
    for entry in good_flatten
    if entry.get('track') and entry['track'].get('id')
]

# Retrieve track characteristics in batches instead of one request per track.
# Spotify's audio-features endpoint accepts at most 100 IDs per call.
AUDIO_FEATURES_BATCH_SIZE = 100
good_features = []
for start in range(0, len(good_id_list), AUDIO_FEATURES_BATCH_SIZE):
    batch = good_id_list[start:start + AUDIO_FEATURES_BATCH_SIZE]
    good_features.append(sp.audio_features(batch))

# Flatten JSON list, dropping any None entries so that only real feature
# records reach the dataframe step
good_features_flat = [
    item
    for sublist in good_features
    for item in (sublist or [])
    if item is not None
]

# Check 'good' features list
good_features_flat[0:3]

In [None]:
# Compile list of 'bad' track IDs.
# Entries without a track object or without an ID are skipped rather than
# raising on the nested lookup.
bad_id_list = [
    entry['track']['id']
    for entry in bad_flatten
    if entry.get('track') and entry['track'].get('id')
]

# Retrieve track characteristics in batches instead of one request per track.
# Spotify's audio-features endpoint accepts at most 100 IDs per call.
AUDIO_FEATURES_BATCH_SIZE = 100
bad_features = []
for start in range(0, len(bad_id_list), AUDIO_FEATURES_BATCH_SIZE):
    batch = bad_id_list[start:start + AUDIO_FEATURES_BATCH_SIZE]
    bad_features.append(sp.audio_features(batch))

# Flatten JSON list, dropping any None entries so that only real feature
# records reach the dataframe step
bad_features_flat = [
    item
    for sublist in bad_features
    for item in (sublist or [])
    if item is not None
]

# Check 'bad' features list
bad_features_flat[0:3]

#### Create dataframes for 'liked' and 'disliked' tracks with audio features

In [None]:
# Build the 'Like' dataframe from the flattened audio-feature records
like_df = pd.DataFrame.from_records(good_features_flat)

# Look up each track's title and first listed artist by its URI
good_song_names, good_artists = [], []
for index, row in like_df.iterrows():
    try:
        response = sp.track(str(row['uri']))
    except SpotifyException:
        # Lookup failed: keep the row aligned by recording placeholders
        track_name = artist_name = 'Unknown'
    else:
        track_name = response['name']
        artist_name = response['artists'][0]['name']
    good_song_names.append(track_name)
    good_artists.append(artist_name)

# Attach the looked-up values as the 'song_name' and 'artist' columns
like_df['song_name'] = good_song_names
like_df['artist'] = good_artists

# Check dataframe
like_df.head()

In [None]:
# Build the 'Dislike' dataframe from the flattened audio-feature records
dislike_df = pd.DataFrame.from_records(bad_features_flat)

# Look up each track's title and first listed artist by its URI
bad_song_names, bad_artists = [], []
for index, row in dislike_df.iterrows():
    try:
        response = sp.track(str(row['uri']))
    except SpotifyException:
        # Lookup failed: keep the row aligned by recording placeholders
        track_name = artist_name = 'Unknown'
    else:
        track_name = response['name']
        artist_name = response['artists'][0]['name']
    bad_song_names.append(track_name)
    bad_artists.append(artist_name)

# Attach the looked-up values as the 'song_name' and 'artist' columns
dislike_df['song_name'] = bad_song_names
dislike_df['artist'] = bad_artists

# Check dataframe
dislike_df.head()

## Exploratory Data Analysis

In [None]:
#Importing the function
from pandas_profiling import ProfileReport

In [None]:
# Build the profiling report for the liked songs and display it
like_profile = ProfileReport(
    like_df,
    title='Liked Songs Pandas Profiling Report',
    explorative=True,
)
like_profile

In [None]:
# Build the profiling report for the disliked songs and display it
dislike_profile = ProfileReport(
    dislike_df,
    title='Disliked Songs Pandas Profiling Report',
    explorative=True,
)
dislike_profile

## Data Visualization

In [None]:
# Audio feature columns used by the distribution, pair and correlation plots
trait_cols = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
]

# Columns grouped as discrete traits.
# NOTE(review): 'tempo' is listed here but is not drawn in the frequency
# plots section; confirm it belongs in this list.
discrete_trait_cols = [
    'key',
    'mode',
    'tempo',
    'time_signature',
]

#### Dist Plots

In [None]:
# One stacked panel per audio feature for the liked songs
fig, ax = plt.subplots(len(trait_cols), figsize=(16,12))

for i, col_val in enumerate(trait_cols):
    sns.distplot(like_df[col_val], hist=True, ax=ax[i])
    ax[i].set_title('Freq dist '+col_val, fontsize=10)
    ax[i].set_xlabel(col_val, fontsize=8)
    # distplot overlays a KDE by default, which normalises the histogram, so
    # the y-axis shows a density rather than a raw count
    ax[i].set_ylabel('Density', fontsize=8)

# Space the stacked panels so titles and axis labels do not overlap
fig.tight_layout()
fig.savefig('like_dist_plots.png')

In [None]:
# One stacked panel per audio feature for the disliked songs
fig, ax = plt.subplots(len(trait_cols), figsize=(16,12))

for i, col_val in enumerate(trait_cols):
    sns.distplot(dislike_df[col_val], hist=True, ax=ax[i])
    ax[i].set_title('Freq dist '+col_val, fontsize=10)
    ax[i].set_xlabel(col_val, fontsize=8)
    # distplot overlays a KDE by default, which normalises the histogram, so
    # the y-axis shows a density rather than a raw count
    ax[i].set_ylabel('Density', fontsize=8)

# Space the stacked panels so titles and axis labels do not overlap
fig.tight_layout()
fig.savefig('dislike_dist_plots.png')

#### Pair Plots

In [None]:
# Pair plot of the audio features for liked songs, written to disk
like_traits = like_df[trait_cols]
like_pairplot = sns.pairplot(like_traits)
like_pairplot.savefig("like_pairplot.png")

In [None]:
# Pair plot of the audio features for disliked songs, written to disk
dislike_traits = dislike_df[trait_cols]
dislike_pairplot = sns.pairplot(dislike_traits)
dislike_pairplot.savefig("dislike_pairplot.png")

#### Correlation Heatmaps

In [None]:
# Pairwise correlations between the audio features of liked songs
corr = like_df[trait_cols].corr()

# Draw the correlation matrix as a heatmap and save the owning figure
like_corr_heatmap = sns.heatmap(corr)
figure = like_corr_heatmap.figure
figure.savefig('like_corr_heatmap.png', dpi=400)

In [None]:
# Pairwise correlations between the audio features of disliked songs
corr = dislike_df[trait_cols].corr()

# Draw the correlation matrix as a heatmap and save the owning figure
dislike_corr_heatmap = sns.heatmap(corr)
figure = dislike_corr_heatmap.figure
figure.savefig('dislike_corr_heatmap.png', dpi=400)

#### Frequency Plots (Discrete Variables)

In [None]:
# Frequency plots of key, mode and time signature for liked songs
fig, ax = plt.subplots(2, 2)

# The column is passed as `x=`: newer seaborn releases no longer accept the
# plotted vector as the first positional argument.
sns.countplot(x=like_df['key'], ax=ax[0,0])
sns.countplot(x=like_df['mode'], ax=ax[0,1])
sns.countplot(x=like_df['time_signature'], ax=ax[1,0])

# Only three features are drawn, so hide the unused fourth panel
ax[1,1].axis('off')
fig.tight_layout()

# No fig.show(): the inline backend renders the figure when the cell ends,
# and show() only emits a non-GUI-backend warning there.
fig.savefig('like_freq_plots.png')

In [None]:
# Frequency plots of key, mode and time signature for disliked songs
fig, ax = plt.subplots(2, 2)

# The column is passed as `x=`: newer seaborn releases no longer accept the
# plotted vector as the first positional argument.
sns.countplot(x=dislike_df['key'], ax=ax[0,0])
sns.countplot(x=dislike_df['mode'], ax=ax[0,1])
sns.countplot(x=dislike_df['time_signature'], ax=ax[1,0])

# Only three features are drawn, so hide the unused fourth panel
ax[1,1].axis('off')
fig.tight_layout()

# No fig.show(): the inline backend renders the figure when the cell ends,
# and show() only emits a non-GUI-backend warning there.
fig.savefig('dislike_freq_plots.png')

## Model Creation/Training

#### Preparation

In [None]:
# Label every row: 1 for liked songs, 0 for disliked songs
for frame, label in ((like_df, 1), (dislike_df, 0)):
    frame['target'] = label

In [None]:
# Stack the liked and disliked frames row-wise into a single dataframe
dfs = [like_df, dislike_df]
full_df = pd.concat(objs=dfs)

In [None]:
# Create the training/test split: 15% of the rows are held out for testing.
# random_state pins the shuffle so the split, and every accuracy figure
# computed from it, is the same on each run.
from sklearn.model_selection import train_test_split
train, test = train_test_split(full_df, test_size=0.15, random_state=0)

In [None]:
# Columns fed to the models as predictors
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature',
]

# Predictor matrix and label vector for each side of the split
x_train, y_train = train[features], train["target"]
x_test, y_test = test[features], test["target"]

#### Decision Tree

In [None]:
# Decision tree that only splits nodes holding at least 100 samples.
# random_state is fixed so repeated runs fit the same tree.
dtc = skl.tree.DecisionTreeClassifier(min_samples_split=100, random_state=0)
dt = dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)

# Accuracy on the held-out set, as a percentage
score = skl.metrics.accuracy_score(y_test, y_pred) * 100
print("Accuracy using Decision Tree: ", round(score, 1), "%")

#### K-Nearest Neighbors

In [None]:
# K-nearest-neighbours classifier voting over the 3 closest training rows
knn = skl.neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)

# Accuracy on the held-out set, as a percentage
score = skl.metrics.accuracy_score(y_test, knn_pred) * 100
# The label previously read "Knn Tree", carried over from the decision-tree cell
print("Accuracy using KNN: ", round(score, 1), "%")

#### AdaBoost/Gradient Boost

In [None]:
# AdaBoost with 100 estimators; seeded so repeated runs fit the same model
ada = skl.ensemble.AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(x_train, y_train)
ada_pred = ada.predict(x_test)

score = skl.metrics.accuracy_score(y_test, ada_pred) * 100
print("Accuracy using ada: ", round(score, 1), "%")

# Gradient boosting with 100 depth-1 trees
gbc = skl.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=.1, max_depth=1, random_state=0)
gbc.fit(x_train, y_train)
predicted = gbc.predict(x_test)

# Use the qualified name: a bare `accuracy_score` is never imported in this
# notebook, so calling it raised NameError.
score = skl.metrics.accuracy_score(y_test, predicted) * 100
print("Accuracy using Gbc: ", round(score, 1), "%")

In [None]:
import sklearn.ensemble

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance

In [None]:
# Report the library versions used for this run.
# Each version is printed under its own library's name; the SciPy version was
# previously labelled "Scikit-Learn".
import sklearn
import scipy

print("Scikit-Learn", sklearn.__version__)
print("SciPy", scipy.__version__)