In [2]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd

In [21]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [29]:
# Read the five CSV inputs (paths are relative to the working directory) and
# bind each resulting frame to its own name, in the same order as the files.
(
    top_artist_df,
    top_tracks_df,
    saved_tracks_df,
    playlist_tracks_df,
    songs,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "top_artists.csv",
        "top_tracks.csv",
        "saved_tracks.csv",
        "playlists.csv",
        "data.csv",
    )
)

In [8]:
# Keep only the first row for each distinct `id`.
# drop=True discards the old index instead of inserting it as an extra "index"
# column, which also keeps this cell safe to run more than once.
playlist_tracks_df = playlist_tracks_df.drop_duplicates(subset='id', keep="first").reset_index(drop=True)

In [None]:
# Preview the first five rows of the de-duplicated playlist frame.
playlist_tracks_df.head(n=5)

# Labelling favourite playlists/songs as 1 and the rest as 0

In [10]:
# Ids that are treated as "favourite" (label 1) when rating playlist rows.
favourites = [
    '37i9dQZF1DWZqNqPemiefM',
    '37i9dQZF1DX6AWGsjpYHPA',
    '37i9dQZF1EUMDoJuT8yJsl',
    '37i9dQZF1EJEY0JyptNWDG',
]

In [18]:
# Playlist rows: 1 when the row's `id` is listed in `favourites`, otherwise 0.
playlist_tracks_df['ratings'] = playlist_tracks_df['id'].isin(favourites).astype(int)

# Every top track is labelled as a favourite.
top_tracks_df['ratings'] = 1

# Saved tracks: 1 when the track id also occurs among the top-track ids.
# `value in series` tests membership in the Series *index* (0, 1, 2, ...), not
# in its values, so the comparison has to go through isin() on the id values.
saved_tracks_df['ratings'] = saved_tracks_df['track/id'].isin(top_tracks_df['id']).astype(int)

In [30]:
import random

# Assign each song a random 0/1 rating (one label per row of `songs`).
# A dedicated, seeded generator makes the labels -- and therefore every
# downstream split, fit and report -- reproducible across kernel restarts
# without touching the global `random` state.
label_rng = random.Random(42)
songs['ratings'] = [label_rng.choice([0, 1]) for _ in range(len(songs))]

In [32]:
# Columns of `songs` used as model inputs.
feature_columns = [
    'popularity',
    'explicit',
    'duration_ms',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

# Feature matrix and target vector.
X = songs[feature_columns]
y = songs['ratings']

In [33]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ranking features by importance (recursive feature elimination with a random forest; takes a long time on a large dataset)

In [None]:
# Recursive feature elimination with 2-fold stratified cross-validation,
# scored by ROC AUC, using a small random forest as the ranking estimator.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rfecv = RFECV(estimator=rf, step=1, n_jobs=-1, cv=StratifiedKFold(2), verbose=1, scoring='roc_auc')
rfecv.fit(X_train, y_train)

print(f'Optimal number of features: {rfecv.n_features_}')

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# per-subset mean CV score now lives in `cv_results_`. Fall back to the old
# attribute only when running on a release that predates `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

# With step=1 and the default minimum of one feature, the i-th score
# corresponds to a subset of i features (1-based).
fig, ax = plt.subplots()
ax.set_xlabel('Number of features selected')
ax.set_ylabel('Cross validation score')
ax.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
# Fit a shallow random forest to obtain one importance value per feature.
rfc = RandomForestClassifier(n_estimators=1000, max_depth=4, random_state=42)
rfc.fit(X_train, y_train)

# Map each feature's column position in X_train to its importance.
important_features_dict = dict(enumerate(rfc.feature_importances_))

# Column positions ordered from most to least important.
important_features_list = sorted(
    important_features_dict,
    key=lambda position: important_features_dict[position],
    reverse=True,
)


# Training models
Each model is evaluated on the held-out test set, and the evaluation is generated by scikit-learn's `classification_report` function.

In [None]:
# Min-max scale the training features. The fitted scaler and the scaled frame
# are created here because the logistic-regression cell trains on them.
minmax_scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(
    minmax_scaler.fit_transform(X_train),
    columns=X_train.columns,
)

# Hyper-parameter grid for the random forest.
rfc_gcv_parameters = {
    'min_samples_leaf': [1, 3, 5, 8],
    'max_depth': [3, 4, 5, 8, 12, 16, 20],
}

# Exhaustive search over the grid with 2-fold stratified CV, scored by ROC AUC.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42)
rfe_gcv = GridSearchCV(
    rfc,
    rfc_gcv_parameters,
    scoring='roc_auc',
    cv=StratifiedKFold(2),
    n_jobs=-1,
    verbose=1,
)
rfe_gcv.fit(X_train, y_train)

# Report per-class metrics of the refit best model on the test split.
rf_test_predictions = rfe_gcv.predict(X_test)
print(classification_report(y_test, rf_test_predictions))

In [None]:
# Logistic regression trained on the min-max scaled training features.
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)

# The model was fitted on scaled inputs, so the test features must go through
# the same already-fitted scaler (transform only, never refit on test data)
# before predicting; predicting on raw X_test would feed it values on a
# completely different scale.
X_test_scaled = pd.DataFrame(minmax_scaler.transform(X_test), columns=X_test.columns)
print(classification_report(y_test, lr.predict(X_test_scaled)))

In [None]:
# Gradient-boosted trees, tuned by exhaustive grid search with 2-fold
# stratified CV and ROC AUC scoring.
xgb = XGBClassifier(n_estimators=1000, random_state=42, use_label_encoder=False)
xgb_gcv_parameters = {'eta': [0.2, 0.3, 0.4, 0.5],
                      'min_child_weight': [0.3, 0.5, 0.8, 1],
                      'max_depth': [3, 4, 5, 8, 12, 16, 20],
                     }
xgb_gcv = GridSearchCV(xgb, xgb_gcv_parameters, n_jobs=-1, cv=StratifiedKFold(2), verbose=1, scoring='roc_auc')
xgb_gcv.fit(X_train, y_train)

# A bare expression in the middle of a cell is neither displayed nor stored,
# so print the winning estimator and its CV score explicitly.
print(xgb_gcv.best_estimator_, xgb_gcv.best_score_)
print(classification_report(y_test, xgb_gcv.predict(X_test)))

In [None]:
# k-nearest neighbours, tuning k over 1..9 with 2-fold stratified CV and
# ROC AUC scoring.
# NOTE(review): the model is fitted on the unscaled features; KNN is
# distance-based, so columns with large magnitudes (e.g. duration_ms) likely
# dominate -- consider using the min-max scaled features here. TODO confirm.
knn = KNeighborsClassifier(n_jobs=-1)
knn_gcv_params = {'n_neighbors': range(1, 10)}

knn_gcv = GridSearchCV(knn, knn_gcv_params, n_jobs=-1, cv=StratifiedKFold(2), verbose=1, scoring='roc_auc')
knn_gcv.fit(X_train, y_train)

# A bare expression in the middle of a cell is neither displayed nor stored,
# so print the best parameters and their CV score explicitly.
print(knn_gcv.best_params_, knn_gcv.best_score_)
print(classification_report(y_test, knn_gcv.predict(X_test)))