In [1]:
from pandas import read_csv, DataFrame
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
from random import sample

# load required files

In [2]:
# Metrics available without feature weights: method name -> (cdist metric, cdist kwargs).
# Insertion order matters: the first entry is the reference ranking used to order
# the intersection result when no method is requested.
_UNWEIGHTED_METRICS = {
    "l1_norm": ("minkowski", {"p": 1}),
    "l2_norm": ("minkowski", {"p": 2}),
    "mahalanobis": ("mahalanobis", {}),
    "cosine": ("cosine", {}),
}

# Minkowski orders available when feature weights are supplied: method name -> p.
_WEIGHTED_MINKOWSKI_ORDERS = {"l1_norm": 1, "l2_norm": 2}

_DEFAULT_PLAYLIST_NUM = 20


def _rank_by_distance(database, user_request, metric, **metric_kwargs):
    """Return the index labels of ``database`` ordered from nearest to farthest from ``user_request``."""
    distances = cdist(database.values, user_request, metric, **metric_kwargs)
    if distances.shape[1] != 1:
        raise ValueError("user_request must contain exactly one row")
    # A stable sort keeps tied rows in database order, so results are deterministic.
    order = distances[:, 0].argsort(kind="stable")
    return database.index[order].to_list()


def _common_nearest(rankings, playlist_num):
    """Return ids shared by the top of every ranking, ordered by the first ranking."""
    primary = rankings[0]
    # Position lookup replaces list.index (linear per key); setdefault keeps the
    # first occurrence, which is what list.index would have returned.
    first_position = {}
    for position, track_id in enumerate(primary):
        first_position.setdefault(track_id, position)

    total = len(primary)
    # Position 0 (the nearest row) is always skipped; the first window therefore
    # holds playlist_num + 1 candidates and grows by one per iteration.
    window_end = playlist_num + 2
    while True:
        common = set(primary[1:window_end])
        for other in rankings[1:]:
            common.intersection_update(other[1:window_end])
        # Stop once enough ids are shared, or once the window spans every row
        # (otherwise a small database would make this loop run forever).
        if len(common) >= playlist_num or window_end >= total:
            break
        window_end += 1
    return sorted(common, key=first_position.__getitem__)


def distance_based_recommendation(database=None, user_request=None, method=None, weights=None, playlist_num=None):
    """
    Recommend the tracks of ``database`` that lie closest to ``user_request``.

    The nearest row is always skipped -- presumably because the requested track
    itself is expected to be part of ``database`` (distance 0); confirm with callers.

    :param database: DataFrame of (scaled) features, indexed by track id
    :param user_request: scaled 2D array holding exactly one row of requested features
    :param method: None, 'l1_norm', 'l2_norm', 'mahalanobis' or 'cosine'
                   (only the two norms are supported together with weights).
                   With None, the intersection of all available methods is used.
    :param weights: optional list of floats (one per feature) summing to 1
    :param playlist_num: integer, number of tracks wanted (defaults to 20)
    :return: list of recommended track ids. For a single method this has at most
             ``playlist_num`` entries; for the intersection it has at least
             ``playlist_num`` entries when the database is large enough (it may
             hold a few more), and fewer when the database is too small.
    :raises ValueError: if database/user_request are missing or method is not supported
    """
    if database is None or user_request is None:
        raise ValueError("database and user_request are required")
    if playlist_num is None:
        playlist_num = _DEFAULT_PLAYLIST_NUM
    if len(database) == 0 or playlist_num <= 0:
        return []

    if weights is None:
        metrics = dict(_UNWEIGHTED_METRICS)
    else:
        metrics = {name: ("minkowski", {"p": order, "w": weights})
                   for name, order in _WEIGHTED_MINKOWSKI_ORDERS.items()}

    if method is None:
        # No method requested: intersect the rankings of every available method.
        rankings = [_rank_by_distance(database, user_request, metric, **metric_kwargs)
                    for metric, metric_kwargs in metrics.values()]
        return _common_nearest(rankings, playlist_num)

    if method not in metrics:
        raise ValueError("unsupported method %r; expected one of %s or None"
                         % (method, sorted(metrics)))
    metric, metric_kwargs = metrics[method]
    ranked_ids = _rank_by_distance(database, user_request, metric, **metric_kwargs)
    return ranked_ids[1:playlist_num + 1]

In [3]:
# Feature table indexed by track_id; later cells also read a "cluster_number" column from it.
features_frame = read_csv("database_features.csv", index_col="track_id")
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code --
# only load artefacts produced by this project, never files from untrusted sources.
# Collection of track ids used below to skip requested tracks (presumably those lacking feature data -- confirm).
null_idx = load("null_indexes.joblib")
# Scaler applied to the requested features before they are passed to the cluster model.
scaler = load("database_scaler.joblib")
# Cluster model used to assign a requested track to a cluster (presumably k-means, per the name).
kmeans = load("model_cluster.joblib")
# Projection used to transform the final playlist features (presumably PCA, per the name).
pca = load("database_pca.joblib")
# The track data comes from a csv file here (or from a database connection); it is required to pass the requested data to the front-end
data = read_csv("spotify_dataset.csv", index_col="track_id")

# input from front-end

In [15]:
# Simulated front-end request: 50 distinct rows sampled from the csv data and
# converted to a list of dictionaries, each HAVING a "track_id" key that is
# later set as the index.
dummy_input = (
    data
    .sample(n=50, replace=False)
    .reset_index()
    .to_dict(orient="records")
)

In [16]:
dummy_input_frame = DataFrame(dummy_input).set_index("track_id")
# Ensure feature values exist: requested tracks listed in null_idx cannot be
# used, so report each of them and drop them all in a single operation
# (dropping inside the loop copied the whole frame once per track).
unusable_track_ids = [track_id for track_id in dummy_input_frame.index.to_list() if track_id in null_idx]
for track_id in unusable_track_ids:
    print("No recommendation as not enough data in spotify database available!")
dummy_input_frame = dummy_input_frame.drop(labels=unusable_track_ids, axis=0)

# Features passed to the scaler, in this column order.
dummy_input_features = dummy_input_frame[["danceability", "energy", "acousticness",
                                          "instrumentalness", "tempo", "duration_ms"]]
dummy_input_features_scaled = DataFrame(data=scaler.transform(dummy_input_features),
                                        index=dummy_input_features.index, columns=dummy_input_features.columns)
x_input = dummy_input_features_scaled.values
# Request options: no explicit method, one weight per feature column above.
method_input = None
weights_input = [0.5, 0.2, 0.2, 0.05, 0.05, 0.0]
playlist_num_input = None
# Fall back to 20 tracks when the request does not specify a playlist size.
playlist_num = 20 if playlist_num_input is None else playlist_num_input

# run back-end algorithms

In [17]:
result = set()
for request_row in x_input:
    request_2d = request_row.reshape(1, -1)
    # assign input to its cluster
    label = kmeans.predict(request_2d)
    # find cluster members of the input; the last column is dropped
    # (presumably "cluster_number" -- confirm it is the last column of features_frame)
    cluster_memebers = features_frame[features_frame["cluster_number"] == label[0]].iloc[:, :-1]
    # select the tracks closest to the input from its cluster members
    res = distance_based_recommendation(cluster_memebers, request_2d, method=method_input, weights=weights_input, playlist_num=playlist_num)
    # pool the neighbours of every requested track
    result.update(res)

# Randomly trim the pooled candidates down to the playlist size. When there are
# not more candidates than requested, keep them all: previously
# playlist_track_ids stayed undefined (or stale from an earlier run) in that case.
candidate_track_ids = list(result)
if len(candidate_track_ids) > playlist_num:
    playlist_track_ids = sample(candidate_track_ids, playlist_num)
else:
    playlist_track_ids = candidate_track_ids
# Project the playlist features and keep the first two components as coordinates.
playlist_trans = pca.transform(features_frame.loc[playlist_track_ids].iloc[:, :-1].values)
x, y = playlist_trans[:, 0], playlist_trans[:, 1]

In [18]:
# Look up the rows of the csv data for the selected playlist track ids and display them.
data.loc[playlist_track_ids]

Unnamed: 0_level_0,artist_name,track_name,album_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6l4qLohQGGoTZFhgGjvNoJ,Amon Amarth,Where Silent Gods Stand Guard (Live),Versus the World (Bonus Edition),20.0,0.572,0.953,3.0,-6.893,0.0,0.0609,3e-05,0.193,0.748,0.34,127.071,349187.0,4.0
0LWHXQTNw6rCTJoMsWJEwy,Nneka,Lucifer (No Doubt),Soul Is Heavy,29.0,0.57,0.806,1.0,-5.807,1.0,0.131,0.114,1e-06,0.146,0.426,142.133,274653.0,4.0
4G3umh74WXg9x1yqCY0MMx,Vedat Pektaş,Sevgisin (feat. Adrienne Neusch),Hayranım,7.0,0.475,0.548,9.0,-8.742,0.0,0.0307,0.398,0.0,0.107,0.102,95.02,500808.0,4.0
7hCG0nI8WJ5r6Dix33ZWNb,Pet Shop Boys,Twenty-Something - The Los Evo Jedis Remix,Twenty-Something,10.0,0.579,0.777,8.0,-5.896,1.0,0.0332,0.0834,0.0137,0.107,0.543,174.053,239643.0,4.0
5lXzA4vTFCUHtuj6q09OOo,Johnny Gill,Touch,Let's Get The Mood Right,20.0,0.589,0.441,4.0,-11.919,0.0,0.164,0.629,1.1e-05,0.224,0.613,61.07,251560.0,4.0
7ikcifduaSrDA6k6TElaJ3,J. Robert Spencer,Why Stay? / A Promise - Medley,Next To Normal,0.0,0.579,0.474,5.0,-7.219,1.0,0.438,0.681,0.0,0.235,0.367,68.169,155693.0,4.0
1GOyFhXF8S0CllzqP37YoX,Blue Six,Music & Wine - Th'Attaboy Vocal,Bare Essentials Vol 1,26.0,0.781,0.432,4.0,-10.023,0.0,0.0447,0.00136,0.721,0.0808,0.725,124.99,431987.0,4.0
2Oo0vpuHXVERXdt8fimDu7,Johnson Family Band,In My Coffee,Old Ruby,2.0,0.465,0.568,9.0,-5.437,0.0,0.0389,0.369,0.0,0.233,0.844,96.139,314587.0,4.0
0m5m3YymJLNFaWWymk2nsh,Blackfoot Gypsies,Too Bad,Handle It,2.0,0.5,0.953,2.0,-3.86,1.0,0.0317,0.000886,0.000191,0.18,0.636,93.792,181173.0,4.0
0Z05CjMjfazQFvvFiKYNnV,Pet Shop Boys,Love etc. - Pet Shop Boys mix,Love etc.,4.0,0.577,0.963,9.0,-6.534,0.0,0.03,0.00412,0.147,0.376,0.87,127.983,376892.0,4.0


# The only case where I need to connect to the database: retrieving data for the front-end (uri, uri-analysis)