In [1]:
import numpy as np
import pandas as pd

import pitched_recommend

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn import cluster, metrics

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

# Set the seaborn theme (darkgrid style, Set3 palette) for the plots drawn below.
sns.set(style="darkgrid", palette="Set3")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Request the 'retina' (high-resolution) figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'

In [2]:
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run on
# another machine without editing it. Consider a configurable data directory.
MODEL_PATH = '/Users/meshchd/Downloads/recommender_model_model_20190504.pkl'

# SECURITY: pickle.load can execute arbitrary code while deserialising, so only
# load files from a trusted source.
# Presumably the `pitched_recommend` import above is what lets pickle resolve the
# model's class — TODO confirm.
with open(MODEL_PATH, 'rb') as model_file:
    model = pickle.load(model_file)

In [3]:
# One row per playlist (indexed by model.playlists) and one column per component
# of the model's playlist vectors, named "factor1" .. "factorN".
n_factors = model.playlist_vecs.shape[1]
factor_columns = ['factor' + str(k) for k in range(1, n_factors + 1)]
df = pd.DataFrame(data=model.playlist_vecs, index=model.playlists, columns=factor_columns)
df.head()

Unnamed: 0,factor1,factor2,factor3,factor4,factor5,factor6,factor7,factor8,factor9,factor10,...,factor191,factor192,factor193,factor194,factor195,factor196,factor197,factor198,factor199,factor200
spotify:playlist:0000mSEZofZjMa5x6ooFMT,0.381671,4.340792,-1.436632,2.492162,-0.077138,2.750425,0.692091,-2.326943,-0.939862,-2.059456,...,-2.235541,-0.296701,-1.148543,-1.758255,1.660909,5.379026,-0.291825,-1.676979,2.763846,1.001892
spotify:playlist:0003EPZgOqan8RIyLg3zIE,1.095493,-1.625466,4.47572,0.537307,-0.210129,-0.762072,-2.743342,0.807074,7.351343,-3.486826,...,3.537256,-2.377482,-2.372458,-4.638627,0.412767,0.448122,-1.298666,-2.450981,5.587291,5.880754
spotify:playlist:000H8pt2u6SWMvNtTksPbC,0.550884,-0.166518,0.682681,-1.25167,-2.490389,-0.329858,0.481624,5.699526,1.322156,1.290797,...,0.271886,0.099851,2.013634,0.632457,2.816938,0.240414,-2.903885,5.586271,-5.823388,4.404129
spotify:playlist:000IDLrAdMwaTycyWNqphh,0.563976,-0.305774,1.505696,-1.138674,0.289151,0.143725,-0.369537,-0.770468,-0.67903,0.513139,...,-0.160115,-0.102442,1.711999,0.837739,-0.324243,0.518,0.144128,-0.704987,-0.601334,0.619642
spotify:playlist:000T04pNN05jXDimzYbl4b,-0.759061,4.314501,2.325355,-0.824923,-2.003628,-0.356305,4.351025,-0.28191,2.411831,-1.049518,...,-0.551718,2.670736,-1.979285,2.621447,-1.966242,0.157424,-0.549873,0.273648,1.155447,-1.852015


In [None]:
# Summary statistics for each factor column of the playlist-vector frame.
df.describe()

In [4]:
# Function to run clustering algorithms and review their output

def dbscan_tuning(df, eps_list, min_samples_list):
    """Fit DBSCAN for every (eps, min_samples) combination and summarise each fit.

    Parameters
    ----------
    df : array-like or DataFrame
        Feature matrix, passed unchanged to ``DBSCAN.fit`` and to
        ``metrics.silhouette_score``.
    eps_list : sequence
        Values to try for DBSCAN's ``eps`` parameter.
    min_samples_list : sequence
        Values to try for DBSCAN's ``min_samples`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per combination (eps varies slowest), indexed 0..n-1, with columns
        ``eps``, ``min_sample``, ``n_clusters`` (distinct labels excluding the
        noise label -1), ``unlabelled_prop`` (share of samples labelled -1) and
        ``silhouette_coeff``. ``silhouette_coeff`` is NaN when the score is
        undefined for the labelling (see below) instead of aborting the search.
    """
    result_columns = ['eps', 'min_sample', 'n_clusters', 'unlabelled_prop', 'silhouette_coeff']

    # Collect plain records and build the frame once at the end. Writing cells via
    # chained indexing (frame[col][row] = value) is unreliable in pandas and leaves
    # every column as object dtype.
    records = []

    for eps_i in eps_list:
        for min_j in min_samples_list:

            print("Running iteration for eps ", eps_i)

            labels = np.asarray(DBSCAN(eps=eps_i, min_samples=min_j).fit(df).labels_)
            n_samples = labels.shape[0]

            noise_count = int(np.count_nonzero(labels == -1))
            n_labels = len(np.unique(labels))
            # DBSCAN marks noise with the label -1; it is not a cluster.
            n_clusters_ = n_labels - (1 if noise_count else 0)

            # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
            # (e.g. everything is noise, or a single cluster swallowed all points).
            # Note: as in the original analysis, noise points are scored as if the
            # -1 label were a cluster of its own.
            if 2 <= n_labels <= n_samples - 1:
                silhouette = metrics.silhouette_score(df, labels)
            else:
                silhouette = np.nan

            records.append({
                'eps': eps_i,
                'min_sample': min_j,
                'n_clusters': n_clusters_,
                'unlabelled_prop': float(noise_count) / n_samples,
                'silhouette_coeff': silhouette,
            })

    return pd.DataFrame(records, columns=result_columns)

In [5]:
# Range of clustering parameters
#
# Each list is one axis of the grid searched by dbscan_tuning: every eps value is
# paired with every min_samples value, and each pair is passed to
# DBSCAN(eps=..., min_samples=...). Only a single combination is configured here.

eps_list=[5]
min_samples_list=[2]

In [6]:
# Run DBSCAN on the playlist factor frame for every (eps, min_samples) combination;
# the result holds one summary row per combination.
df_res = dbscan_tuning(df,eps_list,min_samples_list)

Running iteration for eps  5


In [7]:
# Show the first (up to 10) rows of the tuning results.
df_res.head(10)

Unnamed: 0,eps,min_sample,n_clusters,unlabelled_prop,silhouette_coeff
0,5,2,88168,0.463513,0.0524667


In [None]:
# Silhouette coefficient against number of clusters, one point per parameter
# combination. The columns are coerced to numeric first because the results frame
# may hold them as object dtype.
plot_data = df_res[["n_clusters", "silhouette_coeff"]].apply(pd.to_numeric, errors="coerce")

fig, ax = plt.subplots()
sns.scatterplot(x="n_clusters", y="silhouette_coeff", data=plot_data, ax=ax)
ax.set(xlabel="Number of clusters", ylabel="Silhouette coefficient",
       title="DBSCAN tuning: silhouette coefficient vs. number of clusters");