In [7]:
import numpy as np
import pandas as pd

import pitched_recommend

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AffinityPropagation
from sklearn import cluster, metrics

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

import timeit

# Use seaborn's "darkgrid" style and "Set3" colour palette for every plot below.
sns.set(style="darkgrid", palette="Set3")

# Draw matplotlib figures inline in the notebook output ...
%matplotlib inline
# ... and request the high-resolution 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'

In [8]:
# Location of the pickled model that every later cell reads from.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file before running the notebook.
MODEL_PATH = '/Users/meshchd/Downloads/saved_model.pkl'

# SECURITY: pickle.load can execute arbitrary code stored in the file, so only
# load model files that come from a trusted source.
# NOTE(review): unpickling presumably needs the module that defines the model
# class to be importable (likely why pitched_recommend is imported) - confirm.
with open(MODEL_PATH, 'rb') as model_file:
    model = pickle.load(model_file)

In [6]:
# Show the dimensions of the playlist factor matrix held by the loaded model.
model.playlist_vecs.shape

(411218, 200)

In [9]:
# Wrap the playlist factor matrix in a DataFrame: rows are indexed by
# model.playlists and the columns are labelled factor1 .. factorN, where N is
# the width of model.playlist_vecs.
factor_columns = ['factor{}'.format(k + 1) for k in range(model.playlist_vecs.shape[1])]
df = pd.DataFrame(model.playlist_vecs, index=model.playlists, columns=factor_columns)
df.head()

Unnamed: 0,factor1,factor2,factor3,factor4,factor5,factor6,factor7,factor8,factor9,factor10,...,factor191,factor192,factor193,factor194,factor195,factor196,factor197,factor198,factor199,factor200
spotify:playlist:0000mSEZofZjMa5x6ooFMT,2.50275,-1.228707,-0.282741,-3.071208,-1.398009,0.398109,1.479157,2.860332,-0.804759,1.881027,...,-0.022632,-4.557347,4.709409,1.53099,1.843507,-0.038009,-0.410174,2.70517,-2.385634,2.075729
spotify:playlist:0003EPZgOqan8RIyLg3zIE,0.336441,3.074456,-2.366162,-2.259032,1.749526,2.240573,1.42273,3.37126,2.589835,1.69287,...,-2.232575,1.71943,0.241225,0.23672,-5.213932,-2.225398,1.264466,1.810506,0.631505,0.048048
spotify:playlist:000H8pt2u6SWMvNtTksPbC,-2.665002,-2.174338,-2.054785,-2.392505,-3.084473,-0.656535,-0.28297,1.256239,-1.694255,2.444017,...,-0.945915,-2.825642,3.204344,0.241877,-0.611005,-0.653071,3.114747,0.09005,-0.968317,-1.273156
spotify:playlist:000IDLrAdMwaTycyWNqphh,-0.173717,1.135722,0.017455,0.370019,0.342356,0.226051,-0.042602,0.807535,-0.265244,0.192498,...,0.438603,-1.635264,0.918143,-0.376109,-1.094644,-0.281336,0.581373,0.041825,0.243192,0.314021
spotify:playlist:000T04pNN05jXDimzYbl4b,-1.14061,0.501446,-0.765112,-4.181076,-2.387796,2.149458,-0.436093,-3.800027,-0.712147,1.238158,...,2.337934,0.451322,-1.039385,-2.624017,-4.437568,-0.000322,-1.066878,-0.318335,4.319573,0.773506


In [10]:
# Confirm the DataFrame has the same dimensions as the underlying factor matrix.
df.shape

(411218, 200)

In [None]:
# Keep only the first 200,000 rows of df (all columns) as a smaller working set.
df_small = df.iloc[:200000]
df_small.shape

In [None]:
# Wrap the ISRC factor matrix in a DataFrame indexed by model.isrcs.
df_chris = pd.DataFrame(data=model.isrc_vecs, index=model.isrcs)
# Label the columns factor1 .. factorN from the width of this frame itself.
# The labels used to be sized from model.playlist_vecs, which only works while
# the playlist and ISRC matrices happen to have the same number of factors.
df_chris.columns = ['factor'+str(x) for x in range(1, df_chris.shape[1]+1)]
df_chris.head()

In [None]:
# Turn the index into an ordinary column (the index becomes a default integer range).
# NOTE(review): this rebinds df_chris, so the cell is not idempotent - running
# it again resets the index of the already-reset frame. Run it exactly once.
df_chris = df_chris.reset_index(drop=False)

In [None]:
# Upload the ISRC factor table to BigQuery; if_exists='replace' overwrites any
# existing destination table.
df_chris.to_gbq(
    destination_table='adhoc.isrc_factors_for_chris',
    project_id='umg-comm-tech-dev',
    if_exists='replace',
)

In [None]:
# Also write the ISRC factor table to a local Feather file.
# NOTE(review): absolute, machine-specific output path - adjust before running elsewhere.
df_chris.to_feather('/Users/meshchd/Downloads/isrc_factors_for_chris.feather')

### DBSCAN

In [None]:
# Function to run clustering algorithms and review their output

def dbscan_tuning(df,eps_list,min_samples_list):
    """Fit DBSCAN for every (eps, min_samples) combination and summarise each fit.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Data to cluster. It is passed unchanged to ``DBSCAN.fit`` and to
        ``metrics.silhouette_score``.
    eps_list : sequence
        Candidate values for DBSCAN's ``eps``.
    min_samples_list : sequence
        Candidate values for DBSCAN's ``min_samples``. Every value is combined
        with every entry of ``eps_list``.

    Returns
    -------
    pandas.DataFrame
        One row per combination (eps varies slowest), with the columns
        ``eps``, ``min_sample``, ``n_clusters`` (distinct labels, not counting
        the noise label -1), ``unlabelled_prop`` (fraction of points labelled
        -1) and ``silhouette_coeff``.

    Notes
    -----
    The silhouette coefficient is computed on all points with the labels
    exactly as DBSCAN returns them, so noise points (-1) count as one group.
    When a fit yields fewer than two distinct labels, or as many labels as
    samples, the coefficient is undefined and is recorded as NaN so the
    remaining combinations still run.
    """
    result_columns = ['eps','min_sample','n_clusters','unlabelled_prop','silhouette_coeff']
    rows = []

    # default_timer() reads a clock. timeit.timeit() with no arguments instead
    # benchmarks an empty statement, so differencing two calls is not an
    # elapsed time.
    start = timeit.default_timer()

    for eps_i in eps_list:
        for min_j in min_samples_list:

            print("Running iteration for eps ", eps_i, " min_samples ", min_j)

            labels = DBSCAN(eps=eps_i, min_samples=min_j).fit(df).labels_
            n_labels = len(set(labels))
            n_clusters_ = n_labels - (1 if -1 in labels else 0)
            unlabelled_prop = float((labels == -1).sum())/labels.shape[0]

            # silhouette_score raises unless 2 <= n_labels <= n_samples - 1.
            if 2 <= n_labels <= labels.shape[0] - 1:
                silhouette = metrics.silhouette_score(df, labels)
            else:
                silhouette = float('nan')

            # Collect plain records and build the frame once at the end.
            # Chained assignment such as df_res['eps'][c] = ... is unreliable
            # and does nothing under pandas copy-on-write.
            rows.append((eps_i, min_j, n_clusters_, unlabelled_prop, silhouette))

    end = timeit.default_timer()
    print("elapsed time: ", end-start)

    return pd.DataFrame(rows, columns=result_columns)

In [None]:
# Range of clustering parameters
# (dbscan_tuning fits one DBSCAN model for every eps / min_samples combination)

eps_list=[50]
min_samples_list=[3]

In [None]:
# Run the DBSCAN grid on the full playlist factor table and keep the per-fit summary.
df_res = dbscan_tuning(df, eps_list=eps_list, min_samples_list=min_samples_list)

In [None]:
# Show up to the first ten rows of the tuning summary (one row per parameter combination).
df_res.head(10)

In [None]:
# Plot the silhouette coefficient against the number of clusters, one point per DBSCAN fit.
ax = sns.scatterplot(data=df_res, x="n_clusters", y="silhouette_coeff")

### Affinity Propagation

In [None]:
# Fit affinity propagation on the reduced playlist table, with progress output enabled.
# NOTE(review): affinity propagation works on pairwise similarities between all
# samples, so time and memory presumably grow quadratically with the number of
# rows - confirm this is feasible for a frame as large as df_small.
af = AffinityPropagation(verbose=True).fit(df_small)

In [None]:
# Derive everything this cell prints from the fitted estimator and the data it
# was fitted on. The cell used to read n_clusters_, X and labels, none of which
# exist at this point on a fresh kernel.
labels = af.labels_
n_clusters_ = len(af.cluster_centers_indices_)  # one exemplar index per cluster

print('Estimated number of clusters: %d' % n_clusters_)
# df_small is the data af was fitted on, so its rows line up with af.labels_.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(df_small, labels, metric='sqeuclidean'))

In [None]:
# Keep the fitted estimator's outputs under short names: the per-sample cluster
# labels and the indices of the samples chosen as cluster centres.
labels = af.labels_
cluster_centers_indices = af.cluster_centers_indices_