In [1]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from slugify import slugify
import ast
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import booleanize
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

from scipy.stats import hmean
from collections import defaultdict
from copy import deepcopy

import math
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [2]:
def custom_round(input_matrix):
    """Round every entry of ``input_matrix`` to four significant figures.

    Each entry is divided by the power of ten matching its order of
    magnitude, rounded to three decimals and scaled back. Zeros stay zero.
    The order of magnitude is taken from the absolute value, so negative
    entries are rounded symmetrically instead of becoming NaN (``log10`` of
    a negative number is undefined).

    Parameters
    ----------
    input_matrix : array-like supporting boolean-mask assignment
        Numeric values to round, e.g. a ``numpy.ndarray``. Not modified.

    Returns
    -------
    The rounded values, with the same shape as ``input_matrix``.
    """
    # np.abs builds a new object, so the caller's data is left untouched.
    magnitude = np.abs(input_matrix)
    # Treat zeros as 1 so log10 stays finite; 0 / 10**0 still rounds to 0.
    magnitude[magnitude == 0] = 1

    exponent = np.floor(np.log10(magnitude))
    # log10(inf) is inf; use a scale of 10**0 for such entries.
    exponent[np.isinf(exponent)] = 0
    scale = 10 ** exponent

    return scale * np.round(input_matrix / scale, 3)
    

In [3]:
# Load the playlist table and discard the "Various Artists" column.
# NOTE(review): unpickling can execute arbitrary code -- only load
# "playlists.pkl" from a trusted source.
playlists_df = pd.read_pickle("playlists.pkl").drop(
    columns=["Various Artists"]
)

# Columns from position 10 up to (but excluding) the last one; the fitted
# factors further down reuse this frame's index and column labels.
default_data = playlists_df[playlists_df.columns[10:-1]]


In [4]:
def count_data():
    """Return the feature columns of the global ``playlists_df``.

    The features are the columns from position 10 up to, but not including,
    the last column. Presumably these hold raw per-playlist counts (going by
    the function name) -- TODO confirm against the data source.
    """
    feature_columns = playlists_df.columns[10:-1]
    return pd.DataFrame(playlists_df[feature_columns])

def bool_data():
    """Return the count data with every value capped at 1."""
    return count_data().clip(upper=1)

def _log_data():
    """Return ``log10(1 + x)`` of the count data, element-wise."""
    shifted_counts = count_data() + 1
    return np.log10(shifted_counts)

def log_data():
    """Return ``log2(1 + x)`` of the count data, rounded to whole numbers.

    The base-10 result of ``_log_data`` is converted to base 2 by dividing
    by ``log10(2)``, then rounded and cast to ``int``.
    """
    base_two = _log_data() / np.log10(2)
    return np.round(base_two).astype(int)

def freq_data():
    """Return the count data with each row divided by that row's sum."""
    counts = count_data()
    # Plain array of per-row totals, applied row by row (no label alignment).
    row_totals = counts.sum(axis=1).to_numpy()
    return counts.div(row_totals, axis="index")

def idf_data(cur_data, c=1):
    """Scale ``cur_data`` by the per-column weights from ``_idf_data(c)``.

    Parameters
    ----------
    cur_data : pandas.DataFrame or array-like
        Matrix whose columns line up with the weights from ``_idf_data``.
    c : number, default 1
        Forwarded to ``_idf_data``.

    Returns
    -------
    A new object holding ``cur_data * _idf_data(c)``. ``cur_data`` itself is
    not modified, so callers do not need to pass a defensive copy.
    """
    # Out-of-place multiply: an in-place ``*=`` would silently rewrite the
    # caller's matrix.
    return cur_data * _idf_data(c)
    
def _idf_data(c=2):
    """Compute one inverse-document-frequency style weight per column.

    For each column of ``count_data()`` the number of non-zero rows ``n`` is
    counted. With ``max_N`` / ``min_N`` the largest / smallest value of
    ``1 + n`` over all columns, the weight is::

        1 + (c - 1) / log10(max_N / min_N) * log10(max_N / (1 + n))

    so the column with the most non-zero rows gets weight 1 and the column
    with the fewest gets weight ``c``.

    Parameters
    ----------
    c : number, default 2
        Weight assigned to the column with the fewest non-zero rows.

    Returns
    -------
    numpy.ndarray
        One weight per column of ``count_data()``.
    """
    cur_data = count_data()

    # Smoothed per-column count of non-zero rows, computed once.
    doc_freq = 1 + np.count_nonzero(cur_data, axis=0)
    min_N = np.min(doc_freq)
    max_N = np.max(doc_freq)

    if max_N == min_N:
        # Every column has the same count: log10(max_N / min_N) is 0 and the
        # slope below would divide by zero. All log terms are 0 here, so
        # every weight is exactly 1.
        return np.ones(doc_freq.shape)

    slope = (c - 1) / np.log10(max_N / min_N)

    return 1 + slope * np.log10(max_N / doc_freq)
    

In [7]:
# Candidate input matrices, keyed by the name of the transform that built them.
data_dict = {
    name: build()
    for name, build in (
        ("count", count_data),
        ("bool", bool_data),
        ("freq", freq_data),
        ("log", _log_data),
    )
}

# Values of c passed to idf_data for every matrix.
c_list = [1, 2, 3, 4]

# Numbers of NMF components to fit.
cluster_counts = [40]


In [8]:
# idf_dict[data_key][c] -> matrix from data_dict weighted by idf_data(..., c).
idf_dict = defaultdict(dict)

for data_name, base_matrix in data_dict.items():
    for c_value in c_list:
        # idf_data receives a copy so the entries of data_dict stay untouched.
        weighted_matrix = idf_data(deepcopy(base_matrix), c_value)
        idf_dict[data_name][c_value] = weighted_matrix


In [9]:
# Fit one NMF model per (component count, input matrix, c) combination and
# cache the factors: cluster_dict[(cluster_count, data_key, c)] -> (W, H).
cluster_dict = {}

# Fixed seed handed to NMF's random_state so that re-running the notebook
# reproduces the same factors.
NMF_RANDOM_STATE = 0

for cluster_count in cluster_counts:
    # The labels depend only on the component count, so build them once.
    component_labels = [
        f"component_{index+1}" for index in range(cluster_count)
    ]

    for tmp_key, weighted_by_c in idf_dict.items():
        for tmp_c in c_list:
            cluster_key = (cluster_count, tmp_key, tmp_c)
            print(list(cluster_key))
            # Reuse factors that are already cached for this combination.
            if cluster_key in cluster_dict:
                continue

            nmf_model = NMF(
                n_components=cluster_count,
                random_state=NMF_RANDOM_STATE,
            )

            # W: one row per row of the input matrix, one column per component.
            W = pd.DataFrame(
                nmf_model.fit_transform(weighted_by_c[tmp_c]),
                index=default_data.index,
                columns=component_labels,
            )

            # H: one row per component, one column per input column; values
            # are passed through custom_round before being stored.
            H = pd.DataFrame(
                custom_round(nmf_model.components_),
                index=component_labels,
                columns=default_data.columns,
            )

            cluster_dict[cluster_key] = (W, H)

print("done.")


[40, 'count', 1]
[40, 'count', 2]
[40, 'count', 3]
[40, 'count', 4]
[40, 'bool', 1]
[40, 'bool', 2]
[40, 'bool', 3]
[40, 'bool', 4]
[40, 'freq', 1]
[40, 'freq', 2]
[40, 'freq', 3]
[40, 'freq', 4]
[40, 'log', 1]
[40, 'log', 2]
[40, 'log', 3]
[40, 'log', 4]
done.


In [45]:
# Scratch check: the integers 1 through 25.
list(range(1, 26))

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

In [71]:
def foo(cluster_count, c, data_key, W_scalar="1000", H_scalar="40", is_off=False):
    """Print a per-component summary of one cached NMF factorisation.

    Looks up ``(W, H) = cluster_dict[(cluster_count, data_key, c)]`` and, for
    every component, counts the playlists (rows of ``W``) and artists
    (columns of ``H``) whose loading exceeds a threshold. Components are
    printed from the most to the fewest playlists.

    Parameters
    ----------
    cluster_count : int
        Number of components of the factorisation to inspect.
    c : number
        The c value the factorisation was computed with.
    data_key : str
        Key of the input matrix the factorisation was computed from.
    W_scalar, H_scalar : str or int, default "1000" / "40"
        Divisors applied to the global maximum of ``W`` / ``H`` to obtain the
        inclusion thresholds; converted with ``int``. The defaults let the
        function be called with only the first three arguments.
    is_off : bool, default False
        When true, print ``"off."`` and return without doing any work.

    Returns
    -------
    dict or None
        Maps the printed rank (1-based) to the original 1-based component
        number; ``None`` when ``is_off`` is true.
    """
    if is_off:
        print("off.")
        return

    W, H = cluster_dict[(cluster_count, data_key, c)]

    assert W.shape[1] == cluster_count
    assert H.shape[0] == cluster_count

    # Thresholds are relative to the largest loading in the whole matrix.
    W_threshold = W.max().max() / int(W_scalar)
    H_threshold = H.max().max() / int(H_scalar)

    summaries = []

    for cur_cluster in range(cluster_count):
        W_col = W.iloc[:, cur_cluster]
        H_row = H.iloc[cur_cluster, :]

        # Descending order via plain numpy positions; iterating a
        # label-indexed Series with reversed() relies on positional integer
        # lookups, which pandas has deprecated.
        W_sorted = W_col.iloc[np.argsort(W_col.to_numpy())[::-1]]
        H_sorted = H_row.iloc[np.argsort(H_row.to_numpy())[::-1]]

        W_vec = W_sorted[W_sorted > W_threshold]
        H_vec = H_sorted[H_sorted > H_threshold]

        if len(H_vec) == 0:
            # No artist passes the threshold: fall back to the full ranking
            # and show the top 10 in parentheses.
            H_vec = H_sorted
            artist_line = "(" + ", ".join(H_vec.index[:10]) + ")"
        else:
            artist_line = ", ".join(H_vec.index[:25])

        summaries.append([len(W_vec), cur_cluster, "\n".join([
            f"({len(H_vec)} Artists, {len(W_vec)} Playlists)",
            artist_line,
            ""
        ])])

    # Most playlists first; the unique component index breaks ties, so the
    # text is never compared.
    summaries.sort(reverse=True)

    map_vector = dict()
    for rank, (_, component_index, text) in enumerate(summaries, start=1):
        map_vector[rank] = component_index + 1
        print(f"Component {rank}", text)

    return map_vector


In [21]:
# Restrict the inputs to the log-transformed matrix.
data_dict = dict(log=_log_data())

# Values of c passed to idf_data.
c_list = [1, 1.5, 2, 2.5, 3, 3.5]

# Numbers of NMF components to fit.
cluster_counts = [25, 50, 75, 100]


In [24]:
# idf_dict[data_key][c] -> matrix from data_dict weighted by idf_data(..., c).
idf_dict = defaultdict(dict)

for data_name, base_matrix in data_dict.items():
    for c_value in c_list:
        # idf_data receives a copy so the entries of data_dict stay untouched.
        weighted_matrix = idf_data(deepcopy(base_matrix), c_value)
        idf_dict[data_name][c_value] = weighted_matrix


In [25]:
# cluster_dict is deliberately NOT reset here: factors cached by an earlier
# run are kept, and only missing (cluster_count, data_key, c) keys are fitted.

# Fixed seed handed to NMF's random_state so that re-running the notebook
# reproduces the same factors.
NMF_RANDOM_STATE = 0

for cluster_count in cluster_counts:
    # The labels depend only on the component count, so build them once.
    component_labels = [
        f"component_{index+1}" for index in range(cluster_count)
    ]

    for tmp_key, weighted_by_c in idf_dict.items():
        for tmp_c in c_list:
            cluster_key = (cluster_count, tmp_key, tmp_c)
            print(list(cluster_key))
            # Reuse factors that are already cached for this combination.
            if cluster_key in cluster_dict:
                continue

            nmf_model = NMF(
                n_components=cluster_count,
                random_state=NMF_RANDOM_STATE,
            )

            # W: one row per row of the input matrix, one column per component.
            W = pd.DataFrame(
                nmf_model.fit_transform(weighted_by_c[tmp_c]),
                index=default_data.index,
                columns=component_labels,
            )

            # H: one row per component, one column per input column; values
            # are passed through custom_round before being stored.
            H = pd.DataFrame(
                custom_round(nmf_model.components_),
                index=component_labels,
                columns=default_data.columns,
            )

            cluster_dict[cluster_key] = (W, H)

print("done.")


[25, 'log', 1]
[25, 'log', 1.5]
[25, 'log', 2]
[25, 'log', 2.5]
[25, 'log', 3]
[25, 'log', 3.5]
[50, 'log', 1]
[50, 'log', 1.5]
[50, 'log', 2]
[50, 'log', 2.5]
[50, 'log', 3]
[50, 'log', 3.5]
[75, 'log', 1]
[75, 'log', 1.5]
[75, 'log', 2]
[75, 'log', 2.5]
[75, 'log', 3]
[75, 'log', 3.5]
[100, 'log', 1]
[100, 'log', 1.5]
[100, 'log', 2]
[100, 'log', 2.5]
[100, 'log', 3]
[100, 'log', 3.5]
done.


In [26]:
# Browse every fitted combination. foo also takes W_scalar and H_scalar, so
# they are supplied here (same starting values as the later cells) to keep
# this call in line with foo's signature.
interact(
    foo,
    cluster_count=widgets.ToggleButtons(options=cluster_counts),
    c=widgets.ToggleButtons(options=c_list),
    data_key=widgets.ToggleButtons(options=list(data_dict)),
    W_scalar="1000",
    H_scalar="40",
)


interactive(children=(ToggleButtons(description='cluster_count', options=(25, 50, 75, 100), value=25), ToggleB…

<function __main__.foo(cluster_count, c, data_key)>

In [79]:
# Inspect the 75-component factorisation computed with c = 2.5.
interact(
    foo,
    cluster_count=[75],
    c=[2.5],
    data_key=widgets.ToggleButtons(options=[*data_dict]),
    W_scalar="1000",
    H_scalar="40",
    is_off=False,
)


interactive(children=(Dropdown(description='cluster_count', options=(75,), value=75), Dropdown(description='c'…

<function __main__.foo(cluster_count, c, data_key, W_scalar, H_scalar, is_off=False)>

In [73]:
# Show the cached (W, H) pair for 75 components, "log" data and c = 1.5.
cluster_dict[75, "log", 1.5]

(                        component_1  component_2  component_3  component_4  \
 slug                                                                         
 7jvfS7fn7qx2HS3oMklAdw          0.0     0.000000     0.000000     0.000000   
 4IgAuc1ur86KtYZAEnD6Q3          0.0     0.000000     0.000000     0.000000   
 4LMlI2WnTGDqtKU8bTNGFS          0.0     0.000000     0.013165     0.000000   
 4g4D9ayX9v2YWVx1ciKMFh          0.0     0.000000     0.008706     0.000000   
 21iTXBzTLFliu8uQT6Espl          0.0     0.000000     0.000000     0.000000   
 ...                             ...          ...          ...          ...   
 5rLHlv0Avd3mgbNmMDOnXv          0.0     0.000000     0.000000     0.009293   
 1p5ZeMzXpkEHYuM8zy2j3e          0.0     0.000000     0.000000     0.000000   
 7DiLUwsIPRhAwEiVnCu7oU          0.0     0.060529     0.000000     0.000000   
 1eUtArGA9QdSphQLArbvF6          0.0     0.000000     0.000000     0.000000   
 2Ytz7ABT8ffnQHo5frgSpq          0.0     0.000000   

In [63]:
# NOTE(review): `b` is not assigned anywhere in the visible part of this
# notebook, so this cell raises NameError on a fresh kernel unless it is
# defined elsewhere. Judging by the saved output it presumably held an H
# (component x artist) frame -- TODO confirm and replace with an explicit
# lookup in cluster_dict.
b

Unnamed: 0,Billx,Blausch,Dr. Peacock,Lefa,Mandragora,Mind Against,PLK,Pandrezz,Running Pine,The Blaze,...,DJ Antoine,Compuphonic,John Adams,Opia,Kings of Convenience,YUNGBLUD,Manatee Commune,Cigarettes After Sex,Modjo,The Cars
component_1,0.000000,0.000000,0.000000,0.000000,0.000000,0.202200,0.00000,0.049860,0.000000,0.000000,...,0.00000,0.00000,0.079270,0.0,0.000000,0.000000,0.039630,0.000000,0.09113,0.000000
component_2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.420900,0.000000,0.000000,...,0.00000,0.00513,0.000000,0.0,0.000000,0.000000,0.000000,0.000288,0.00000,0.000000
component_3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.103300,...,0.03436,0.05617,0.000000,0.0,0.199800,0.124100,0.000000,0.012600,0.04970,0.000000
component_4,0.000000,0.000000,0.000000,0.000000,0.000000,0.031580,0.00000,0.016700,0.000000,0.023670,...,0.00000,0.00000,0.132700,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
component_5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.008292,0.000000,0.021790,...,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
component_71,0.003224,0.002884,0.006306,0.003570,0.000121,0.005887,0.00319,0.000000,0.002884,0.003977,...,0.00000,0.08353,0.507400,0.0,0.009859,0.000743,0.000000,0.000000,0.00000,0.001025
component_72,0.000673,0.000499,0.000021,0.001247,0.000000,0.002320,0.00037,0.658700,0.000499,0.022040,...,0.00000,0.00000,0.002038,0.0,0.010910,0.000000,0.000958,0.000000,0.00000,0.000000
component_73,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
component_74,0.000000,0.000000,0.000000,0.000000,0.000000,0.085360,0.00000,0.000000,0.000000,0.071390,...,0.00000,0.06215,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.001714


In [92]:
# Persist the factors of the 75-component, "log", c = 3 factorisation.
W, H = cluster_dict[75, "log", 3]

W.to_pickle("W.pkl")  # first element of the cached pair
H.to_pickle("H.pkl")  # second element of the cached pair


In [93]:
# Inspect the exported factorisation: 75 components, "log" data, c = 3.
interact(
    foo,
    cluster_count=[75],
    c=[3],
    data_key=["log"],
    W_scalar="1000",
    H_scalar="40",
    is_off=False,
)


interactive(children=(Dropdown(description='cluster_count', options=(75,), value=75), Dropdown(description='c'…

<function __main__.foo(cluster_count, c, data_key, W_scalar, H_scalar, is_off=False)>