In [55]:
# Baseline dataset: one (fan, band) tuple per "like".
likes_1 = [
    ('dylan', 'coldplay'),
    ('dylan', 'imagine dragons'),
    ('dylan', 'u2'),
    ('nate',  'coldplay'),
    ('nate',  'imagine dragons'),
    ('nate',  'u2'),
    ('kris',  'aerosmith'),
    ('kris',  'guns&roses'),
    ('kris',  'pearljam'),
    ('sonny', 'aerosmith'),
    ('sonny', 'coldplay'),
    ('sonny', 'guns&roses'),
    ('sonny', 'imagine dragons'),
    ('sonny', 'pearljam'),
    ('sonny', 'u2')
]

# Work on a copy rather than an alias: an in-place extension of `likes`
# (e.g. `likes += [...]`) must not silently modify the baseline `likes_1`.
likes = list(likes_1)

In [47]:
# Turn the data into a DataFrame table

import numpy as np
import pandas as pd

# One row per (fan, band) pair.
music_csv = pd.DataFrame(
    data=likes,
    columns=['FAN', 'BAND'],
)
print(music_csv)

      FAN             BAND
0   dylan         coldplay
1   dylan  imagine dragons
2   dylan               u2
3    nate         coldplay
4    nate  imagine dragons
5    nate               u2
6    kris        aerosmith
7    kris       guns&roses
8    kris         pearljam
9   sonny        aerosmith
10  sonny         coldplay
11  sonny       guns&roses
12  sonny  imagine dragons
13  sonny         pearljam
14  sonny               u2
15  terry        aerosmith
16  terry       guns&roses
17  terry  imagine dragons
18  terry         pearljam
19  vince         coldplay
20  vince               u2


In [48]:
# My NNMF library assumes 'SUBJECT' and 'OBJECT' column names.

import nnmf

# Map the domain-specific column names onto the names the nnmf library assumes.
column_renames = {
    'FAN': nnmf.SUBJECT_COLUMN_NAME,
    'BAND': nnmf.OBJECT_COLUMN_NAME,
}
music_dft = music_csv.rename(columns=column_renames)

print(music_dft)

   SUBJECT           OBJECT
0    dylan         coldplay
1    dylan  imagine dragons
2    dylan               u2
3     nate         coldplay
4     nate  imagine dragons
5     nate               u2
6     kris        aerosmith
7     kris       guns&roses
8     kris         pearljam
9    sonny        aerosmith
10   sonny         coldplay
11   sonny       guns&roses
12   sonny  imagine dragons
13   sonny         pearljam
14   sonny               u2
15   terry        aerosmith
16   terry       guns&roses
17   terry  imagine dragons
18   terry         pearljam
19   vince         coldplay
20   vince               u2


In [49]:
# Now convert the association table to a ratings (association) matrix.
# Judging by the printed result, rows are subjects (fans), columns are objects
# (bands), and a cell is 1.0 where the pair appears in the table and 0.0
# otherwise -- TODO confirm against nnmf.ratings_table_to_matrix.

music_dfm = nnmf.ratings_table_to_matrix(music_dft)
print(music_dfm)

       coldplay  imagine dragons   u2  aerosmith  guns&roses  pearljam
dylan       1.0              1.0  1.0        0.0         0.0       0.0
nate        1.0              1.0  1.0        0.0         0.0       0.0
kris        0.0              0.0  0.0        1.0         1.0       1.0
sonny       1.0              1.0  1.0        1.0         1.0       1.0
terry       0.0              1.0  0.0        1.0         1.0       1.0
vince       1.0              0.0  1.0        0.0         0.0       0.0


In [50]:
from sklearn.decomposition import NMF

# Number of NMF components to factor the ratings matrix into.
clusters = 2

# V is the raw ratings matrix; NMF looks for non-negative W and H
# such that W.H approximates V.
V = music_dfm.values

# Random initialisation with a fixed seed, so reruns give the same factors.
model = NMF(
    n_components=clusters,
    init='random',
    random_state=0,
)

W = model.fit_transform(V)    # 'features' matrix
H = model.components_         # 'coefficients' matrix

In [51]:
# The input ratings matrix, rounded for display.
print(V.round(decimals=2))

[[1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1.]
 [0. 1. 0. 1. 1. 1.]
 [1. 0. 1. 0. 0. 0.]]


In [52]:
# The 'features' factor W, rounded for display.
print(W.round(decimals=2))


[[0.03 0.85]
 [0.03 0.85]
 [0.76 0.  ]
 [0.8  0.76]
 [0.86 0.12]
 [0.   0.63]]


In [53]:
# The 'coefficients' factor H, rounded for display.
print(H.round(decimals=2))

[[0.   0.53 0.   1.24 1.24 1.24]
 [1.27 0.89 1.27 0.   0.   0.  ]]


In [54]:
# Show that the product W.H approximates V (compare with the printout of V).

print((W @ H).round(decimals=2))

[[1.08 0.78 1.08 0.03 0.03 0.03]
 [1.08 0.78 1.08 0.03 0.03 0.03]
 [0.   0.4  0.   0.94 0.94 0.94]
 [0.96 1.1  0.96 0.99 0.99 0.99]
 [0.15 0.56 0.15 1.06 1.06 1.06]
 [0.8  0.57 0.8  0.   0.   0.  ]]


In [41]:
# One label per NMF component, e.g. 'C_0', 'C_1'.
cluster_names = [f'C_{k}' for k in range(clusters)]
print(cluster_names)

['C_0', 'C_1']


In [42]:
# Label W: rows take the ratings matrix's index, columns are the cluster labels.
fan_clustering = pd.DataFrame(
    data=W,
    index=music_dfm.index,
    columns=cluster_names,
)
print(fan_clustering)

           C_0       C_1
dylan  0.70645  0.000000
nate   0.70645  0.000000
kris   0.00000  0.673672
sonny  0.70645  0.673645


In [43]:
# Notice that H is oriented the other way: clusters are the rows here and the
# ratings matrix's columns are the columns.

print(
    pd.DataFrame(
        data=H,
        index=cluster_names,
        columns=music_dfm.columns,
    )
)

     coldplay  imagine dragons        u2  aerosmith  guns&roses  pearljam
C_0  1.415528         1.415528  1.415528   0.000019    0.000019  0.000019
C_1  0.000000         0.000000  0.000000   1.484423    1.484423  1.484423


In [44]:
# So transpose it, giving one row per ratings-matrix column and one column per
# cluster (the same orientation as fan_clustering).

band_clustering = pd.DataFrame(
    data=H,
    index=cluster_names,
    columns=music_dfm.columns,
).transpose()
print(band_clustering)

                      C_0       C_1
coldplay         1.415528  0.000000
imagine dragons  1.415528  0.000000
u2               1.415528  0.000000
aerosmith        0.000019  1.484423
guns&roses       0.000019  1.484423
pearljam         0.000019  1.484423


In [45]:
# Additional fans, kept as separate named lists so each addition is visible.
likes_terry = [
    ('terry', 'aerosmith'),
    ('terry', 'guns&roses'),
    ('terry', 'imagine dragons'),
    ('terry', 'pearljam')
]

likes_vince = [
    ('vince', 'coldplay'),
    ('vince', 'u2')
]

# Build the extended dataset from the baseline instead of using `likes += ...`:
#   * `likes` may be the very same list object as `likes_1`, so an in-place
#     `+=` would also change the baseline;
#   * an in-place append adds the same rows again every time the cell is re-run.
# Rebinding `likes` to a freshly built list makes this cell idempotent.
# NOTE: tables already built from `likes` in earlier cells (e.g. `music_csv`)
# are not refreshed by this assignment; re-run those cells afterwards to
# include terry and vince.
likes = likes_1 + likes_terry + likes_vince