In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial import distance

import trecs
from trecs.models import ImplicitMF, ImplicitMFLFD
from trecs.random import Generator
from trecs.metrics import MSEMeasurement, AverageFeatureScoreRange

In [2]:
mf = ImplicitMF(num_users=200, num_items=50, num_latent_factors=20)
mf.add_metrics(MSEMeasurement())
mf.startup_and_train(20)

100%|██████████| 20/20 [00:00<00:00, 569.46it/s]


In [3]:
mflfd = ImplicitMFLFD(num_users=200, num_items=50, num_latent_factors=20)
mflfd.add_metrics(MSEMeasurement())

In [4]:
mflfd.startup_and_train(20)
mflfd.run(10)

100%|██████████| 20/20 [00:00<00:00, 666.00it/s]
100%|██████████| 10/10 [00:00<00:00, 359.57it/s]


In [10]:
mflfd.rec[199]

array([28, 47, 24, 26, 17, 29, 21,  8, 35, 22])

In [11]:
mflfd.item_indices[2]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])

In [12]:
mflfd.predicted_scores[199]

PredictedScores([-0.0039357 ,  0.07485203, -0.04236126, -0.08997958,
                  0.04977737,  0.05714712, -0.02264841,  0.00065783,
                  0.26111283,  0.03112025,  0.06541505,  0.04389587,
                 -0.02299957,  0.17046344,  0.03256437,  0.05220518,
                  0.06831269,  0.26197084,  0.05722312, -0.02929727,
                  0.09663108,  0.26124397,  0.25247822, -0.02349746,
                  0.26514799,  0.05324277,  0.26333212,  0.10349831,
                  0.26795013,  0.26160695, -0.05863118,  0.16903596,
                 -0.09654743,  0.08529744,  0.06277408,  0.26046603,
                  0.10980759,  0.12831767, -0.05818659,  0.08965369,
                  0.08007001,  0.03490471,  0.1557519 , -0.05190988,
                 -0.05334527,  0.04428788, -0.05412159,  0.26565358,
                 -0.11552737,  0.0391694 ])

In [13]:
top_n_limit=50
k=10

row = np.repeat(mflfd.users.user_vector, mflfd.item_indices.shape[1])
row = row.reshape((mflfd.num_users, -1))
s_filtered = mflfd.predicted_scores[row, mflfd.item_indices]

negated_scores = -1 * s_filtered  # negate scores so indices go from highest to lowest
# break ties using a random score component
scores_tiebreak = np.zeros(
    negated_scores.shape, dtype=[("score", "f8"), ("random", "f8")]
)
scores_tiebreak["score"] = negated_scores
scores_tiebreak["random"] = mflfd.random_state.random(negated_scores.shape)
top_k = scores_tiebreak.argpartition(top_n_limit - 1, order=["score", "random"])[:, :top_n_limit]
# now we sort within the top k
row = np.repeat(mflfd.users.user_vector, top_n_limit).reshape((mflfd.num_users, -1))
# again, indices should go from highest to lowest
sort_top_k = scores_tiebreak[row, top_k].argsort(order=["score", "random"])
top_k_recs = mflfd.item_indices[row, top_k[row, sort_top_k]]

#dims are attribute, items, users
top_k_att = mflfd.items_hat[:, top_k_recs[:]].swapaxes(1,2)


In [None]:
# top_item_att = items_att_filt[:,0, :]
# top_item_idx = top_k_recs[:, 0].reshape(top_k_recs.shape[0],1)

In [None]:
# # all_recs_idx = np.empty([mflfd.users_hat.shape[0], k])
# # print(all_recs_idx.shape)
# # print(top_item_idx.shape)
# # all_recs[:, 0] = top_item_idx[:,None]
# all_recs_idx = top_k_recs[:,0].reshape(top_k_recs.shape[0],1)
# top_k_recs[0,0]

In [14]:
#store the id of the highest predicted item for each user as a start
all_recs_idx = top_k_recs[:,0].reshape(top_k_recs.shape[0],1)
for idx, user in enumerate(mflfd.users_hat):

        #get the top rec and add that as the first item for each user
        #user_item_feats = top_k_att[:,:,idx]
        
        #make a copy so as not to modify the original array
        user_item_feats = np.array(top_k_att[:,:,idx])
        
        orig_user_item_feats = np.array(user_item_feats)
        #user_item_feats_idx = [0]
        user_max_idx = top_k_recs[idx, 0] 
        recs_idxs = [user_max_idx]
        
        #hold the features of the recommended items
        recs_features = mflfd.items_hat[:,user_max_idx]
        
        distances = []
        
        for rec in range(1,k):
            
            #user_item_feats_idx.append(rec)
            
            #drop element from user_item_feats that has been put in the recommendations
            ##DONT DO THIS BECAUSE IT MAKES INDEXING MORE COMPLEX
            #mask = np.invert(np.isin(user_item_feats, recs_features))
            #user_item_feats = user_item_feats[mask].reshape(user_item_feats.shape[0], -1).shape
            
            if rec == 1:
                #for the second item, just use the first item values
                centroid = recs_features
            else:
                centroid = np.nanmean(recs_features, axis=0)

            centroid = centroid.reshape(1, -1)

            #set all the previously chosen item features to the centroid, so they will not be selected again
            #don't want to just remove rows because it will throw of the indexing
            user_item_feats[:, 0:rec+1]=centroid.T

            d = pairwise_distances(X=centroid, Y=user_item_feats.T, metric='cityblock',force_all_finite='allow_nan' )
            
            most_distant = np.argmax(d)
            
            if rec == 1 and idx == 199:
                print(recs_idxs)
                print(d)
                print (user_max_idx, most_distant, d.max())
            
            distances.append(d.max())
            
            most_distant_feats = user_item_feats.T[most_distant]
            
            #get the index of the most distant item in the top k recs
            recs_idxs.append(top_k_recs[idx, most_distant])
            #recs_idxs.append(most_distant)
            #get the item index from the original array of indices, not the constrained array
            #orig_recs_idxs.append(top_k_recs[idx, most_distant])
            #recs_preds.append(n_ratings[idx][most_distant])

            recs_features = np.vstack((recs_features, user_item_feats[:, most_distant]))


[28]
[[0.         0.         2.91723351 2.48022334 2.91139904 3.0781765
  2.1482682  3.54380254 2.34108507 3.52366134 2.96135085 2.45374892
  3.5003844  3.26835241 3.12476421 2.88737305 3.18179196 3.06699265
  3.39895907 4.05388131 3.30111248 2.92986029 3.6981555  3.07802181
  2.84052437 2.98928176 3.4381535  3.20087377 3.67225777 2.80911792
  3.38658493 3.42826583 3.33773471 3.39164808 2.90524187 3.27303615
  3.43312962 3.08570601 3.118288   3.11164422 3.19106259 3.89933909
  3.64881616 3.27486073 3.42752619 3.50400078 3.39188873 2.79702314
  3.27341293 3.08458329]]
28 19 4.053881307784937


In [15]:
#last_user_top_k = top_k_recs[-1]
#print(last_user_top_k)
recs_features = mflfd.items_hat[:,user_max_idx]
print(recs_features.shape)
print(mflfd.items_hat.shape)
print(recs_features)

(20,)
(20, 50)
[-0.04235257  0.09234527  0.12974366  0.31111835  0.30281501 -0.12389591
 -0.22377909  0.15149489 -0.25298534  0.07490045  0.02758562  0.06873254
  0.00478463 -0.19935663  0.11956094  0.1057478   0.16910094  0.01691764
  0.07907935  0.01562524]


In [16]:
user_item_feats = np.array(top_k_att[:,:,idx])
print(user_item_feats)

[[-4.23525654e-02  1.58360991e-01  1.03841234e-01 -2.80776840e-01
   4.53705217e-02  4.22068739e-02  6.72889896e-02  1.43032438e-01
   1.61501115e-01  1.02811942e-02  2.67063420e-01  9.30567310e-02
   1.61165399e-01  6.03234326e-02 -5.19057903e-03  2.12011509e-01
   5.15101120e-02 -1.35996261e-01 -2.59408470e-01 -5.32085771e-02
   5.87818963e-02  2.22274737e-01  7.22131433e-02 -5.49438045e-03
   1.49365723e-01 -1.48783142e-01  6.64591126e-02 -5.01929000e-03
   1.91541706e-01  6.70890256e-02 -2.36376089e-02  5.84611124e-02
  -4.11086579e-02  3.16090734e-02  2.59304859e-03 -1.58080796e-01
   1.86255658e-01 -3.16786886e-02 -2.47615344e-02  7.11207244e-02
   5.41692417e-02  2.11061879e-01 -8.72305289e-02  1.20053549e-01
   2.49961547e-02  1.05239102e-01 -2.71981267e-02 -1.24723096e-01
   2.79004216e-02 -1.18632561e-01]
 [ 9.23452670e-02  2.59434218e-02  2.21163844e-01  1.14993851e-01
   1.08790710e-01  1.97779264e-01  2.83594892e-01 -6.07971134e-02
   3.60893243e-01  7.18301219e-02  1.9546

In [22]:
#last_user_items_att = mflfd.items_hat[:, last_user_top_k]
#last_user_items_att.shape
#centroid = last_user_items_att[0,:].reshape(1,-1)
#print(centroid.shape)
centroid2 = recs_features
print(centroid2.shape)

(20,)


In [21]:
#print(last_user_items_att.shape)
print(user_item_feats.shape)

(20, 50)


In [27]:
top_k_recs[199,:]

array([28, 47, 24, 26, 17, 29, 21,  8, 35, 22, 13, 31, 42, 37, 36, 27, 20,
       39, 33, 40,  1, 16, 10, 34, 18,  5, 25, 15,  4, 45, 11, 49, 41, 14,
        9,  7,  0,  6, 12, 23, 19,  2, 43, 44, 46, 38, 30,  3, 32, 48])

In [None]:
d = pairwise_distances(X=centroid2, Y=user_item_feats.T, metric='cityblock',force_all_finite='allow_nan' )
print(d)
print(np.argmax(d))

In [23]:
d = pairwise_distances(X=recs_features.reshape(1,-1), Y=user_item_feats.T, metric='cityblock',force_all_finite='allow_nan' )
print(d)
print(np.argmax(d))

[[0.         3.37201358 2.91723351 2.48022334 2.91139904 3.0781765
  2.1482682  3.54380254 2.34108507 3.52366134 2.96135085 2.45374892
  3.5003844  3.26835241 3.12476421 2.88737305 3.18179196 3.06699265
  3.39895907 4.05388131 3.30111248 2.92986029 3.6981555  3.07802181
  2.84052437 2.98928176 3.4381535  3.20087377 3.67225777 2.80911792
  3.38658493 3.42826583 3.33773471 3.39164808 2.90524187 3.27303615
  3.43312962 3.08570601 3.118288   3.11164422 3.19106259 3.89933909
  3.64881616 3.27486073 3.42752619 3.50400078 3.39188873 2.79702314
  3.27341293 3.08458329]]
19


In [24]:
recs_idxs

[28, 40, 19, 10, 8, 22, 13, 43, 42, 16]

In [None]:
test=mflfd.items_hat[:,user_max_idx].reshape(1, -1)
#print(test.shape)
d = pairwise_distances(X=test, Y=orig_user_item_feats.T, metric='cityblock',force_all_finite='allow_nan' )
print(d)  

print(d.max())

In [None]:
top_k_att.shape
last_user_att = top_k_att[:,:,199]
#print(recs_idxs)
#print(distances)
last_user_att.shape
top_k_recs[199,12]

In [None]:
#distance.cityblock(mflfd.items_hat[18], mflfd.items_hat[9])

all_ds = pairwise_distances(X=last_user_att[18].reshape(1, -1), Y=last_user_att, metric='cityblock',force_all_finite='allow_nan' )
all_ds.argmax()

In [None]:
all_ds[18].max()
#top_k_recs
#user_max_idx = top_k_recs[idx, 0] 

In [None]:
np.isin(user_item_feats, recs_features)
mask = np.invert(np.isin(user_item_feats, recs_features))
#mask.shape
#result = user_item_feats[mask,...]

#drop element from user_item_feats that has been put in the recommendations
user_item_feats = user_item_feats[mask].reshape(user_item_feats.shape[0], -1).shape

In [None]:
user_item_feats.shape
#C = np.delete(C, 1, 1)  # delete second column of C
#np.where(np.isin(user_item_feats, recs_features), user_item_feats)

In [None]:
mask = np.ones(shape=user_item_feats.shape, dtype=bool)
mask[recs_idxs] = False

#mask[[0,2,4]] = False

In [None]:
def latent_factors_diversification(user_features, item_features, n_recs=10, top_n_limit=None):


    hat_ratings = np.dot(user_features, item_features.T) 

    if top_n_limit:
        #if constraining by top n, only retain the top n ratings within each user
        ind=np.argpartition(hat_ratings,-top_n_limit)[:,-top_n_limit:]
        n_ratings = np.take(hat_ratings, ind)
    else:
        #if not constraining by top n, retail all item indices for all users. 
        #If this is the case, in all_user_recs, recs_idxs should match original_recs_idxs
        ind=np.tile(np.arange(0,len(item_features)),(len(user_features),1))
        n_ratings = hat_ratings



    all_user_recs = dict()
    
    max_idx = np.argmax(n_ratings, axis=1)
    top_items=item_features[max_idx]
    
    all_recs = np.empty([user_features.shape[0],item_features.shape[1], n_recs])
    #all_recs = None
    

    for idx, user in enumerate(user_features):

        user_item_feats = item_features[ind[idx]]
        user_max_idx = np.argmax(n_ratings[idx])

        #get the top rec and add that as the first item for each user
        user_max = max_idx[idx]
        recs_features = top_items[idx]
        recs_idxs = [max_idx[idx]]
        recs_preds = [n_ratings[idx][user_max]]
        orig_recs_idxs = [ind[idx, user_max]]



        for rec in range(1,n_recs):
            if rec == 1:
                #for the second item, just use the first item values
                centroid = recs_features
            else:
                centroid = np.nanmean(recs_features, axis=0)

            centroid = centroid.reshape(1, -1)

            #set all the previously chosen item features to the centroid, so they will not be selected again
            #don't want to just remove rows because it will throw of the indexing
            user_item_feats[recs_idxs]=centroid

            d = pairwise_distances(X=centroid, Y=user_item_feats, metric='cityblock',force_all_finite='allow_nan' )
            most_distant = np.argmax(d)

            recs_idxs.append(most_distant)
            #get the item index from the original array of indices, not the constrained array
            orig_recs_idxs.append(ind[idx, most_distant])
            recs_preds.append(n_ratings[idx][most_distant])

            recs_features = np.vstack((recs_features, user_item_feats[most_distant]))

        all_recs[idx, :, :]=recs_features
            
        all_user_recs[idx]={'user_feats': user,
                        'original_recs_idx':orig_recs_idxs,
                        'recs_idx':recs_idxs,
                        'recs_features':recs_features,
                        'recs_preds':recs_preds}

        
    return all_recs, all_user_recs

In [None]:
n_ratings.shape

In [None]:
import datetime

x = datetime.datetime.now()
print(x) 