In [108]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances

import trecs
from trecs.models import ImplicitMF, ImplicitMFLFD
from trecs.random import Generator
from trecs.metrics import MSEMeasurement, AverageFeatureScoreRange

In [2]:
mf = ImplicitMF(num_users=200, num_items=50, num_latent_factors=20)
mf.add_metrics(MSEMeasurement())
mf.startup_and_train(20)

100%|██████████| 20/20 [00:00<00:00, 581.08it/s]


In [3]:
mflfd = ImplicitMFLFD(num_users=200, num_items=50, num_latent_factors=20)
mflfd.add_metrics(MSEMeasurement())

In [4]:
mflfd.startup_and_train(20)
mflfd.run(10)

100%|██████████| 20/20 [00:00<00:00, 583.15it/s]
100%|██████████| 10/10 [00:00<00:00, 281.86it/s]


In [None]:
mflfd.rec

In [7]:
mflfd.item_indices[2]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])

In [8]:
mflfd.predicted_scores

PredictedScores([[ 1.36333226e-01, -1.66519458e-02,  6.49199936e-02, ...,
                  -7.32779306e-03,  1.20038806e-02,  5.92683249e-02],
                 [ 4.96087745e-02,  1.37658062e-01,  3.86123711e-02, ...,
                  -6.36656763e-02,  3.30936052e-02,  7.93674277e-05],
                 [ 2.43081972e-01,  7.76892541e-03,  8.26208732e-02, ...,
                   2.51812451e-01,  9.79027408e-02, -6.74336103e-02],
                 ...,
                 [ 3.92426919e-03,  2.06078771e-01,  1.71795467e-01, ...,
                   5.83166856e-03,  7.47863524e-02,  2.05539495e-01],
                 [ 1.40650402e-01,  3.74743174e-02,  3.50551931e-02, ...,
                   4.57870910e-03, -2.11677300e-03, -2.29767529e-02],
                 [ 8.97558951e-03, -9.52487544e-02, -2.03734547e-02, ...,
                   4.17621589e-02,  2.46576098e-01,  2.49137211e-01]])

In [46]:
top_n_limit=25
k=10

row = np.repeat(mflfd.users.user_vector, mflfd.item_indices.shape[1])
row = row.reshape((mflfd.num_users, -1))
s_filtered = mflfd.predicted_scores[row, mflfd.item_indices]

negated_scores = -1 * s_filtered  # negate scores so indices go from highest to lowest
# break ties using a random score component
scores_tiebreak = np.zeros(
    negated_scores.shape, dtype=[("score", "f8"), ("random", "f8")]
)
scores_tiebreak["score"] = negated_scores
scores_tiebreak["random"] = mflfd.random_state.random(negated_scores.shape)
top_k = scores_tiebreak.argpartition(top_n_limit - 1, order=["score", "random"])[:, :top_n_limit]
# now we sort within the top k
row = np.repeat(mflfd.users.user_vector, top_n_limit).reshape((mflfd.num_users, -1))
# again, indices should go from highest to lowest
sort_top_k = scores_tiebreak[row, top_k].argsort(order=["score", "random"])
top_k_recs = mflfd.item_indices[row, top_k[row, sort_top_k]]

#dims are attribute, items, users
top_k_att = mflfd.items_hat[:, top_k_recs[:]].swapaxes(1,2)
# ]  # extract items such that rows go from highest scored to lowest-scored of top-k

# #hat_ratings = np.dot(user_features, item_features.T) 
# if top_n_limit:
#     #if constraining by top n, only retain the top n ratings within each user
#     ind=np.argpartition(s_filtered,-top_n_limit)[:,-top_n_limit:]
#     n_ratings = np.take(s_filtered, ind)
# else:
#     #if not constraining by top n, retail all item indices for all users. 
#     #If this is the case, in all_user_recs, recs_idxs should match original_recs_idxs
#     ind=np.tile(np.arange(0,len(mflfd.item_features)),(len(mflfd.user_features),1))
#     n_ratings = s_filtered




In [54]:
#test=s_filtered[row, top_k_recs]
top_k_recs[user, :]

(200, 25)

In [48]:
mflfd.items_hat.shape

(20, 50)

In [68]:
print(top_k_recs.shape)
#items_att_filt.swapaxes(1,2)
items_att_filt.shape

(200, 25)


(20, 10, 200)

In [72]:
top_item_att = items_att_filt[:,0, :]
top_item_idx = top_k_recs[:, 0].reshape(top_k_recs.shape[0],1)

In [73]:
print(top_item_att.shape, top_item_idx.shape)

(20, 200) (200, 1)


In [15]:
mflfd.items_hat[0][]

Items([-0.01123793,  0.00751781, -0.02554887, -0.04741556, -0.01037852,
       -0.03844633, -0.05029884,  0.0542907 ,  0.10701629,  0.02496831,
        0.04633878,  0.15997014, -0.10993948, -0.12718572,  0.09179639,
       -0.1134795 ,  0.01727046,  0.0739141 , -0.04844383, -0.18960749,
       -0.0429951 , -0.05942451,  0.12084402, -0.05053055,  0.07371387,
       -0.25574536, -0.06095287,  0.09543332, -0.16322751, -0.17180136,
        0.14373126,  0.10081025,  0.15051233, -0.02719695,  0.11989134,
        0.09782389,  0.0826308 ,  0.21286395, -0.11384116, -0.14549402,
       -0.00826814,  0.00204945,  0.00168327, -0.12757597,  0.18901525,
        0.15785879,  0.07495036, -0.09718099, -0.20607848,  0.0666113 ])

In [88]:
# all_recs_idx = np.empty([mflfd.users_hat.shape[0], k])
# print(all_recs_idx.shape)
# print(top_item_idx.shape)
# all_recs[:, 0] = top_item_idx[:,None]
all_recs_idx = top_k_recs[:,0].reshape(top_k_recs.shape[0],1)
top_k_recs[0,0]

0

In [107]:
#all_recs = np.empty([mflfd.users_hat.shape[0],mflfd.items_hat.shape[1], k])
#all_recs_idx = np.empty([mflfd.users_hat.shape[0], k])
#top_item_idx = top_k_recs[:, 0]

#store the id of the highest predicted item for each user as a start
all_recs_idx = top_k_recs[:,0].reshape(top_k_recs.shape[0],1)
for idx, user in enumerate(mflfd.users_hat):

        user_item_feats = top_k_att[:,:,idx]
        user_max_idx = top_k_recs[idx, 0]

        #get the top rec and add that as the first item for each user
        #user_max = max_idx[idx]
        recs_features = mflfd.items_hat[:,user_max_idx]
        recs_idxs = user_max_idx
        #recs_idxs = [max_idx[idx]]
        #recs_preds = [n_ratings[idx][user_max]]
        #orig_recs_idxs = [ind[idx, user_max]]
        
        for rec in range(1,k):
            if rec == 1:
                #for the second item, just use the first item values
                centroid = recs_features
            else:
                centroid = np.nanmean(recs_features, axis=0)

            centroid = centroid.reshape(1, -1)

            #set all the previously chosen item features to the centroid, so they will not be selected again
            #don't want to just remove rows because it will throw of the indexing
            user_item_feats[:, recs_idxs]=centroid

            d = pairwise_distances(X=centroid, Y=user_item_feats, metric='cityblock',force_all_finite='allow_nan' )
            most_distant = np.argmax(d)

            recs_idxs.append(most_distant)
            #get the item index from the original array of indices, not the constrained array
            #orig_recs_idxs.append(ind[idx, most_distant])
            #recs_preds.append(n_ratings[idx][most_distant])

            recs_features = np.vstack((recs_features, user_item_feats[most_distant]))


NameError: name 'pairwise_distances' is not defined

In [106]:
user_item_feats.shape

(20, 25)

In [103]:
mflfd.items_hat.shape

(20, 50)

In [26]:
def latent_factors_diversification(user_features, item_features, n_recs=10, top_n_limit=None):


    hat_ratings = np.dot(user_features, item_features.T) 

    if top_n_limit:
        #if constraining by top n, only retain the top n ratings within each user
        ind=np.argpartition(hat_ratings,-top_n_limit)[:,-top_n_limit:]
        n_ratings = np.take(hat_ratings, ind)
    else:
        #if not constraining by top n, retail all item indices for all users. 
        #If this is the case, in all_user_recs, recs_idxs should match original_recs_idxs
        ind=np.tile(np.arange(0,len(item_features)),(len(user_features),1))
        n_ratings = hat_ratings



    all_user_recs = dict()
    
    max_idx = np.argmax(n_ratings, axis=1)
    top_items=item_features[max_idx]
    
    all_recs = np.empty([user_features.shape[0],item_features.shape[1], n_recs])
    #all_recs = None
    

    for idx, user in enumerate(user_features):

        user_item_feats = item_features[ind[idx]]
        user_max_idx = np.argmax(n_ratings[idx])

        #get the top rec and add that as the first item for each user
        user_max = max_idx[idx]
        recs_features = top_items[idx]
        recs_idxs = [max_idx[idx]]
        recs_preds = [n_ratings[idx][user_max]]
        orig_recs_idxs = [ind[idx, user_max]]



        for rec in range(1,n_recs):
            if rec == 1:
                #for the second item, just use the first item values
                centroid = recs_features
            else:
                centroid = np.nanmean(recs_features, axis=0)

            centroid = centroid.reshape(1, -1)

            #set all the previously chosen item features to the centroid, so they will not be selected again
            #don't want to just remove rows because it will throw of the indexing
            user_item_feats[recs_idxs]=centroid

            d = pairwise_distances(X=centroid, Y=user_item_feats, metric='cityblock',force_all_finite='allow_nan' )
            most_distant = np.argmax(d)

            recs_idxs.append(most_distant)
            #get the item index from the original array of indices, not the constrained array
            orig_recs_idxs.append(ind[idx, most_distant])
            recs_preds.append(n_ratings[idx][most_distant])

            recs_features = np.vstack((recs_features, user_item_feats[most_distant]))

        all_recs[idx, :, :]=recs_features
            
        all_user_recs[idx]={'user_feats': user,
                        'original_recs_idx':orig_recs_idxs,
                        'recs_idx':recs_idxs,
                        'recs_features':recs_features,
                        'recs_preds':recs_preds}

        
    return all_recs, all_user_recs

array([[40,  7, 11, ..., 35, 20, 26],
       [29, 27, 41, ...,  8, 43, 22],
       [33, 31, 47, ..., 44, 23, 10],
       ...,
       [24,  1, 49, ...,  8, 26, 20],
       [38,  7, 35, ..., 20, 46, 12],
       [44,  7, 21, ...,  5, 14,  8]])

In [19]:
n_ratings.shape

(200, 10)

In [46]:
import datetime

x = datetime.datetime.now()
print(x) 

2021-02-25 08:38:47.303962
