In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial import distance
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import pdist 

from lenskit.datasets import ML100K, MovieLens
from lenskit.algorithms import Recommender, als
from lenskit import batch

import trecs
from trecs.models import ImplicitMF, ImplicitMFLFD, ContentFiltering
from trecs.random import Generator
from trecs.metrics import MSEMeasurement, AverageFeatureScoreRange, RecSimilarity, InteractionSimilarity


In [2]:
GENERATOR = np.random.default_rng(1234)

In [None]:
# basically just recommends items based on the estimates of user preferences!
# this will form the basis of our "ideal" recommender
class IdealRecommender(ContentFiltering):
    def _update_internal_state(self, interactions):
        # do not change users_hat! 
        pass
    
    def process_new_items(self, new_items):
        """
        Generate zero attributes for new items. Remember,
        this doesn't actually matter because the IdealRecommender
        uses its perfect score function, not
        """
        num_items = new_items.shape[1]
        num_attr = self.items_hat.shape[0]
        item_representation = GENERATOR.random((num_attr, num_items))
        return item_representation

# random recommender - randomly update users at every step
class RandomRecommender(ContentFiltering):
    def _update_internal_state(self, interactions):
        self.items_hat[:, :] = GENERATOR.random(self.items_hat.shape)
        self.users_hat[:, :] = GENERATOR.random(self.users_hat.shape)
        
    def process_new_items(self, new_items):
        """
        Generate random attributes for new items.
        """
        num_items = new_items.shape[1]
        num_attr = self.items_hat.shape[0]
        item_representation = GENERATOR.random((num_attr, num_items))
        return item_representation

In [23]:
model_params = {'iterations': 100}
# NUM_USERS = 500
# NUM_ITEMS= 1000
# N_FACTORS = 15

NUM_USERS = 100
NUM_ITEMS = 10000
N_FACTORS = 15
NUM_STARTUP = 20
NUM_STEPS = 105
js_pairs = [(u1_idx, u2_idx) for u1_idx in range(NUM_USERS) for u2_idx in range(NUM_USERS) if u1_idx != u2_idx] 



In [None]:
# user_representation = Generator().binomial(
#     n=1, p=.3, size=(NUM_USERS, N_FACTORS)
# )

# item_representation = Generator().binomial(
#     n=1, p=.3, size=(N_FACTORS, NUM_ITEMS)
# )
# # Initialize with custom representations
# filtering = ContentFiltering(user_representation=user_representation,
#                             item_representation=item_representation)

# filtering.add_metrics(AverageFeatureScoreRange())
# filtering.run(10)

# filtering.get_measurements()


In [24]:
mf = ImplicitMF(num_users=NUM_USERS, num_items=NUM_ITEMS, num_latent_factors=N_FACTORS, num_items_per_iter=10,
                model_params=model_params)
mf.add_metrics(MSEMeasurement())
mf.add_metrics(AverageFeatureScoreRange())
#mf.add_metrics(RecSimilarity(pairs=js_pairs))
mf.add_metrics(InteractionSimilarity(pairs=js_pairs))
mf.add_state_variable(mf.predicted_scores)
mf.add_state_variable(mf.users.actual_user_scores)
mf.startup_and_train(NUM_STARTUP)
mf.run(timesteps=NUM_STEPS, train_between_steps=True, reset_interactions=False)
#mf.run(timesteps=NUM_STEPS)

100%|██████████| 20/20 [00:01<00:00, 12.33it/s]
  "train_between_steps is set to True. Note that, at each step, this "
100%|██████████| 105/105 [06:15<00:00,  3.58s/it]


In [25]:
mflfd = ImplicitMFLFD(num_users=NUM_USERS, num_items=NUM_ITEMS, num_latent_factors=N_FACTORS, num_items_per_iter=10,
                     top_n_limit=50, model_params=model_params)
mflfd.add_metrics(MSEMeasurement())
mflfd.add_metrics(AverageFeatureScoreRange())
#mflfd.add_metrics(RecSimilarity(pairs=js_pairs))
mflfd.add_metrics(InteractionSimilarity(pairs=js_pairs))
mflfd.add_state_variable(mflfd.predicted_scores)
mflfd.add_state_variable(mflfd.users.actual_user_scores)
mflfd.startup_and_train(20)
mflfd.run(timesteps=NUM_STEPS, train_between_steps=True, reset_interactions=False)
#mflfd.run(timesteps=NUM_STEPS)

100%|██████████| 20/20 [00:01<00:00, 10.87it/s]
100%|██████████| 105/105 [06:33<00:00,  3.74s/it]


In [None]:
mflfd_metrics = pd.DataFrame(mflfd.get_measurements())
mf_metrics = pd.DataFrame(mf.get_measurements())

In [None]:
mf.actual_user_item_scores

In [None]:
#mflfd_metrics = pd.DataFrame(mflfd.get_measurements())
mflfd_sim= mflfd_metrics['interaction_similarity'].to_list()[1:]

#mf_metrics = pd.DataFrame(mf.get_measurements())
mf_sim= mf_metrics['interaction_similarity'].to_list()[1:]
# style
plt.style.use('seaborn-darkgrid')
plt.rcParams.update({'font.size': 14})

# create a color palette
palette = plt.get_cmap('Set1')

plt.plot(list(range(len(mflfd_sim))), mflfd_sim, marker='', color=palette(0), linewidth=1, alpha=0.9, label='MF-LFD')
plt.plot(list(range(len(mf_sim))), mf_sim, marker='', color=palette(1), linewidth=1, alpha=0.9, label='MF')

# Add legend
#plt.legend(loc=2, ncol=2)
plt.legend(loc=1, ncol=1)

# Add titles
plt.title("Randomly Paired Users Interaction Similarity (Repeated)", loc='center', fontsize=16, fontweight=2)
plt.xlabel("Timestep")
plt.ylabel("Jaccard Index")
plt.show()


In [None]:
# mf_mean = np.mean(mf.predicted_scores.value, axis=0)
# plt.hist(mf_mean)
# plt.show()

# lfd_mean = np.mean(mflfd.predicted_scores.value, axis=0)
# plt.hist(lfd_mean)
# plt.show()

# plt.figure(figsize=(15, 15))
# plt.scatter(np.array(mf.predicted_scores.value).flatten(), mf.actual_user_item_scores.flatten())
# plt.xlabel("Predicted Scores")
# plt.ylabel("Actual Scores")

In [None]:
mf_afsr= mf_metrics['afsr'].to_list()[1:]
mflfd_afsr= mflfd_metrics['afsr'].to_list()[1:]

# style
plt.style.use('seaborn-darkgrid')
plt.rcParams.update({'font.size': 14})

# create a color palette
palette = plt.get_cmap('Set1')

plt.plot(list(range(len(mflfd_afsr))), mflfd_afsr, marker='', color=palette(0), linewidth=1, alpha=0.9, label='MF-LFD')
plt.plot(list(range(len(mf_afsr))), mf_afsr, marker='', color=palette(1), linewidth=1, alpha=0.9, label='MF')

# Add legend
#plt.legend(loc=2, ncol=2)
plt.legend(loc=1, ncol=1)

# Add titles
plt.title("AFSR for MF vs. MF-LFD with Repeated Training", loc='center', fontsize=16, fontweight=2)
plt.xlabel("Timestep")
plt.ylabel("AFSR")
plt.show()



In [None]:
mlsmall = MovieLens('../../MovieLens/data/ml-latest-small')
ratings=mlsmall.ratings

#make some fake interactions based on the ratings data
ratings["interaction"]=np.where(ratings["rating"]>=4, 1, 0)
ratings=ratings[ratings["interaction"]==1]
ratings = ratings[['user', 'item']]

algo_als = als.ImplicitMF(10, iterations=100)
algo_als.fit(ratings)
preds_als = batch.predict(algo_als, mlsmall.ratings)
#preds_als = pd.merge(preds_als, mlsmall.ratings, on=['user', 'item'])

preds_als.head(20)

In [None]:
#preds_als.head(30)

plt.figure(figsize=(5, 5))
plt.scatter(preds_als["prediction"], preds_als["rating"], s=1)
plt.xlabel("Predicted Interaction Scores")
plt.ylabel("Actual Rating Scores")

In [None]:
als_item_features = pd.DataFrame(als.item_features_)
mflfd_item_features = pd.DataFrame(mflfd.items_hat.T)
#mflfd_item_features.head()

mf_item_features = pd.DataFrame(mf.items_hat.T)
#mf_item_features.head()

In [None]:
def plot_hist_features(features_df, model_type, color='blue'):
    font = {'family' : 'normal',
            'weight' : 'bold',
            'size'   : 12}

    plt.rc('font', **font)

    n_features = list(range(0,10))
    fig, axs = plt.subplots(math.ceil(len(n_features)/3), 3, figsize=(20,20))
    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
                        wspace=0.35)
    fig.suptitle('Latent Factors for {}'.format(model_type), size=20)

    for idx, n_feature in enumerate(n_features):
        r=idx //3
        c=idx % 3

        #hat = features_df[n_feature].tolist()
        features = features_df[n_feature].tolist()
        axs[r, c].set_title('Factor {}'.format(n_feature))

        #axs[r,c].plot(hat, actual, 'o', color=color);
        axs[r,c].hist(features, color=color)

    #for ax in axs.flat:
    #    ax.set(xlabel='hat representation', ylabel='actual representation')

    # Hide x labels and tick labels for top plots and y ticks for right plots.
    #for ax in axs.flat:
    #    ax.label_outer()

    fig.delaxes(axs[3][1])
    fig.delaxes(axs[3][2])

In [None]:
plot_hist_features(als_item_features, 'MovieLens MF')

In [None]:
plot_hist_features(mflfd_item_features, 'MF-LFD', 'red')

In [None]:
plot_hist_features(mf_item_features, 'MF', 'green')

In [6]:
mflfd_d = np.triu(pairwise_distances(mf.items_hat.value.T, metric="cosine"))

#mflfd_d[mflfd_d ==0] = np.nan
#arr[arr == 0] = 'nan' # or use np.nan
#mf_d = pairwise_distances(mflfd.items_hat, metric='correlation', force_all_finite="allow_nan")
#ml_d = pairwise_distances(als.item_features_.T, metric='correlation', force_all_finite="allow_nan")

print(mflfd_d.shape)
print(np.nanmean(mflfd_d))

(10000, 10000)
0.499294374869577


In [None]:


#Y = pdist(mflfd.items_hat, 'cosine')

In [None]:
np.max(mflfd_d)

In [None]:
mflfd

In [None]:
upper = np.triu(mflfd_d)

In [None]:
recommended_item_attr.shape

In [14]:
recommended_item_attr = mflfd.items_hat.value[:, mflfd.rec]
#mflfd_d = pairwise_distances(recommended_item_attr, metric="euclidean")
#mflfd_d.shape

In [19]:
recommended_item_attr.shape

test=np.apply_over_axes(np.mean, recommended_item_attr, [0,2])


In [27]:
mflfd.predicted_scores.value.shape

(100, 10000)

In [72]:
#This works
rec_mask = np.zeros(mflfd.predicted_scores.value.shape).astype(bool)
np.put_along_axis(rec_mask, np.array(mflfd.rec), True)

#predicted_recs = mflfd.predicted_scores.value[rec_mask].copy()
mx = ma.masked_array(mflfd.predicted_scores.value, mask=rec_mask, fill_value=np.nan)
mx.shape

TypeError: _put_along_axis_dispatcher() missing 1 required positional argument: 'axis'

In [75]:
# print(np.array(mflfd.rec)[0])
# #np.where(np.any(rec_mask==True, axis=0))
# rec_mask[0][321]

#predicted_recs.reshape(NUM_USERS, -1)
#import numpy.ma as ma
#t=mx[mx.mask]
#t.shape
rec_mask = np.zeros(mflfd.predicted_scores.value.shape).astype(bool)
#t=rec_mask[np.array(mflfd.rec)]==True
#np.count_nonzero(rec_mask)
#np.array(mflfd.rec).shape
#t.shape
#np.choose(mflfd.predicted_scores.value, )
#https://stackoverflow.com/questions/9885529/indexing-numpy-2d-array-with-another-2d-array/45483577
t=mflfd.predicted_scores.value[np.expand_dims(np.arange(mflfd.predicted_scores.value.shape[0]), -1), np.array(mflfd.rec)]
t.shape

(100, 10)

In [68]:
t.shape

(100, 100, 10)

In [26]:
recommended_item_predictions = mflfd.predicted_scores.value[:, mflfd.rec]
recommended_item_predictions.shape

(100, 100, 10)

In [None]:
# row = np.repeat(self.users.user_vector, item_indices.shape[1])
# row = row.reshape((self.num_users, -1))
# s_filtered = self.predicted_scores.value[row, item_indices]
# rec_scores = mflfd.predicted_scores.value[:, mflfd.rec]
# rec_scores

#top_k_att = self.items_hat.value[:, top_k_recs[:]]
rec_array = np.array(mflfd.rec)
rec_scores = mflfd.predicted_scores.value[:, rec_array]


In [None]:
print(mflfd.predicted_scores.value.shape)
print(rec_array.shape)
print(rec_scores.shape)

In [None]:
rec_scores[0,0,:]

In [None]:
rec_array[0]

In [None]:
mflfd.predicted_scores.value[0,317]

In [None]:
row=np.repeat(100, rec_array.shape[0])
#row = row.reshape((mflfd.num_users, -1))
row

In [None]:
x[:,np.arange(idx.shape[0])[:,None],idx]

mflfd.predicted_scores.value[:]

In [None]:
class MeanRecDistance(Measurement):
    """
    Measures the average range (across users) of item attributes for items
    users were recommended at a time step.

    TODO Description

    This class inherits from :class:`.Measurement`.

    Parameters
    -----------

        verbose: bool (optional, default: False)
            If True, enables verbose mode. Disabled by default.

    Attributes
    -----------
        Inherited by Measurement: :class:`.Measurement`

        name: str (optional, default: "afsr")
            Name of the measurement component.
    """

    def __init__(self, name="mean_rec_distance", verbose=False):
        Measurement.__init__(self, name, verbose, init_value=None)

    def measure(self, recommender, **kwargs):
        """
        Measures the mean of the distance between all pairwise distances between recommendations

        Parameters
        ------------
            recommender: :class:`~models.recommender.BaseRecommender`
                Model that inherits from
                :class:`~models.recommender.BaseRecommender`.

            **kwargs
                Keyword arguments, one of which must be `items_shown`, a |U| x
                num_items_per_iter matrix that contains the indices of every
                item shown to every user at a particular timestep.
        """
        items_shown = kwargs.pop("items_shown", None)
        #print("interactions {}".format(interactions))

        #assert interactions.size == recommender.num_users
        recommended_item_attr = recommender.items_hat[:, items_shown]
        #print("interacted_item_att shape {}".format(interacted_item_attr.shape))

        if {item for item in recommended_item_attr.flatten()} == {0, 1}:
            raise ValueError("Mean recommendation distance is not intended for binary features.")

        afsr = np.mean(recommended_item_attr.max(axis=(0, 2)) - recommended_item_attr.min(axis=(0, 2)))

        self.observe(afsr)


In [None]:
# mf_scores = pd.DataFrame(mf.predicted_scores.value)
# cols = [f"item_{c}" for c in mf_scores.columns]
# mf_scores.columns = cols
# #mf_scores.head()
# mf_scores["user"] = list(range(mf_scores.shape[0]))


# # t=pd.wide_to_long(mf_scores, "item_", i="user", j="item").reset_index()
# # t.columns = ["user", "item", "predicted_score"]

In [None]:
# final_step_len = len(str(NUM_STEPS + NUM_STARTUP))

# for t in range(NUM_STEPS + NUM_STARTUP):

#     X=mflfd.predicted_scores.state_history[t].flatten()
#     Y=mflfd.users.actual_user_scores.state_history[t].flatten()

#     # plt.scatter(X, Y)
#     # plt.figure(figsize=(15, 15))

#     fig, ax = plt.subplots(figsize=(10,10))
#     fig.set_tight_layout(True)
#     ax.set(xlim=(-5, 5), ylim=(-5, 5))
#     ax.set_xlabel("Predicted Score")
#     ax.set_ylabel("Actual Score")
#     ax.scatter(X,Y, s=1, alpha=.7)
    
#     step_len=len(str(t))
    
#     num_leading_zeros = final_step_len-step_len
    
#     ext=str('0'*num_leading_zeros) + str(t)
    
    
    
    
    
#     # Save it & close the figure
#     filename='/Users/amywinecoff/Documents/CITP/Research/Github/scatters/mflfd/scatter_step'+ext+'.png'
#     plt.savefig(fname=filename, dpi=96)
#     plt.gca()
#     plt.close(fig)

# import os
# os.chdir('/Users/amywinecoff/Documents/CITP/Research/Github/scatters/mflfd')


In [None]:
#!convert -delay 20 scatter*.png animated_scatter_mflfd.gif
