In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial import distance
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import pdist 
import os

# from lenskit.datasets import ML100K, MovieLens
# from lenskit.algorithms import Recommender, als
# from lenskit import batch, topn


import trecs
from trecs.models import ImplicitMF, ImplicitMFLFD, ContentFiltering
from trecs.random import Generator
from trecs.metrics import MSEMeasurement, AverageFeatureScoreRange, RecSimilarity, InteractionSimilarity, Measurement
from trecs.components import Users
import trecs.matrix_ops as mo


In [2]:
# basically just recommends items based on the estimates of user preferences!
# this will form the basis of our "ideal" recommender
class IdealRecommender(ContentFiltering):
    def _update_internal_state(self, interactions):
        # do not change users_hat! 
        pass
    
    def process_new_items(self, new_items):
        """
        Generate zero attributes for new items. Remember,
        this doesn't actually matter because the IdealRecommender
        uses its perfect score function, not
        """
        num_items = new_items.shape[1]
        num_attr = self.items_hat.shape[0]
        item_representation = GENERATOR.random((num_attr, num_items))
        return item_representation

# random recommender - randomly update users at every step
class RandomRecommender(ContentFiltering):
    def _update_internal_state(self, interactions):
        self.items_hat[:, :] = GENERATOR.random(self.items_hat.shape)
        self.users_hat[:, :] = GENERATOR.random(self.users_hat.shape)
        
    def process_new_items(self, new_items):
        """
        Generate random attributes for new items.
        """
        num_items = new_items.shape[1]
        num_attr = self.items_hat.shape[0]
        item_representation = GENERATOR.random((num_attr, num_items))
        return item_representation

In [3]:
class ShownMedianPrediction(Measurement):
    """
    Measures the median predicted value of recommended items.

    TODO Description

    This class inherits from :class:`.Measurement`.

    Parameters
    -----------

        verbose: bool (optional, default: False)
            If True, enables verbose mode. Disabled by default.

    Attributes
    -----------
        Inherited by Measurement: :class:`.Measurement`

        name: str (optional, default: "afsr")
            Name of the measurement component.
    """

    def __init__(self, name="shown_median_prediction", verbose=False):
        Measurement.__init__(self, name, verbose, init_value=None)

    def measure(self, recommender, **kwargs):
        """
        Measures the mean of the distance between all pairwise distances between recommendations

        Parameters
        ------------
            recommender: :class:`~models.recommender.BaseRecommender`
                Model that inherits from
                :class:`~models.recommender.BaseRecommender`.

            **kwargs
                Keyword arguments, one of which must be `items_shown`, a |U| x
                num_items_per_iter matrix that contains the indices of every
                item shown to every user at a particular timestep.
        """
        items_shown = kwargs.pop("items_shown", None)
        predicted_shown_vals=recommender.predicted_scores.value[np.expand_dims(np.arange(recommender.predicted_scores.value.shape[0]), -1), 
                                                items_shown]

        shown_median=np.median(predicted_shown_vals)

        self.observe(shown_median)


In [4]:
class InteractedMedianPrediction(Measurement):
    """
    Measures the median predicted value of recommended items.

    TODO Description

    This class inherits from :class:`.Measurement`.

    Parameters
    -----------

        verbose: bool (optional, default: False)
            If True, enables verbose mode. Disabled by default.

    Attributes
    -----------
        Inherited by Measurement: :class:`.Measurement`

        name: str (optional, default: "afsr")
            Name of the measurement component.
    """

    def __init__(self, name="interacted_median_prediction", verbose=False):
        Measurement.__init__(self, name, verbose, init_value=None)

    def measure(self, recommender, **kwargs):
        """
        Measures the mean of the distance between all pairwise distances between recommendations

        Parameters
        ------------
            recommender: :class:`~models.recommender.BaseRecommender`
                Model that inherits from
                :class:`~models.recommender.BaseRecommender`.

            **kwargs
                Keyword arguments, one of which must be `items_shown`, a |U| x
                num_items_per_iter matrix that contains the indices of every
                item shown to every user at a particular timestep.
        """
        interactions = kwargs.pop("interactions", None)
        
        if interactions is None:
            raise ValueError(
                "interactions must be passed in to InteractedMedianPrediction `measure` "
                "method as a keyword argument"
            )
        predicted_int_vals=recommender.predicted_scores.value[np.expand_dims(np.arange(recommender.predicted_scores.value.shape[0]), -1), 
                                                interactions]

        int_median=np.median(predicted_int_vals)

        self.observe(int_median)


In [5]:
class MeanRecDistance(Measurement):
    """
    Cacluates the mean distance between items in each users' recommendation list based on their item attributes
    This class inherits from :class:`.Measurement`.
    Parameters
    -----------
        verbose: bool (optional, default: False)
            If True, enables verbose mode. Disabled by default.
    Attributes
    -----------
        Inherited by Measurement: :class:`.Measurement`
        name: str (optional, default: "mean_rec_distance")
            Name of the measurement component.
    """

    def __init__(self, name="mean_rec_distance", verbose=False, distance_metric="cosine"):
        Measurement.__init__(self, name, verbose, init_value=None)
        self.distance_metric=distance_metric

    def measure(self, recommender, **kwargs):
        """
        TODO
        Parameters
        ------------
            recommender: :class:`~models.recommender.BaseRecommender`
                Model that inherits from
                :class:`~models.recommender.BaseRecommender`.
            **kwargs
                Keyword arguments, one of which must be `items_shown`, a |U| x
                num_items_per_iter matrix that contains the indices of every
                item shown to every user at a particular timestep.
        """
        items_shown = kwargs.pop("items_shown", None)

        recommended_item_attr = recommender.items_hat.value[:, items_shown]
        
        user_distances = []
        
        for userid in range(recommended_item_attr.shape[1]):
            user_rec_attr=recommended_item_attr[:,userid,:]

            #take the upper triangle to reduce duplicates
            upper = np.triu(pairwise_distances(user_rec_attr.T, metric=self.distance_metric))
            #replace 0s with nans so the distance to self doesn't get included
            upper = np.where(upper==0, np.nan, upper)

            mean_rec_distance = np.nanmean(upper)
            user_distances.append(mean_rec_distance)
        
        mean_rec_distance=np.mean(np.array(user_distances))
        
        self.observe(mean_rec_distance)


In [6]:
model_params = {'iterations': 100}
# NUM_USERS = 500
# NUM_ITEMS= 1000
# N_FACTORS = 15

NUM_USERS = 100
NUM_ITEMS = 1250
N_FACTORS = 10
NUM_STARTUP = 20
NUM_STEPS = 105
js_pairs = [(u1_idx, u2_idx) for u1_idx in range(NUM_USERS) for u2_idx in range(NUM_USERS) if u1_idx != u2_idx] 

# user_representation = Generator().binomial(
#     n=1, p=.3, size=(NUM_USERS, N_FACTORS)
# )

# item_representation = Generator().binomial(
#     n=1, p=.3, size=(N_FACTORS, NUM_ITEMS)
# )
SCORE_FN=mo.inner_product
SCORE_FN.__defaults__ = (False, False)

GEN = np.random.default_rng(1234)
ACTUAL_USER_PROFILES=GEN.normal(size=(NUM_USERS, N_FACTORS))
ACTUAL_ITEM_ATTRIBUTES=GEN.normal(size=(N_FACTORS, NUM_ITEMS))

USERS = Users(repeat_interactions=False, actual_user_profiles=ACTUAL_USER_PROFILES, 
              score_fn=SCORE_FN)

# user_rep = GENERATOR.normal(size=(NUM_USERS, N_FACTORS))
# #USERS = Users(size=(NUM_USERS, N_FACTORS), repeat_interactions=False)
# u = Users(actual_user_scores = user_rep, size=(NUM_USERS, N_FACTORS), num_users=NUM_USERS, repeat_interactions=False)



In [7]:
#mf = ImplicitMF(num_users=NUM_USERS, num_items=NUM_ITEMS, num_latent_factors=N_FACTORS, num_items_per_iter=10,
#                model_params=model_params)

mf = ImplicitMF(actual_user_representation=Users(repeat_interactions=False, size=(NUM_USERS, N_FACTORS)), 
                num_items=NUM_ITEMS, num_latent_factors=N_FACTORS, num_items_per_iter=10,
                model_params=model_params)

#print(mf.num_items)
#
mf.add_metrics(MSEMeasurement())
mf.add_metrics(AverageFeatureScoreRange())
#mf.add_metrics(RecSimilarity(pairs=js_pairs))
mf.add_metrics(InteractionSimilarity(pairs=js_pairs))
mf.add_metrics(ShownMedianPrediction())
mf.add_metrics(InteractedMedianPrediction())
mf.add_metrics(MeanRecDistance())

mf.add_state_variable(mf.predicted_scores)
mf.add_state_variable(mf.users.actual_user_scores)
mf.startup_and_train(NUM_STARTUP)
#mf.users.repeat_interactions=False
mf.run(timesteps=NUM_STEPS, train_between_steps=True, reset_interactions=False)
#mf.run(timesteps=NUM_STEPS)

100%|██████████| 20/20 [00:01<00:00, 11.75it/s]
  "train_between_steps is set to True. Note that, at each step, this "
100%|██████████| 105/105 [01:37<00:00,  1.08it/s]


In [None]:
mflfd = ImplicitMFLFD(actual_user_representation=Users(repeat_interactions=False, size=(NUM_USERS, N_FACTORS)), 
                      num_items=NUM_ITEMS, num_latent_factors=N_FACTORS, num_items_per_iter=10,
                     top_n_limit=50, model_params=model_params)

mflfd.add_metrics(MSEMeasurement())
mflfd.add_metrics(AverageFeatureScoreRange())
#mflfd.add_metrics(RecSimilarity(pairs=js_pairs))
mflfd.add_metrics(InteractionSimilarity(pairs=js_pairs))
mflfd.add_metrics(ShownMedianPrediction())
mflfd.add_metrics(InteractedMedianPrediction())
mflfd.add_metrics(MeanRecDistance())

mflfd.add_state_variable(mflfd.predicted_scores)
mflfd.add_state_variable(mflfd.users.actual_user_scores)
mflfd.startup_and_train(20)
mflfd.run(timesteps=NUM_STEPS, train_between_steps=True, reset_interactions=False)
#mflfd.run(timesteps=NUM_STEPS)

In [None]:
mf.predicted_scores.value

In [None]:
# rla = topn.RecListAnalysis()
# rla.add_metric(topn.ndcg)
# #results = rla.compute(all_recs, test_data)
# #results.head()
# mf.actual_user_item_scores



In [None]:
mf.users.actual_user_scores.value[3,2]
print(mf.users.actual_user_scores.shape)
print(mf.users.actual_user_scores.shape)

In [None]:
# mf.users.actual_user_scores.shape[1]
#print(mf.users.actual_user_profiles.value[3,:])
#print(mf.actual_item_attributes[:,2])
m=np.dot(mf.users.actual_user_profiles.value, mf.actual_item_attributes)
pd.DataFrame(m).head(10)

In [None]:
pd.DataFrame(mf.users.actual_user_scores.value)

In [None]:
#col_names = [str(c) for c in mf.users.actual_user_scores.shape[1]]
cols = [f"item_{c}" for c in range(mf.users.actual_user_scores.shape[1])]
actual_scores_df=pd.DataFrame(mf.users.actual_user_scores.value)
actual_scores_df['user']=actual_scores_df.index
actual_scores_df.columns = cols + ['user'] 
#pd.wide_to_long(actual_scores_df, "item_", i="user", j="item").reset_index()

actual_scores_df.head(10)
#print(cols)

In [None]:
# actual_scores_df.head()

In [None]:
# t=pd.wide_to_long(act, "item_", i="user", j="item").reset_index()

In [8]:
#mflfd_metrics = pd.DataFrame(mflfd.get_measurements())
mf_metrics = pd.DataFrame(mf.get_measurements())

In [9]:
mf_metrics.head(30)

Unnamed: 0,mse,afsr,interaction_similarity,shown_median_prediction,interacted_median_prediction,mean_rec_distance,timesteps
0,,,,,,,0
1,4.442263,5.01272409123372,0.0004040404040404,-0.1604638128053045,-0.0341535903375668,0.9981284687120668,1
2,4.442263,4.990999958447106,0.001010101010101,-0.0202137942933158,-0.0445877458372752,0.9980998467510844,2
3,4.442263,4.932088060646126,0.0016969696969696,-0.0409491499570284,-0.0532853716694467,1.0029883945349802,3
4,4.442263,4.973591236463912,0.0025108225108225,-0.0997356739595602,-0.0044019979864853,1.0019377125412448,4
5,4.442263,5.116092086373681,0.0029461279461279,-0.0356780277363455,0.0434181531788702,1.000649362669967,5
6,4.442263,5.14090156360945,0.0032764003673094,0.0030289667849824,-0.0441578116508559,1.004926530980484,6
7,4.442263,5.049166658515938,0.0037847001483364,0.22355180468809,0.0224577119755039,1.0011002987098567,7
8,4.442263,4.9574216702081175,0.0042717282717282,0.0942826543487526,0.043932352263168,1.0071815911720394,8
9,4.442263,4.9449994621583775,0.0045799168152109,0.0237546765129844,-0.0297963288181248,0.995903941258896,9


In [None]:
def plot_metric(df1, df2, model1_lab, model2_lab, metric_var, ylab, title):
    #mflfd_metrics = pd.DataFrame(mflfd.get_measurements())
    metric1= df1[metric_var].to_list()[1:]

    #mf_metrics = pd.DataFrame(mf.get_measurements())
    metric2= df2[metric_var].to_list()[1:]
    # style
    plt.style.use('seaborn-darkgrid')
    plt.rcParams.update({'font.size': 14})

    # create a color palette
    palette = plt.get_cmap('Set1')

    plt.plot(list(range(len(metric1))), metric1, marker='', color=palette(0), linewidth=1, alpha=0.9, label=model1_lab)
    plt.plot(list(range(len(metric2))), metric2, color=palette(1), linewidth=1, alpha=0.9, label=model2_lab)

    # Add legend
    #plt.legend(loc=2, ncol=2)
    plt.legend(loc=1, ncol=1)

    # Add titles
    plt.title(title, loc='center', fontsize=16, fontweight=2)
    plt.xlabel("Timestep")
    plt.ylabel(ylab)
    plt.show()


In [None]:
plot_metric(df1=mflfd_metrics, df2=mf_metrics, model1_lab='MF-LFD', model2_lab='MF-vanilla', 
            metric_var='interaction_similarity', ylab="Jaccard Index", 
            title="Randomly Paired Users Interaction Similarity (Repeated)")

In [None]:
plot_metric(df1=mflfd_metrics, df2=mf_metrics, model1_lab='MF-LFD', model2_lab='MF-vanilla', 
            metric_var='afsr', ylab="AFSR of Recs", 
            title="AFSR of Recommendations (Repeated Training)")

In [None]:
plot_metric(df1=mflfd_metrics, df2=mf_metrics, model1_lab='MF-LFD', model2_lab='MF-vanilla', 
            metric_var='shown_median_prediction', ylab="Median Predicted Value", 
            title="Median Predicted Value of Recommendations (Repeated Training)")

In [None]:
plot_metric(df1=mflfd_metrics, df2=mf_metrics, model1_lab='MF-LFD', model2_lab='MF-vanilla', 
            metric_var='interacted_median_prediction', ylab="Median Predicted Value", 
            title="Median Predicted Value of Chosen Items (Repeated Training)")

In [None]:
plot_metric(df1=mflfd_metrics, df2=mf_metrics, model1_lab='MF-LFD', model2_lab='MF-vanilla', 
            metric_var='mean_rec_distance', ylab="Mean Cosine Distance", 
            title="Mean Distance b/w Recommended Items (Repeated Training)")

In [None]:
mf.predicted_scores.value[0].shape

In [None]:
# mf_mean = np.mean(mf.predicted_scores.value, axis=0)
# plt.hist(mf_mean)
# plt.show()

# lfd_mean = np.mean(mflfd.predicted_scores.value, axis=0)
# plt.hist(lfd_mean)
# plt.show()

# plt.figure(figsize=(15, 15))
# plt.scatter(np.array(mf.predicted_scores.value).flatten(), mf.actual_user_item_scores.flatten())
# plt.xlabel("Predicted Scores")
# plt.ylabel("Actual Scores")

In [None]:
mlsmall = MovieLens('../../MovieLens/data/ml-latest-small')
ratings=mlsmall.ratings

#make some fake interactions based on the ratings data
ratings["interaction"]=np.where(ratings["rating"]>=4, 1, 0)
ratings=ratings[ratings["interaction"]==1]
ratings = ratings[['user', 'item']]

algo_als = als.ImplicitMF(10, iterations=100)
algo_als.fit(ratings)
preds_als = batch.predict(algo_als, mlsmall.ratings)
#preds_als = pd.merge(preds_als, mlsmall.ratings, on=['user', 'item'])

preds_als.head(20)

In [None]:
#preds_als.head(30)

plt.figure(figsize=(5, 5))
plt.scatter(preds_als["prediction"], preds_als["rating"], s=1)
plt.xlabel("Predicted Interaction Scores")
plt.ylabel("Actual Rating Scores")

In [None]:
als_item_features = pd.DataFrame(als.item_features_)
mflfd_item_features = pd.DataFrame(mflfd.items_hat.T)
#mflfd_item_features.head()

mf_item_features = pd.DataFrame(mf.items_hat.T)
#mf_item_features.head()

In [None]:
def plot_hist_features(features_df, model_type, color='blue'):
    font = {'family' : 'normal',
            'weight' : 'bold',
            'size'   : 12}

    plt.rc('font', **font)

    n_features = list(range(0,10))
    fig, axs = plt.subplots(math.ceil(len(n_features)/3), 3, figsize=(20,20))
    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
                        wspace=0.35)
    fig.suptitle('Latent Factors for {}'.format(model_type), size=20)

    for idx, n_feature in enumerate(n_features):
        r=idx //3
        c=idx % 3

        #hat = features_df[n_feature].tolist()
        features = features_df[n_feature].tolist()
        axs[r, c].set_title('Factor {}'.format(n_feature))

        #axs[r,c].plot(hat, actual, 'o', color=color);
        axs[r,c].hist(features, color=color)

    #for ax in axs.flat:
    #    ax.set(xlabel='hat representation', ylabel='actual representation')

    # Hide x labels and tick labels for top plots and y ticks for right plots.
    #for ax in axs.flat:
    #    ax.label_outer()

    fig.delaxes(axs[3][1])
    fig.delaxes(axs[3][2])

In [None]:
plot_hist_features(als_item_features, 'MovieLens MF')

In [None]:
plot_hist_features(mflfd_item_features, 'MF-LFD', 'red')

In [None]:
plot_hist_features(mf_item_features, 'MF', 'green')

In [None]:
mflfd

In [None]:
mflfd.predicted_scores.value.shape

In [None]:
median_predicted_std = np.std(predicted_rec_vals)
median_predicted_std

In [None]:
# user_representation = Generator().binomial(
#     n=1, p=.3, size=(NUM_USERS, N_FACTORS)
# )

# item_representation = Generator().binomial(
#     n=1, p=.3, size=(N_FACTORS, NUM_ITEMS)
# )
# # Initialize with custom representations
# filtering = ContentFiltering(user_representation=user_representation,
#                             item_representation=item_representation)

# filtering.add_metrics(AverageFeatureScoreRange())
# filtering.run(10)

# filtering.get_measurements()
#u = ChaneyUsers(np.copy(known_scores), size=(NUM_USERS, NUM_ATTRS), num_users=NUM_USERS, attention_exp=ATTENTION_EXP, repeat_items=False)

In [None]:
recommended_item_predictions = mflfd.predicted_scores.value[:, mflfd.rec]
recommended_item_predictions.shape

In [None]:
mf.all_interactions.sort_values('user').head()

In [None]:
mf_test = ImplicitMF(num_users=10, num_items=100, num_latent_factors=N_FACTORS, num_items_per_iter=10,
                model_params=model_params)
#mf.add_metrics(MSEMeasurement())
#mf.add_metrics(AverageFeatureScoreRange())
#mf.add_metrics(RecSimilarity(pairs=js_pairs))
mf_test.add_metrics(ShownMedianPrediction())
mf_test.add_metrics(InteractedMedianPrediction())
mf_test.add_metrics(MeanRecDistance())
mf_test.add_state_variable(mf.predicted_scores)
#mf_test.add_state_variable(mf.users.actual_user_scores)
mf_test.startup_and_train(NUM_STARTUP)
mf_test.run(timesteps=NUM_STEPS, train_between_steps=True, reset_interactions=False)
#mf.run(timesteps=NUM_STEPS)

In [None]:
mf_test_metrics = pd.DataFrame(mf_test.get_measurements())
mf_test_metrics.head(30)

In [None]:
# mf_scores = pd.DataFrame(mf.predicted_scores.value)
# cols = [f"item_{c}" for c in mf_scores.columns]
# mf_scores.columns = cols
# #mf_scores.head()
# mf_scores["user"] = list(range(mf_scores.shape[0]))


# # t=pd.wide_to_long(mf_scores, "item_", i="user", j="item").reset_index()
# # t.columns = ["user", "item", "predicted_score"]

In [None]:
final_step_len = len(str(NUM_STEPS + NUM_STARTUP))

for t in range(NUM_STEPS + NUM_STARTUP):

    X=mf.predicted_scores.state_history[t].flatten()
    Y=mf.users.actual_user_scores.state_history[t].flatten()

    # plt.scatter(X, Y)
    # plt.figure(figsize=(15, 15))

    fig, ax = plt.subplots(figsize=(10,10))
    fig.set_tight_layout(True)
    ax.set(xlim=(-5, 5), ylim=(-5, 5))
    ax.set_xlabel("Predicted Score")
    ax.set_ylabel("Actual Score")
    ax.scatter(X,Y, s=1, alpha=.7)
    
    step_len=len(str(t))
    
    num_leading_zeros = final_step_len-step_len
    
    ext=str('0'*num_leading_zeros) + str(t)
    
    
    
    
    
    # Save it & close the figure
    filename='/Users/amywinecoff/Documents/CITP/Research/Github/scatters/mf/scatter_step'+ext+'.png'
    plt.savefig(fname=filename, dpi=96)
    plt.gca()
    plt.close(fig)


os.chdir('/Users/amywinecoff/Documents/CITP/Research/Github/scatters/mf')


In [None]:
!convert -delay 20 scatter*.png animated_scatter_mf.gif
